# Module 03 - MLZoomcamp 2023

In [66]:
data = 'wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [67]:
!wget $data

--2023-10-02 17:29:34--  http://wget/
Resolving wget (wget)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘wget’
--2023-10-02 17:29:34--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv.1’


2023-10-02 17:29:35 (5.03 MB/s) - ‘data.csv.1’ saved [1475504/1475504]

FINISHED --2023-10-02 17:29:35--
Total wall clock time: 0.6s
Downloaded: 1 files, 1.4M in 0.3s (5.03 MB/s)


In [68]:
from IPython.display import display
from matplotlib.style import use
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

use('seaborn-v0_8')
%matplotlib inline

## 3.2 Data  Preparation

In [69]:
df = pd.read_csv('data.csv')

In [70]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [71]:
df = df[['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']]

In [72]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [73]:
df.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [74]:
df = df.fillna(0)

In [75]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [76]:
df['price'] = df.msrp
del df['msrp']

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   price              11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


## Question 1

In [78]:
df.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

## Question 2

In [79]:
df[['year', 'engine_cylinders']].corrwith(df.engine_hp).abs()


year                0.338714
engine_cylinders    0.774851
dtype: float64

In [80]:
df[['engine_cylinders', 'city_mpg']].corrwith(df.highway_mpg).abs()


engine_cylinders    0.614541
city_mpg            0.886829
dtype: float64

In [81]:
df.price.mean()

40594.737032063116

In [82]:
average = df.price.mean()
above_average = []

for value in df.price:
    if value > average:
        above_average.append(1)
    else:
        above_average.append(0)

In [83]:
df['above_average'] = above_average

In [84]:
del df['price']

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   above_average      11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


In [86]:
df_train, df_rest = train_test_split(df, test_size=0.40, random_state=42)
df_val, df_test = train_test_split(df_rest, test_size=0.50, random_state=42)

print(f'Training dataframe shape: {df_train.shape}')
print(f'Validation dataframe shape: {df_val.shape}')
print(f'Testing dataframe shape: {df_test.shape}')

Training dataframe shape: (7148, 10)
Validation dataframe shape: (2383, 10)
Testing dataframe shape: (2383, 10)


In [87]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [88]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [89]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

## Question 3

In [90]:
df_train

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,Nissan,Frontier,2015,261.0,6.0,AUTOMATIC,Crew Cab Pickup,21,15
1,FIAT,500L,2016,160.0,4.0,MANUAL,Wagon,33,25
2,Ford,Ranger,2011,207.0,6.0,MANUAL,Extended Cab Pickup,21,16
3,Chevrolet,S-10,2003,120.0,4.0,MANUAL,Extended Cab Pickup,25,19
4,Mitsubishi,Outlander Sport,2016,168.0,4.0,AUTOMATIC,4dr SUV,27,22
...,...,...,...,...,...,...,...,...,...
7143,Toyota,Venza,2014,181.0,4.0,AUTOMATIC,Wagon,26,20
7144,Pontiac,G6,2009,219.0,6.0,AUTOMATIC,Sedan,26,17
7145,Volkswagen,Golf GTI,2016,220.0,4.0,AUTOMATED_MANUAL,2dr Hatchback,33,25
7146,Saab,9-5,2009,260.0,4.0,AUTOMATIC,Wagon,27,17


In [91]:
mutual_info_score(y_train, df_train.make)

0.23528465531020354

In [92]:
mutual_info_score(y_train, df_train.model)

0.4636231369513445

In [93]:
mutual_info_score(y_train, df_train.transmission_type)

0.021197379052288075

In [94]:
mutual_info_score(y_train, df_train.vehicle_style)

0.08295755944425208

## Question 4


In [95]:
dv = DictVectorizer(sparse=False)

In [96]:
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [97]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [98]:
round(model.score(X_val, y_val), 2)

0.94