In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.feature_extraction import DictVectorizer
%matplotlib inline

In [62]:
# Read the raw dataset from data.csv in the current working directory and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer="./data.csv")
df.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [63]:
# Restrict the frame to the columns used in the rest of the notebook.
# 'MSRP' is deliberately kept as the last column.
selected_columns = [
    'Make', 'Model', 'Year',
    'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style',
    'highway MPG', 'city mpg',
    'MSRP',
]
df = df[selected_columns]

In [64]:
# Normalise column labels to snake_case, e.g. 'Engine HP' -> 'engine_hp'.
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Replace missing values with 0. The result is rebound instead of using
# `inplace=True`: `df` was produced by column selection, and in-place writes
# on such a frame can raise SettingWithCopyWarning.
df = df.fillna(0)

# Rename the target column explicitly by name rather than by "whatever is
# last", so the step does not depend on column order. Re-running the cell is
# a no-op because a missing 'msrp' label is simply ignored by rename().
df = df.rename(columns={'msrp': 'price'})

In [65]:
# Question 1
# Number of rows per transmission type, most frequent first.
df['transmission_type'].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

In [66]:
# Question 2
# Pairwise correlation matrix of the numeric columns.
# The frame also holds string columns (make, model, ...). Since pandas 2.0
# `DataFrame.corr()` no longer drops those silently and raises instead, so
# the numeric columns are selected explicitly. `select_dtypes` is used rather
# than `numeric_only=True` because it also works on pandas versions that
# predate that keyword.
df.select_dtypes(include='number').corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [67]:
# Binary classification target: 1 when a car's price is strictly above the
# mean price of the whole dataset, 0 otherwise.
mean_price = df['price'].mean()
above_average = df['price'].gt(mean_price).astype('int')
above_average

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: price, Length: 11914, dtype: int64

In [68]:
# Keep the raw price for later use and remove it from the feature frame.
price = df['price']
df = df.drop(columns=['price'])

In [69]:
# Two-stage split: first hold out 20% of the rows as the test set, then take
# 25% of the remainder as the validation set. Both splits use a fixed seed.
df_full_train, df_test, y_full_train, y_test = train_test_split(
    df, above_average, test_size=0.2, random_state=42
)
df_train, df_val, y_train, y_val = train_test_split(
    df_full_train, y_full_train, test_size=0.25, random_state=42
)

# Discard the shuffled original index on each feature frame.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Convert the targets from Series to plain arrays.
y_train, y_val, y_test = (target.values for target in (y_train, y_val, y_test))

In [70]:
# Question 3
# Mutual information between the training target and each categorical
# feature, computed on the training split only.
categorical_candidates = ['make', 'model', 'transmission_type', 'vehicle_style']
mi_scores = {
    name: mutual_info_score(y_train, df_train[name])
    for name in categorical_candidates
}
for name, mi in mi_scores.items():
    print(name, ' -> ', round(mi, 2))

make  ->  0.24
model  ->  0.46
transmission_type  ->  0.02
vehicle_style  ->  0.08


In [73]:
# Question 4
# Feature groups fed to the model.
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
features = categorical + numerical

# Turn each row into a {feature: value} record and vectorise it; the
# vectoriser is fitted on the training split only and reused for validation.
dv = DictVectorizer(sparse=False)
train_dict = df_train[features].to_dict(orient='records')
val_dict = df_val[features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Baseline classifier trained on all features.
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Threshold the positive-class probability at 0.5 to get hard predictions.
positive_proba = model.predict_proba(X_val)[:, 1]
y_pred = positive_proba >= 0.5

# Validation accuracy of the full model; reused as the reference score later.
org_score = accuracy_score(y_val, y_pred)
round(org_score, 2)

0.95

In [76]:
# Question 5 (city_mpg)
# Leave-one-feature-out check: retrain the classifier without each feature in
# turn and report how much validation accuracy changes relative to the
# full-feature baseline (`org_score`). A positive value means accuracy got
# worse without the feature; the least useful feature is the one whose
# difference is closest to zero.
features = categorical + numerical

# Iterate only over columns that are actual model features. Looping over the
# global `df.columns` would raise a KeyError as soon as the frame contained a
# column outside `features`. The frame's own column order is kept so the
# printed order is unchanged.
accuracy_drop = {}
for col in [c for c in df_train.columns if c in features]:
    kept = [f for f in features if f != col]

    # Loop-local names are used so the baseline `dv`, `X_train`, `X_val`,
    # `model` and `y_pred` are not overwritten by the ablation runs.
    dv_without = DictVectorizer(sparse=False)
    X_train_without = dv_without.fit_transform(df_train[kept].to_dict(orient='records'))
    X_val_without = dv_without.transform(df_val[kept].to_dict(orient='records'))

    model_without = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model_without.fit(X_train_without, y_train)
    y_pred_without = model_without.predict_proba(X_val_without)[:, 1] >= 0.5

    accuracy_drop[col] = org_score - accuracy_score(y_val, y_pred_without)
    print(col, " -> ", accuracy_drop[col])


make  ->  -0.0012589173310952884
model  ->  0.026017624842635367
year  ->  -0.0025178346621905767
engine_hp  ->  0.020981955518254325
engine_cylinders  ->  -0.0012589173310952884
transmission_type  ->  0.0008392782207301552
vehicle_style  ->  0.0025178346621904657
highway_mpg  ->  -0.0012589173310952884
city_mpg  ->  -0.00041963911036513313


In [81]:
# Question 6
# Regression target: log(1 + price).
y = np.log1p(price)

# Same two-stage split and seed as the classification task, now carrying the
# continuous target.
df_full_train, df_test, y_full_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)
df_train, df_val, y_train, y_val = train_test_split(df_full_train, y_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train, y_val, y_test = y_train.values, y_val.values, y_test.values

# Vectorise the feature records; fitted on the training split only.
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Compare validation RMSE across regularisation strengths.
# NOTE(review): the recorded RMSE is identical for every alpha; this may mean
# the 'sag' solver is not converging on the unscaled features. Check for a
# ConvergenceWarning before drawing conclusions from the comparison.
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(solver='sag', alpha=a, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # RMSE is taken as the square root of the MSE explicitly: the
    # `squared=False` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    print(a, ' -> ', round(score, 3))

0  ->  0.494
0.01  ->  0.494
0.1  ->  0.494
1  ->  0.494
10  ->  0.494
