In [122]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [90]:
# Columns kept for the analysis, spelled exactly as they appear in data.csv.
selected_columns = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP',
]

# Load, keep the selected columns, normalise the labels to lower_snake_case,
# replace missing values with 0 and expose MSRP under the name "price".
df = (
    pd.read_csv("data.csv")[selected_columns]
    .rename(columns=lambda label: label.replace(' ', '_').lower())
    .fillna(0)
    .rename(columns={"msrp": "price"})
)
df.describe()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
count,11914.0,11914.0,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,247.941749,5.614655,26.637485,19.733255,40594.74
std,7.57974,110.507669,1.800554,8.863001,8.987798,60109.1
min,1990.0,0.0,0.0,12.0,7.0,2000.0
25%,2007.0,170.0,4.0,22.0,16.0,21000.0
50%,2015.0,225.0,6.0,26.0,18.0,29995.0
75%,2016.0,300.0,6.0,30.0,22.0,42231.25
max,2017.0,1001.0,16.0,354.0,137.0,2065902.0


# Question 1

In [91]:
# Number of rows per transmission type.
df.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

# Question 2

In [92]:
# List the frame's column labels (lower-cased, underscore-separated names).
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

In [93]:
# Pairwise correlation matrix of the numeric columns, price included.
numeric_columns = [
    'year', 'engine_hp', 'engine_cylinders',
    'highway_mpg', 'city_mpg', 'price',
]
df.loc[:, numeric_columns].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


# Question 3

In [94]:
# Binary target: 1 when a row's price is strictly greater than the mean price
# of the whole frame, 0 otherwise.
mean = df['price'].mean()
df['above_average'] = df['price'].gt(mean).astype(int)

In [112]:
# Two-stage split with a fixed seed: hold out 20% of df as the test set, then
# take 25% of the remainder as validation (0.25 * 0.8 = 20% of df), which
# leaves 60% of df for training.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=42,
)


In [113]:
from sklearn.metrics import mutual_info_score

categorical_columns = ['make', 'model', 'transmission_type', 'vehicle_style']


def calculate_mi(series):
    """Return the mutual information score between `series` and the
    `above_average` column of the global `df_train_full` frame."""
    target = df_train_full['above_average']
    return mutual_info_score(series, target)


# NOTE(review): the scores are computed on df_train_full, i.e. the training
# AND validation rows together — confirm this is intended rather than
# df_train only.
df_mi = (
    df_train_full[categorical_columns]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)

df_mi

Unnamed: 0,MI
model,0.460994
make,0.238724
vehicle_style,0.08339
transmission_type,0.020884


# Question 4

In [114]:
# Take the classification target out of each split as a NumPy array; pop()
# returns the column and removes it from the frame in one step.
y_train = df_train.pop('above_average').values
y_val = df_val.pop('above_average').values

# above_average was derived from price, so price is removed from the feature
# frames as well.
for split in (df_train, df_val):
    del split['price']

In [115]:
from sklearn.feature_extraction import DictVectorizer

# One dict per training row, then a dense feature matrix from a vectorizer
# fitted on those training rows only.
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False).fit(train_dict)
X_train = dv.transform(train_dict)
X_train.shape


(7148, 943)

In [116]:
from sklearn.linear_model import LogisticRegression

# Classifier for above_average, trained on the vectorized training rows.
model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)


In [117]:
# Encode the validation rows with the vectorizer that was fitted on the
# training rows (transform only — the vectorizer is not refitted here).
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [118]:
# Second column of predict_proba: predicted probability of class 1.
y_pred = model.predict_proba(X_val)[:, 1]

# Hard decision at a 0.5 threshold; the accuracy is the share of validation
# rows where the decision equals the true label.
pred = y_pred > 0.5
np.mean(pred == y_val)


0.9458665547629039

# Question 5

In [102]:
# Baseline for the feature-elimination comparison: the same classifier
# settings, trained on four features only.
subset = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

train_dict_small = df_train[subset].to_dict(orient='records')
val_dict_small = df_val[subset].to_dict(orient='records')

# Fit the vectorizer on the training rows, then encode both splits with it.
dv_small = DictVectorizer(sparse=False).fit(train_dict_small)
X_small_train = dv_small.transform(train_dict_small)
X_small_val = dv_small.transform(val_dict_small)

model_small = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)
model_small.fit(X_small_train, y_train)

# Validation accuracy at a 0.5 probability threshold.
y_pred_small = model_small.predict_proba(X_small_val)[:, 1]
pred_small = y_pred_small > 0.5
acc = np.mean(pred_small == y_val)

In [103]:
feats = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

# Leave-one-feature-out: retrain without each feature in turn and print how
# far the validation accuracy falls below the baseline `acc` (a negative
# number means the model did better without that feature).
for feat in feats:
    subset = [name for name in feats if name != feat]

    train_dict_small = df_train[subset].to_dict(orient='records')
    val_dict_small = df_val[subset].to_dict(orient='records')

    dv_small = DictVectorizer(sparse=False).fit(train_dict_small)
    X_small_train = dv_small.transform(train_dict_small)
    X_small_val = dv_small.transform(val_dict_small)

    model_small = LogisticRegression(
        solver='liblinear',
        C=10,
        max_iter=1000,
        random_state=42,
    )
    model_small.fit(X_small_train, y_train)

    y_pred_small = model_small.predict_proba(X_small_val)[:, 1]
    pred_small = y_pred_small > 0.5
    accuracy_without = np.mean(pred_small == y_val)
    print(feat + ': ' + str(acc - accuracy_without))

year: -0.00041963911036513313
engine_hp: 0.14057910197230383
transmission_type: 0.002937473772555599
city_mpg: 0.008392782207301663


# Question 6

In [123]:
# Rebuild the splits from df with the same sizes and seed, so the frames
# contain the price and above_average columns again.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=42,
)

# Regression target: natural log of price. pop() also removes the column from
# the feature frame.
y_train = np.log(df_train.pop('price').values)
y_val = np.log(df_val.pop('price').values)

# above_average is derived from price, so it is removed from the features too.
for split in (df_train, df_val):
    del split['above_average']

In [124]:
from sklearn.feature_extraction import DictVectorizer

# Refit the vectorizer on the rebuilt training frame and encode it as a dense
# feature matrix.
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False).fit(train_dict)
X_train = dv.transform(train_dict)
X_train.shape

(7148, 943)

In [129]:
from sklearn.linear_model import Ridge

# Ridge regression on log-price; alpha is not passed, so the estimator's
# default regularisation strength applies.
model = Ridge(
    solver='sag',
    random_state=42,
    max_iter=1000,
)
model.fit(X_train, y_train)



In [131]:
from sklearn.metrics import mean_squared_error

# Encode the validation rows with the vectorizer fitted on the training rows.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict(X_val)

# RMSE on the log-price scale. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises TypeError;
# sqrt(MSE) gives the same value on every version.
np.sqrt(mean_squared_error(y_val, y_pred))


0.48688379302159196

In [132]:
alphas = [0, 0.01, 0.1, 1, 10]

# Refit Ridge for each regularisation strength and print the validation RMSE
# (log-price scale) rounded to 3 decimals, one line per alpha in list order.
# NOTE(review): the recorded output shows the same rounded RMSE for every
# alpha — worth confirming that the 'sag' solver actually converges on these
# features.
for alpha in alphas:
    model = Ridge(alpha=alpha, solver='sag', random_state=42, max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Explicit square root instead of `squared=False`, which was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(round(rmse, 3))



0.487




0.487




0.487




0.487
0.487


