In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Dataset

In [70]:
# Load the raw car-price dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('car-price.csv')
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


# Features

In [71]:
# Raw CSV column names (exact spelling and capitalisation) kept as model inputs.
base = [
    'Make', 'Model', 'Year',                  # what the car is
    'Engine HP', 'Engine Cylinders',          # engine
    'Transmission Type', 'Vehicle Style',     # drivetrain / body
    'highway MPG', 'city mpg',                # fuel economy
]

In [72]:
df[base].head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18


# Data Preparation

In [73]:
# Keep only the selected feature columns plus the MSRP column (the price target);
# every other column of the raw frame is dropped.
df = df[base + ['MSRP']]
df.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [74]:
# Rename the target column to `price`, then normalise every header:
# spaces become underscores and all letters are lower-cased.
df = (
    df
    .rename(columns={'MSRP': 'price'})
    .rename(columns=lambda name: name.replace(' ', '_').lower())
)
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [75]:
df.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
price                 0
dtype: int64

In [76]:
# Impute every missing value with 0 (only engine_hp and engine_cylinders had
# nulls in the count above), then re-check that no nulls remain.
df = df.fillna(0)
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

# Question 1

In [77]:
df['transmission_type'].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

# Question 2

In [78]:
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [79]:
# Numerical features: every column that is not listed in `categorical` ...
numerical = [elem for elem in df.columns.to_list() if elem not in categorical]
# ... except `price`, which is the target rather than an input feature.
numerical.remove('price')
numerical

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

Correlation matrix created manually:

In [80]:
# Build the correlation matrix one row at a time: the row labelled `feature`
# holds the correlation of every numerical column with df[feature].
corr_matrix = pd.DataFrame(columns=numerical)
for feature in numerical:
    corr_matrix.loc[feature] = df[numerical].corrwith(df[feature])
corr_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


Correlation matrix created using the pandas `DataFrame.corr()` method:

In [81]:
df[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


# Binary Target

In [82]:
# Binarise the target: `above_average` is 1 when the price is strictly greater
# than the mean price and 0 otherwise.
# NOTE: the mean is computed over the whole dataset, i.e. before the
# train/validation/test split performed further down.
mean_price = df['price'].mean()
print(mean_price)
df['above_average'] = (df['price'] > mean_price).astype(int)
df.head()

40594.737032063116


Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


# Data Splitting

In [83]:
from sklearn.model_selection import train_test_split

In [84]:
# Two-stage split: hold out 20% of the rows for test, then take 25% of the
# remaining 80% for validation, which yields a 60/20/20 train/val/test split.
# Both stages are seeded with random_state=42 so the split is reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [85]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [86]:
# Give each split a fresh 0..n-1 index (the shuffled original index is discarded).
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the raw price out of every split as a NumPy array and remove the column
# from the frame in the same step, so it cannot be used as an input feature.
price_train, price_val, price_test = (
    part.pop('price').values for part in (df_train, df_val, df_test)
)

In [87]:
df_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15,0
1,Kia,Borrego,2009,276.0,6.0,AUTOMATIC,4dr SUV,21,17,0
2,Lamborghini,Gallardo,2012,570.0,10.0,MANUAL,Convertible,20,12,1
3,Chevrolet,Colorado,2016,200.0,4.0,AUTOMATIC,Crew Cab Pickup,27,20,0
4,Pontiac,Vibe,2009,158.0,4.0,AUTOMATIC,4dr Hatchback,26,20,0


# Question 3

In [88]:
from sklearn.metrics import mutual_info_score

In [89]:
def mutual_info_target_score(series):
    """Return the mutual information between `series` and the training target.

    The target is read from the notebook-level `df_train['above_average']`
    at call time, so `series` is expected to be a column of `df_train`
    (or otherwise have the same length).
    """
    target = df_train['above_average']
    return mutual_info_score(series, target)

In [90]:
# Mutual information of each categorical feature with the binary target,
# computed on the training split only; shown sorted ascending, rounded to 2 decimals.
mi = df_train[categorical].apply(mutual_info_target_score)
mi.sort_values().round(2)

transmission_type    0.02
vehicle_style        0.08
make                 0.24
model                0.46
dtype: float64

# Question 4

In [91]:
from sklearn.feature_extraction import DictVectorizer

In [92]:
# Turn each row into a {feature: value} record and vectorise it into a dense
# matrix (sparse=False).
dv = DictVectorizer(sparse=False)

# Fit the vectorizer on the training records only ...
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# ... and reuse the fitted mapping, without refitting, for validation.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [93]:
y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

In [94]:
from sklearn.linear_model import LogisticRegression

In [95]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [96]:
# Predicted probability of the positive class (column 1 of predict_proba),
# thresholded at 0.5 to obtain hard above-average / not-above-average decisions.
y_pred = model.predict_proba(X_val)[:, 1]
above_average_decision = (y_pred >= 0.5)

In [97]:
# Validation accuracy of the model trained on all features; used later as the
# baseline when individual features are removed.
original_accuracy = (y_val == above_average_decision).mean()
original_accuracy

0.946286193873269

# Question 5

In [98]:
# Feature elimination: retrain the same logistic regression with one feature
# left out at a time and record how much validation accuracy changes relative
# to the full-feature baseline (`original_accuracy`).
# accuracy_diff > 0 means accuracy dropped when the feature was removed.
#
# Everything fitted inside the loop uses its own `*_reduced` name and a fresh
# DictVectorizer, so the full-feature `dv`, `X_train`, `X_val`, `model`,
# `y_pred` and `above_average_decision` stay intact and the earlier cells can
# be re-run safely after this one.
features = categorical + numerical
accuracy_diffs = {}
for feature in features:
    reduced_features = [name for name in features if name != feature]

    dv_reduced = DictVectorizer(sparse=False)
    X_train_reduced = dv_reduced.fit_transform(
        df_train[reduced_features].to_dict(orient='records')
    )
    X_val_reduced = dv_reduced.transform(
        df_val[reduced_features].to_dict(orient='records')
    )

    model_reduced = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    decision_reduced = model_reduced.predict_proba(X_val_reduced)[:, 1] >= 0.5
    accuracy_reduced = (y_val == decision_reduced).mean()
    accuracy_diffs[feature] = original_accuracy - accuracy_reduced

# Build the result table once instead of growing a DataFrame row by row;
# rows keep the order of `features`.
accuracy_table = pd.DataFrame(
    {'accuracy_diff': list(accuracy_diffs.values())},
    index=list(accuracy_diffs),
)

In [99]:
accuracy_table

Unnamed: 0,accuracy_diff
make,-0.002937
model,0.022241
transmission_type,0.001259
vehicle_style,0.003777
year,-0.001679
engine_hp,0.015946
engine_cylinders,-0.000839
highway_mpg,0.002098
city_mpg,0.00042


In [100]:
# Rank features by the magnitude of the accuracy change (sign ignored),
# smallest change first.
accuracy_table.abs().sort_values(by='accuracy_diff')

Unnamed: 0,accuracy_diff
city_mpg,0.00042
engine_cylinders,0.000839
transmission_type,0.001259
year,0.001679
highway_mpg,0.002098
make,0.002937
vehicle_style,0.003777
engine_hp,0.015946
model,0.022241


# Question 6

In [101]:
# Regression targets: log(1 + price) for each split.
y_price_log_train = np.log1p(price_train)
y_price_log_val = np.log1p(price_val)
y_price_log_test = np.log1p(price_test)

In [102]:
# Start from a new vectorizer on the full feature set, this time producing
# sparse matrices (sparse=True).
dv = DictVectorizer(sparse=True)

# Fit on the training records only ...
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# ... and apply the fitted mapping to validation without refitting.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [103]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, broadcastable against `y`.

    Returns
    -------
    numpy.float64
        sqrt(mean((y - y_pred) ** 2)).

    Both inputs are converted to float NumPy arrays first, so plain lists are
    accepted, pandas objects are compared by position rather than aligned on
    their index labels, and unsigned-integer inputs cannot wrap around when
    subtracted.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [104]:
from sklearn.linear_model import Ridge

In [105]:
# Sweep the ridge regularisation strength and record the validation RMSE
# (on the log1p-price scale) for each value; rows are indexed by alpha.
alphas = [0, 0.01, 0.1, 1, 10]
alpha_score_table = pd.DataFrame(columns=['rmse'])

for alpha in alphas:
    # fit() returns the estimator itself, so `model` is the fitted Ridge.
    model = Ridge(solver='sag', alpha=alpha, random_state=42).fit(X_train, y_price_log_train)
    y_pred = model.predict(X_val)
    score = rmse(y_price_log_val, y_pred)
    alpha_score_table.loc[alpha] = score

In [106]:
alpha_score_table

Unnamed: 0,rmse
0.0,0.254934
0.01,0.25494
0.1,0.255031
1.0,0.258318
10.0,0.331021


In [107]:
alpha_score_table.sort_values(by='rmse')

Unnamed: 0,rmse
0.0,0.254934
0.01,0.25494
0.1,0.255031
1.0,0.258318
10.0,0.331021
