In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
csv_path = os.path.join('..','datasets','raw','data.csv')
df = pd.read_csv(csv_path)
df.sample(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
4830,Suzuki,Forenza,2008,regular unleaded,127.0,4.0,MANUAL,front wheel drive,4.0,,Compact,Sedan,28,20,481,14489
11506,BMW,X4,2017,premium unleaded (required),355.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Factory Tuner,Luxury,High-Performance",Midsize,4dr SUV,26,19,3916,58100
3471,Dodge,Dakota,2010,regular unleaded,210.0,6.0,AUTOMATIC,rear wheel drive,4.0,Flex Fuel,Compact,Crew Cab Pickup,20,15,1851,25305
8995,Hyundai,Santa Fe,2017,regular unleaded,290.0,6.0,AUTOMATIC,all wheel drive,4.0,Crossover,Midsize,4dr SUV,22,17,1439,36700
344,Nissan,370Z,2017,premium unleaded (required),332.0,6.0,MANUAL,rear wheel drive,2.0,High-Performance,Compact,Coupe,26,18,2009,29990
9,BMW,1 Series,2013,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,27,18,3916,37200
10803,Mazda,Tribute,2009,regular unleaded,171.0,4.0,AUTOMATIC,front wheel drive,4.0,Crossover,Compact,4dr SUV,28,20,586,24620
10435,Nissan,Titan,2014,regular unleaded,317.0,8.0,AUTOMATIC,four wheel drive,4.0,Flex Fuel,Large,Crew Cab Pickup,17,12,2009,34760
10657,Ford,Transit Connect,2015,regular unleaded,169.0,4.0,AUTOMATIC,front wheel drive,4.0,,Compact,Cargo Minivan,29,21,5657,23330
11163,Aston Martin,V8 Vantage,2015,premium unleaded (required),430.0,8.0,AUTOMATED_MANUAL,rear wheel drive,2.0,"Exotic,High-Performance",Compact,Convertible,21,14,259,138195


In [3]:
len(df)

11914

In [4]:
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [5]:
my_variables = ['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg','MSRP']
cars = df[my_variables].copy()

# Lowercase the column names and replace spaces with underscores
cars.columns = cars.columns.str.lower().str.replace(' ','_')
categorical = list(cars.dtypes[cars.dtypes == 'object'].index)

for feature in categorical:
    cars[feature] = cars[feature].str.lower().str.replace(' ', '_')
    
cars.sample(5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
1402,nissan,altima,2015,182.0,4.0,automatic,sedan,38,27,22560
9232,toyota,sienna,2016,266.0,6.0,automatic,passenger_minivan,25,18,37655
804,saab,9-3_griffin,2012,220.0,4.0,manual,sedan,33,20,28900
8007,volkswagen,rabbit,2008,170.0,5.0,manual,4dr_hatchback,29,22,17575
7054,nissan,murano,2017,260.0,6.0,automatic,4dr_suv,28,21,39180


In [6]:
# View missing values
cars.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [7]:
# FIll the missing values with 0
cars[['engine_hp','engine_cylinders']] = cars[['engine_hp','engine_cylinders']].fillna(value=0)

In [8]:
msrp_mean = cars['msrp'].mean()
# Binarizing msrp
cars['above_average'] = (cars['msrp'] <= msrp_mean).astype(int)
cars['above_average']

0        0
1        0
2        1
3        1
4        1
        ..
11909    0
11910    0
11911    0
11912    0
11913    1
Name: above_average, Length: 11914, dtype: int64

In [12]:
cars.nunique()

make                   48
model                 914
year                   28
engine_hp             357
engine_cylinders        9
transmission_type       5
vehicle_style          16
highway_mpg            59
city_mpg               69
msrp                 6049
above_average           2
dtype: int64

## Feature Importance

### Mutual Information

In [None]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    return mut

In [None]:
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict_proba(X_val)[:, 1]
y_pred

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

prob = (y_pred >= .5).astype(int)
acc = accuracy_score(y_val, prob).round(1)
prec = precision_score(y_val, prob).round(1)
rec = recall_score(y_val, prob).round(1)

print(acc)
print(prec)
print(rec)


In [None]:
thresholds = np.linspace(0, 1, 101)

accuracies = []
for t in thresholds:
    prob = y_pred >= t
    acc = accuracy_score(y_val, prob >= t)
    accuracies.append(acc)
    

plt.plot(thresholds, accuracies)

### Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score, precision_recall_curve

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred)

def plot_precision_recall_vs_thresholds(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recalls")


plot_precision_recall_vs_thresholds(precisions, recalls, thresholds)
plt.show()

In [None]:

plt.plot(thresholds, precisions[:-1], "b--", label="Precision")

In [None]:
plt.plot(recalls, precisions, "g-")
plt.title("Precision vs Recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()