In [334]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [335]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [336]:
!python -m wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv


Saved under data (2).csv


In [337]:
df = pd.read_csv('data.csv')

In [338]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [339]:
df = df.drop(columns=['engine_fuel_type', 'driven_wheels', 'number_of_doors', 'market_category', 'vehicle_size','popularity'])

In [340]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)

In [341]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
msrp                   int64
dtype: object

In [342]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg' ]

In [343]:
for c in categorical:
    df[c] = df[c].str.lower().str.replace(' ','_')

In [344]:
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920


In [345]:
categorical

['make', 'model', 'transmission_type', 'vehicle_style']

In [346]:
df.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [347]:
df = df.fillna(0)

In [348]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [349]:
df.rename(columns={"msrp": "price"}, inplace=True)

In [350]:
df.transmission_type.value_counts()

transmission_type
automatic           8266
manual              2935
automated_manual     626
direct_drive          68
unknown               19
Name: count, dtype: int64

In [351]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [352]:
from sklearn.metrics import mutual_info_score

In [353]:
df['engine_hp'].corr(df['year'])

0.33871418476244686

In [354]:
df['engine_hp'].corr(df['engine_cylinders'])

0.7748509807813186

In [355]:
df['highway_mpg'].corr(df['engine_cylinders'])

-0.6145414173953334

In [356]:
df['highway_mpg'].corr(df['city_mpg'])

0.8868294962591353

In [357]:
df.price.mean()

40594.737032063116

In [358]:
df['above_average'] = (df.price >= df.price.mean()).astype(int)
df.above_average

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: above_average, Length: 11914, dtype: int32

In [359]:
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135,1
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650,1
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350,0
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450,0
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120,1
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670,1
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620,1
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920,1


In [360]:
from sklearn.model_selection import train_test_split

In [466]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [362]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25 , random_state=42)

In [363]:
df_train = df_train.reset_index(drop=True)
df_val   = df_val.reset_index(drop=True)
df_test  = df_test.reset_index(drop=True)

In [364]:
above_average_train = df_train.above_average.values
above_average_val = df_val.above_average.values
above_average_test = df_test.above_average.values

In [365]:
del df_train['price']
del df_val['price']
del df_test['price']
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [366]:
 from sklearn.metrics import mutual_info_score

In [367]:
df[categorical]

Unnamed: 0,make,model,transmission_type,vehicle_style
0,bmw,1_series_m,manual,coupe
1,bmw,1_series,manual,convertible
2,bmw,1_series,manual,coupe
3,bmw,1_series,manual,coupe
4,bmw,1_series,manual,convertible
...,...,...,...,...
11909,acura,zdx,automatic,4dr_hatchback
11910,acura,zdx,automatic,4dr_hatchback
11911,acura,zdx,automatic,4dr_hatchback
11912,acura,zdx,automatic,4dr_hatchback


In [368]:
above_average.value_counts()

price
False    8645
True     3269
Name: count, dtype: int64

In [369]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, above_average_train)

In [370]:
for i in categorical:
    q = mutual_info_churn_score(df_train[i])
    print(i, q.round(2))

make 0.24
model 0.46
transmission_type 0.02
vehicle_style 0.08


In [371]:
dicts_train = df_train[categorical + numerical].to_dict(orient='records')

In [372]:
from sklearn.feature_extraction import DictVectorizer

In [373]:
dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dicts_train)

In [374]:
from sklearn.linear_model import LogisticRegression

In [409]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42).fit(x_train, above_average_train)

In [410]:
val_dicts = train_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [411]:
x_val = dv.transform(val_dicts)

In [412]:
y_pred = model.predict_proba(x_val)[:,1]
y_pred

array([2.39236829e-03, 9.94645108e-01, 2.29997224e-04, ...,
       7.92526260e-04, 9.87109181e-01, 9.78450741e-01])

In [413]:
decision = (y_pred >= 0.5)

In [414]:
(decision == above_average_val).mean().round(2)

0.94

In [415]:
customer = val_dicts[-1]

In [416]:
customer

{'make': 'cadillac',
 'model': 'cts',
 'transmission_type': 'automatic',
 'vehicle_style': 'sedan',
 'year': 2015,
 'engine_hp': 321.0,
 'engine_cylinders': 6.0,
 'highway_mpg': 29,
 'city_mpg': 18}

In [417]:
x_small = dv.transform([customer])

In [418]:
model.predict_proba(x_small)[0, 1]

0.9784507407267088

In [419]:
above_average_val[-1]

1

In [450]:
eli = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

In [463]:
alldf = categorical + numerical

In [465]:
for i in eli:
    alldf.remove(i)
    new_train = df_train[alldf].to_dict(orient='records')
    new_x_train = dv.fit_transform(new_train)
    
    new_model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42).fit(new_x_train, above_average_train)
    
    new_val = df_val[alldf].to_dict(orient='records')
    new_val = dv.transform(new_val)
    
    y_pred = new_model.predict_proba(new_val)[:,1]
    new_decision = (y_pred >= 0.5)
    ans = (new_decision == above_average_val).mean().round(2)
    print('remove {},ans {}'.format(i, ans))
    alldf.append(i)
    

remove year,ans 0.95
remove engine_hp,ans 0.92
remove transmission_type,ans 0.95
remove city_mpg,ans 0.95


In [467]:
linear_full_train, linear_test = train_test_split(df, test_size=0.2, random_state=42)
linear_train, linear_val = train_test_split(linear_full_train, test_size=0.25 , random_state=42)
linear_train = linear_train.reset_index(drop=True)
linear_val   = linear_val.reset_index(drop=True)
linear_test  = linear_test.reset_index(drop=True)

In [468]:
from sklearn.linear_model import LinearRegression

In [469]:
linear = LinearRegression()

In [470]:
linear_model = LinearRegression?