In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [None]:
!python -m wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [3]:
 df = pd.read_csv('data.csv')

In [4]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [5]:
df = df.drop(columns=['engine_fuel_type', 'driven_wheels', 'number_of_doors', 'market_category', 'vehicle_size','popularity'])

In [6]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)

In [7]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
msrp                   int64
dtype: object

In [8]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg' ]

In [9]:
for c in categorical:
    df[c] = df[c].str.lower().str.replace(' ','_')

In [10]:
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920


In [11]:
categorical

['make', 'model', 'transmission_type', 'vehicle_style']

In [12]:
df.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [13]:
df = df.fillna(0)

In [14]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [15]:
df.rename(columns={"msrp": "price"}, inplace=True)

In [16]:
df.transmission_type.value_counts()

transmission_type
automatic           8266
manual              2935
automated_manual     626
direct_drive          68
unknown               19
Name: count, dtype: int64

In [17]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
df['engine_hp'].corr(df['year'])

0.33871418476244686

In [20]:
df['engine_hp'].corr(df['engine_cylinders'])

0.7748509807813186

In [21]:
df['highway_mpg'].corr(df['engine_cylinders'])

-0.6145414173953334

In [22]:
df['highway_mpg'].corr(df['city_mpg'])

0.8868294962591353

In [23]:
df.price.mean()

40594.737032063116

In [24]:
df['above_average'] = (df.price >= df.price.mean()).astype(int)
df.above_average

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: above_average, Length: 11914, dtype: int32

In [25]:
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135,1
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650,1
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350,0
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450,0
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120,1
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670,1
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620,1
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920,1


In [26]:
from sklearn.model_selection import train_test_split

In [27]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [28]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25 , random_state=42)

In [29]:
df_train = df_train.reset_index(drop=True)
df_val   = df_val.reset_index(drop=True)
df_test  = df_test.reset_index(drop=True)

In [30]:
above_average_train = df_train.above_average.values
above_average_val = df_val.above_average.values
above_average_test = df_test.above_average.values

In [31]:
del df_train['price']
del df_val['price']
del df_test['price']
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [32]:
 from sklearn.metrics import mutual_info_score

In [33]:
df[categorical]

Unnamed: 0,make,model,transmission_type,vehicle_style
0,bmw,1_series_m,manual,coupe
1,bmw,1_series,manual,convertible
2,bmw,1_series,manual,coupe
3,bmw,1_series,manual,coupe
4,bmw,1_series,manual,convertible
...,...,...,...,...
11909,acura,zdx,automatic,4dr_hatchback
11910,acura,zdx,automatic,4dr_hatchback
11911,acura,zdx,automatic,4dr_hatchback
11912,acura,zdx,automatic,4dr_hatchback


In [34]:
df.above_average.value_counts()

above_average
0    8645
1    3269
Name: count, dtype: int64

In [35]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, above_average_train)

In [36]:
for i in categorical:
    q = mutual_info_churn_score(df_train[i])
    print(i, q.round(2))

make 0.24
model 0.46
transmission_type 0.02
vehicle_style 0.08


In [37]:
dicts_train = df_train[categorical + numerical].to_dict(orient='records')

In [38]:
from sklearn.feature_extraction import DictVectorizer

In [39]:
dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dicts_train)

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42).fit(x_train, above_average_train)

In [42]:
val_dicts = train_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [43]:
x_val = dv.transform(val_dicts)

In [44]:
y_pred = model.predict_proba(x_val)[:,1]
y_pred

array([2.39236829e-03, 9.94645108e-01, 2.29997224e-04, ...,
       7.92526260e-04, 9.87109181e-01, 9.78450741e-01])

In [45]:
decision = (y_pred >= 0.5)

In [46]:
(decision == above_average_val).mean().round(2)

0.94

In [47]:
customer = val_dicts[-1]

In [48]:
customer

{'make': 'cadillac',
 'model': 'cts',
 'transmission_type': 'automatic',
 'vehicle_style': 'sedan',
 'year': 2015,
 'engine_hp': 321.0,
 'engine_cylinders': 6.0,
 'highway_mpg': 29,
 'city_mpg': 18}

In [49]:
x_small = dv.transform([customer])

In [50]:
model.predict_proba(x_small)[0, 1]

0.9784507407267088

In [51]:
above_average_val[-1]

1

In [52]:
eli = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

In [53]:
alldf = categorical + numerical

In [54]:
for i in eli:
    alldf.remove(i)
    new_train = df_train[alldf].to_dict(orient='records')
    new_x_train = dv.fit_transform(new_train)
    
    new_model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42).fit(new_x_train, above_average_train)
    
    new_val = df_val[alldf].to_dict(orient='records')
    new_val = dv.transform(new_val)
    
    y_pred = new_model.predict_proba(new_val)[:,1]
    new_decision = (y_pred >= 0.5)
    ans = (new_decision == above_average_val).mean().round(2)
    print('remove {},ans {}'.format(i, ans))
    alldf.append(i)
    

remove year,ans 0.95
remove engine_hp,ans 0.92
remove transmission_type,ans 0.95
remove city_mpg,ans 0.95


In [55]:
new_df = df

In [56]:
new_df['price'] = np.log1p(new_df['price'])

In [57]:
new_df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,10.739349,1
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,10.612779,1
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,10.500977,0
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,10.290483,0
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,10.448744,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,10.739024,1
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,10.945018,1
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,10.832122,1
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,10.838031,1


In [124]:
linear_full_train, linear_test = train_test_split(new_df, test_size=0.2, random_state=42)
linear_train, linear_val = train_test_split(linear_full_train, test_size=0.25 , random_state=42)
linear_train = linear_train.reset_index(drop=True)
linear_val   = linear_val.reset_index(drop=True)
linear_test  = linear_test.reset_index(drop=True)

y_train = linear_train.price.values
y_val = linear_val.price.values
y_test = linear_test.price.values

del linear_train['price']
del linear_val['price']
del linear_test['price']
del linear_train['above_average']
del linear_val['above_average']
del linear_test['above_average']

cv = DictVectorizer(sparse=True)
linear_train = linear_train.to_dict(orient='records')
x_train = cv.fit_transform(linear_train)

linear_val = linear_val.to_dict(orient='records')
x_val = cv.transform(linear_val)

In [125]:
from sklearn.linear_model import LinearRegression

In [126]:
y_test


array([10.29876764, 10.32535081, 10.53611479, ..., 10.3656795 ,
        9.95208722, 10.96302978])

In [127]:
alpha_values = [0, 0.01, 0.1, 1, 10]

In [128]:
rmse_scores = {}

In [129]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt

In [130]:
linear_train['year']

TypeError: list indices must be integers or slices, not str

In [146]:
for alpha in alpha_values:
    linear_model = Ridge(alpha=alpha, solver='sag', random_state=42)
    linear_model.fit(x_train, y_train)
    y_pred = linear_model.predict(x_val)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    
    rmse_scores[alpha] = round(rmse, 3)


In [147]:
rmse_scores

{10: 1.537, 0: 1.574, 0.01: 1.574, 0.1: 1.573, 1: 1.565}

alpha 0, ans 1.574
alpha 0.01, ans 1.565


KeyError: 2

In [138]:
print('sdf')

sdf
