In [4]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split

In [5]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [6]:
!wget $data

--2023-10-02 05:00:53--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv.1’


2023-10-02 05:00:53 (54.3 MB/s) - ‘data.csv.1’ saved [1475504/1475504]



In [7]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


### Data Prepration

In [8]:
feature_list = ["Make","Model","Year","Engine HP","Engine Cylinders","Transmission Type","Vehicle Style","highway MPG","city mpg","msrp"]

In [9]:
df.columns = df.columns.str.lower().str.replace(" ","_")


string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [10]:
feature_list= ["make","model","year","engine_hp","engine_cylinders","transmission_type","vehicle_style","highway_mpg","city_mpg","msrp"]
df_subset = df[feature_list]
df_subset


Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920


In [8]:

df_subset.isnull().sum()




make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [11]:
df_subset = df_subset.fillna(0)

In [12]:
df_subset.rename(columns={'msrp':'price'},inplace=True)

In [95]:
df_subset['transmission_type'].mode()

0    automatic
Name: transmission_type, dtype: object

In [30]:
df_subset['transmission_type'].value_counts()

automatic           8266
manual              2935
automated_manual     626
direct_drive          68
unknown               19
Name: transmission_type, dtype: int64

In [116]:
df_subset['engine_hp'].corr(df.year)

0.3387141847624468

In [153]:
numeric_columns = list(df_subset.select_dtypes([np.number]).columns)
numeric_columns

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price']

In [145]:
df_subset[numeric_columns].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [154]:
above_average = df_subset['price'].mean()
above_average

40594.737032063116

In [155]:
df_subset['price'] = list(df_subset['price'].apply(lambda x:1 if x > above_average else 0))
df_subset

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,1
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,1
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,0
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,0
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,0
...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,1
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,1
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,1
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,1


In [13]:
df_full_train,df_test = train_test_split(df_subset,test_size=0.2,random_state=42)

In [14]:
df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [15]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [159]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

y_train

array([0, 0, 1, ..., 0, 0, 0])

In [160]:
categorical =  list(df_subset.dtypes[df_subset.dtypes == 'object'].index)
categorical

['make', 'model', 'transmission_type', 'vehicle_style']

In [105]:
def calculate_mi(series):
    return mutual_info_score(series, df_train.price)

df_mi = df_train[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())
display(df_mi.tail())

Unnamed: 0,MI
model,5.565079
make,2.802731
vehicle_style,1.754973
transmission_type,0.600021


Unnamed: 0,MI
model,5.565079
make,2.802731
vehicle_style,1.754973
transmission_type,0.600021


In [161]:
del df_train['price']
del df_val['price']
del df_test['price']

In [19]:
from sklearn.feature_extraction import DictVectorizer

In [23]:
# cat = ['make','model','transmission_type','vehicle_style','engine_hp','city']
dicts = df_train.to_dict(orient='records')

In [24]:
dv = DictVectorizer(sparse=False)

In [25]:

X_train = dv.fit_transform(dicts)

In [26]:
X_train.shape

(7148, 943)

In [27]:
val_dicts = df_val.to_dict(orient='records')
val_dicts[0]

{'make': 'volkswagen',
 'model': 'beetle',
 'year': 2015,
 'engine_hp': 210.0,
 'engine_cylinders': 4.0,
 'transmission_type': 'manual',
 'vehicle_style': '2dr_hatchback',
 'highway_mpg': 31,
 'city_mpg': 23}

In [28]:
X_val = dv.transform(val_dicts)
X_val
X_val.shape

(2383, 943)

In [84]:
def logistic_regression(xi):
  score = w0

  for j in range(len(w)):
    score = score + xi[j] * w[j]

  result = sigmoid(score)

  return result


In [223]:
from sklearn.linear_model import LogisticRegression

In [224]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)


In [225]:
model.fit(X_train,y_train)

In [226]:
model.predict(X_train)

array([0, 0, 1, ..., 0, 0, 0])

In [227]:
y_pred = model.predict_proba(X_val)[:,1]

In [228]:
price_pred = (y_pred>=0.5)

In [229]:
price_pred = price_pred.astype(int)

In [230]:
(y_val==price_pred).mean()

0.9471254720939991

In [231]:
df_pred_q4 = pd.DataFrame()
df_pred_q4['probablity'] = y_pred
df_pred_q4['prediction'] = price_pred.astype(int)
df_pred_q4['actual'] = y_val





In [232]:
df_pred_q4['correct'] = df_pred_q4['prediction'] == df_pred_q4['actual']

In [235]:
(df_pred_q4['correct'].mean()).round(2)

0.95

In [236]:
df_train.dtypes


make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

Question 5

In [216]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)


In [254]:
def accuracy(li):
  dicts = df_train[li].to_dict(orient='records')
  dv = DictVectorizer(sparse=False)
  X_train = dv.fit_transform(dicts)
  val_dicts = df_val[li].to_dict(orient='records')
  X_val = dv.transform(val_dicts)
  model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
  model.fit(X_train,y_train)
  model.predict(X_train)
  y_pred = model.predict_proba(X_val)[:,1]
  price_pred = (y_pred>=0.5)
  price_pred = price_pred.astype(int)
  return (y_val==price_pred).mean()





  # cat = []


In [261]:
li = ['make','model','transmission_type','vehicle_style','engine_hp','year','engine_hp','engine_cylinders','highway_mpg','city_mpg']
n = 9
while n >=1:
  print("accuracy without {}".format(li[n] ))
  print(accuracy(li[:n]))
  print()
  n -= 1

accuracy without city_mpg


  dicts = df_train[li].to_dict(orient='records')
  val_dicts = df_val[li].to_dict(orient='records')


0.9324381032312211

accuracy without highway_mpg


  dicts = df_train[li].to_dict(orient='records')
  val_dicts = df_val[li].to_dict(orient='records')
  dicts = df_train[li].to_dict(orient='records')


0.9211078472513639

accuracy without engine_cylinders


  val_dicts = df_val[li].to_dict(orient='records')


0.9429290809903483

accuracy without engine_hp
0.9429290809903483

accuracy without year
0.9475451112043642

accuracy without engine_hp
0.9341166596726815

accuracy without vehicle_style
0.9227864036928242

accuracy without transmission_type
0.922366764582459

accuracy without model
0.8552245069240453



In [16]:
y_train_org = df_train.price.values
y_val_org = df_val.price.values
y_test_org = df_test.price.values

y_train_org

array([ 33599,  26245, 248000, ...,  28345,   2000,  40220])

In [17]:
y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

del df_train['price']
del df_val['price']
del df_test['price']

In [20]:
from sklearn.preprocessing import StandardScaler
numerical_cols_df = [
      col for col in df_train.columns if df_train[col].dtype != 'object'
  ]
scaler = StandardScaler()
for col in numerical_cols_df:
  df_train[col] = scaler.fit_transform(df_train[col].values.reshape(-1, 1))
  df_val[col] = scaler.transform(df_val[col].values.reshape(-1, 1))
  df_test[col] = scaler.transform(df_test[col].values.reshape(-1, 1))

dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts)
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [21]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge


In [22]:
model = Ridge(alpha=0.1,solver='sag', random_state=42, max_iter=7600)
model.fit(X_train,y_train)

In [26]:
y_pred = model.predict(X_val)

In [27]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print('RMSE:', rmse)

RMSE: 0.21830657372954623


In [28]:
alpha_li = [0,0.01,0.1,1,10]
for i in alpha_li:
  model = Ridge(alpha=i,solver='sag', random_state=42, max_iter=7600)
  model.fit(X_train,y_train)
  y_pred = model.predict(X_val)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  print('RMSE: ', rmse)
  print('alpha:', i)



RMSE:  0.2176869702212026
alpha: 0
RMSE:  0.21772931784183022
alpha: 0.01
RMSE:  0.21830657372954623
alpha: 0.1
RMSE:  0.23128740665436523
alpha: 1
RMSE:  0.32101136901021105
alpha: 10
