# Import Libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, mean_squared_error, mutual_info_score
from sklearn.model_selection import train_test_split

## Prepare files

In [2]:
url_housing = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv"

data_dir = Path.cwd().parent / "data"
csv_data = data_dir / "data.csv"

data_dir.mkdir(exist_ok=True)

with requests.Session() as s:
    r = s.get(url_housing)

with csv_data.open("wb") as f:
    f.write(r.content)

## Data Loading

In [3]:
df = pd.read_csv(csv_data)

df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [5]:
df.isna().sum()

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

## Preparing the dataset

In [6]:
df_ori = df.copy()

In [7]:
df = df_ori.copy()

cols = [
    "Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
]

df = df.loc[:, cols + ["MSRP"]]
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.fillna(0, inplace=True)
df.rename(columns={"msrp": "price"}, inplace=True)

In [8]:
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   price              11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


In [10]:
df.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

## Question 1

In [11]:
df.describe(include="O")

Unnamed: 0,make,model,transmission_type,vehicle_style
count,11914,11914,11914,11914
unique,48,915,5,16
top,Chevrolet,Silverado 1500,AUTOMATIC,Sedan
freq,1123,156,8266,3048


It's `AUTOMATIC`

## Question 2

In [12]:
numerical_cols = [
    "year",
    "engine_hp",
    "engine_cylinders",
    "highway_mpg",
    "city_mpg",
]

categorical_cols = [
    "make",
    "model",
    "transmission_type",
    "vehicle_style",
]

In [13]:
corr = df[numerical_cols].corrwith(df["price"])

# sorted
print(corr.sort_values(ascending=False))

engine_hp           0.650095
engine_cylinders    0.526274
year                0.227590
city_mpg           -0.157676
highway_mpg        -0.160043
dtype: float64


In [14]:
# sort the absolute values in order to see the strength of the correlation
corr.abs().sort_values(ascending=False)

engine_hp           0.650095
engine_cylinders    0.526274
year                0.227590
highway_mpg         0.160043
city_mpg            0.157676
dtype: float64

`engine_hp` and `engine_cylinders`

#### Making price binary

In [15]:
df["above_average"] = (df["price"] > df["price"].mean()).astype(int)

In [16]:
print(f"{df['price'].mean():.0f}")
df.head()

40595


Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


### Splitting the data

In [17]:
X = df.drop(columns=["price", "above_average"])
y = df[["price", "above_average"]]

In [18]:
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

## Question 3

In [19]:
{
    col: round(mutual_info_score(X_full_train[col], y_full_train["above_average"]), 2)
    for col in categorical_cols
}

{'make': 0.24, 'model': 0.46, 'transmission_type': 0.02, 'vehicle_style': 0.08}

`vehicle_style`

## Question 4

### Encoding

In [20]:
dv = DictVectorizer(sparse=False)

train_dict = X_train.to_dict(orient="records")
X_train_encoded = dv.fit_transform(train_dict)

val_dict = X_val.to_dict(orient="records")
X_val_encoded = dv.transform(val_dict)

In [21]:
model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)

In [22]:
model.fit(X_train_encoded, y_train["above_average"])

In [23]:
accuracy = accuracy_score(y_val["above_average"], model.predict(X_val_encoded))
print(round(accuracy, 2))

0.93


## Question 5

In [24]:
feature_elimination_scores = {}
for col in categorical_cols + numerical_cols:
    dv = DictVectorizer(sparse=False)

    train_dict = X_train.drop(columns=col).to_dict(orient="records")
    X_train_encoded = dv.fit_transform(train_dict)

    val_dict = X_val.drop(columns=col).to_dict(orient="records")
    X_val_encoded = dv.transform(val_dict)

    model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)

    model.fit(X_train_encoded, y_train["above_average"])

    feature_elimination_scores[col] = round(
        accuracy_score(y_val["above_average"], model.predict(X_val_encoded)), 2
    )

In [25]:
pd.Series(
    {feat: (accuracy - score) for feat, score in feature_elimination_scores.items()}
).abs().sort_values()

vehicle_style        0.004536
engine_hp            0.004536
model                0.014536
make                 0.015464
transmission_type    0.015464
year                 0.015464
engine_cylinders     0.015464
highway_mpg          0.015464
city_mpg             0.015464
dtype: float64

Out of the choices given, the answer is `engine_hp`

## Question 6

In [26]:
dv = DictVectorizer(sparse=False)

train_dict = X_train.to_dict(orient="records")
X_train_encoded = dv.fit_transform(train_dict)
y_train_price_log = y_train["price"].apply(np.log)

val_dict = X_val.to_dict(orient="records")
X_val_encoded = dv.transform(val_dict)
y_val_price_log = y_val["price"].apply(np.log)

alpha_scores = {}
for alpha in [0, 0.01, 0.1, 1, 10]:
    ridge = Ridge(alpha=alpha, solver="sag", random_state=42)

    ridge.fit(X_train_encoded, y_train_price_log)

    alpha_scores[alpha] = np.sqrt(
        mean_squared_error(y_val_price_log, ridge.predict(X_val_encoded))
    )



In [27]:
pd.Series(alpha_scores).sort_values()

0.00     0.486860
0.01     0.486860
0.10     0.486862
1.00     0.486884
10.00    0.487098
dtype: float64

`0.00`