Answers:
1) AUTOMATIC  
2) highway_mpg and city_mpg  
3) transmission_type  
4) 0.95  
5) year  
6) 0

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
df.shape

(11914, 16)

In [5]:
columns = ["Make", "Model", "Year", "Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style", "highway MPG", "city mpg", "MSRP"]
df = df[columns]

In [6]:
df.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [7]:
df.columns = df.columns.str.replace(" ", "_").str.lower()

In [8]:
df = df.fillna(0)

In [9]:
df.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [10]:
df = df.rename({"msrp": "price"}, axis=1)

In [11]:
df.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

In [12]:
numeric_cols = list(df.select_dtypes(include=["int", "float"]).columns)
categorical_cols = list(df.select_dtypes(include=["object"]).columns)
numeric_cols

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price']

In [13]:
df[numeric_cols].corr().style.background_gradient(cmap="coolwarm").format(precision=2)

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.34,-0.04,0.26,0.2,0.23
engine_hp,0.34,1.0,0.77,-0.42,-0.42,0.65
engine_cylinders,-0.04,0.77,1.0,-0.61,-0.59,0.53
highway_mpg,0.26,-0.42,-0.61,1.0,0.89,-0.16
city_mpg,0.2,-0.42,-0.59,0.89,1.0,-0.16
price,0.23,0.65,0.53,-0.16,-0.16,1.0


highway_mpg and city_mpg have the biggest correlation

In [14]:
mean_price = df.price.mean()
above_average = (df.price > mean_price).astype(int)
above_average.head()

0    1
1    1
2    0
3    0
4    0
Name: price, dtype: int32

In [15]:
new_df = df.drop("price", axis=1)

In [16]:
X_full_train, X_test, y_full_train, y_test = train_test_split(new_df, above_average, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.2, random_state=42)

In [17]:
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((7624, 9), (1907, 9), (2383, 9), (7624,), (1907,), (2383,))

In [18]:
def mutual_info_price_score(series):
    return mutual_info_score(y_train, series).round(2)

In [19]:
mi = X_train[categorical_cols].apply(mutual_info_price_score)
mi.sort_values(ascending=False)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

The lowest score: transmission_type: 0.02

In [20]:
numeric_cols.remove("price")

In [21]:
dv = DictVectorizer(sparse=False)

train_dict = X_train[categorical_cols + numeric_cols].to_dict(orient="records")
X_train_ = dv.fit_transform(train_dict)

val_dict = X_val[categorical_cols + numeric_cols].to_dict(orient="records")
X_val_ = dv.transform(val_dict)

In [22]:
model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)

model.fit(X_train_, y_train)

In [23]:
price_decision = model.predict(X_val_)

In [24]:
original_accuracy = (y_val == price_decision).mean().round(2)
original_accuracy

0.94

Score: 0.95

In [25]:
all_cols = numeric_cols + categorical_cols

def get_acc_without_col(col):
    amputated_cols = all_cols.copy()
    amputated_cols.remove(col)
    
    dv = DictVectorizer(sparse=False)
    
    train_dict = X_train[amputated_cols].to_dict(orient="records")
    X_train_ = dv.fit_transform(train_dict)
    
    val_dict = X_val[amputated_cols].to_dict(orient="records")
    X_val_ = dv.transform(val_dict)
    
    model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
    model.fit(X_train_, y_train)
    
    price_decision = model.predict(X_val_)
    val_accuracy = (y_val == price_decision).mean()
    
    return (original_accuracy - val_accuracy).round(3)

In [26]:
diff_acc = {}

for col in all_cols:
    diff_acc[col] = get_acc_without_col(col)

print(*sorted(diff_acc.items(), key=lambda x: x[1]))

('year', -0.007) ('highway_mpg', -0.006) ('make', -0.005) ('transmission_type', -0.005) ('city_mpg', -0.002) ('engine_cylinders', -0.0) ('vehicle_style', 0.008) ('engine_hp', 0.011) ('model', 0.02)


The smallest difference: "year", -0.007

In [27]:
df_log_price = df.copy()
df_log_price.price = np.log1p(df_log_price.price)

In [28]:
X_full_train, X_test, y_full_train, y_test = train_test_split(df_log_price.drop(["price"], axis=1), df_log_price.price, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.2, random_state=42)

In [29]:
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((7624, 9), (1907, 9), (2383, 9), (7624,), (1907,), (2383,))

In [30]:
dv = DictVectorizer(sparse=False)

train_dict = X_train.to_dict(orient="records")
X_train_ = dv.fit_transform(train_dict)

val_dict = X_val.to_dict(orient="records")
X_val_ = dv.transform(val_dict)

In [31]:
model = Ridge(solver="sag", max_iter=1000, random_state=42)

model.fit(X_train_, y_train)
None



In [32]:
def rmse(y, y_pred):
    se = (y_pred - y) ** 2
    mse = se.mean()
    
    return np.sqrt(mse)

In [33]:
reg_prediction = model.predict(X_val_)
rmse(y_val, reg_prediction)

0.482546524041356

In [34]:
def get_rmse_from_alpha(a):
    model = Ridge(alpha=a, solver="sag", max_iter=1000, random_state=42)
    model.fit(X_train_, y_train);

    pred = model.predict(X_val_)
    
    return rmse(y_val, pred)

In [35]:
alphas = [0, 0.01, 0.1, 1, 10]
rmses = {}

for a in alphas:
    rmses[a] = get_rmse_from_alpha(a).round(3)



In [36]:
dict(sorted(rmses.items(), key=lambda x: x[1]))

{0: 0.483, 0.01: 0.483, 0.1: 0.483, 1: 0.483, 10: 0.483}