In [1]:
#Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

In [2]:
# Read the cleaned cars table from disk and preview its first five rows.
# NOTE(review): if the CSV was written together with its index, a stray
# 'Unnamed: 0' column would later be treated as a feature — confirm with
# df_cars.columns.
df_cars = pd.read_csv(filepath_or_buffer="../data/clean/cars_clean.csv")
df_cars.head(n=5)

Unnamed: 0,make,model,year,engine fuel type,engine hp,engine cylinders,transmission type,driven_wheels,number of doors,market category,vehicle size,vehicle style,highway mpg,city mpg,popularity,price
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [11]:
# Function to rank categorical features based on mean MSRP

def calculate_mean_price_ratio(df, ordinal_columns):
    """Encode categorical columns as mean-price ratios.

    For every column in ``ordinal_columns`` the rows of ``df`` are grouped by
    that column, the mean of ``'price'`` is computed per category, and each
    mean is divided by the lowest category mean. The category labels are then
    replaced by the resulting ratio (the cheapest category becomes 1.0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``'price'`` column and every column named in
        ``ordinal_columns``. It is left untouched; a copy is encoded.
    ordinal_columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple
        ``(encoded_df, result_dict)``: ``encoded_df`` is a copy of ``df`` with
        the listed columns replaced by ratios, and ``result_dict`` maps each
        column name to its ``{category: ratio}`` dictionary.

    Notes
    -----
    Missing categories are dropped by ``groupby``, so they get no ratio and
    remain missing in the encoded column instead of raising ``KeyError``.

    NOTE(review): the ratios are computed from the target column; when this is
    applied before a train/test split, test prices influence the features —
    consider deriving the mapping from training rows only.
    """
    # Work on a copy so the caller's frame is never modified in place.
    encoded = df.copy()
    result_dict = {}
    for col in ordinal_columns:
        # Mean price per category, most expensive first.
        mean_price = df.groupby(col)['price'].mean().sort_values(ascending=False)
        # Divide by the lowest mean, selected by value rather than by position.
        ratio = mean_price / mean_price.min()
        # One entry per observed category; no per-label lookup that could fail.
        col_result = ratio.to_dict()
        result_dict[col] = col_result
        encoded[col] = df[col].map(col_result)
    return encoded, result_dict

# Replace the labels of every categorical column with its mean-price ratio and
# keep the per-column {label: ratio} lookups in result_dictionary.
df_cars, result_dictionary = calculate_mean_price_ratio(
    df=df_cars,
    ordinal_columns=[
        'make',
        'engine fuel type',
        'model',
        'transmission type',
        'driven_wheels',
        'market category',
        'vehicle size',
        'vehicle style',
    ],
)

In [12]:
# Show the frame after the categorical columns were replaced by price ratios.
df_cars

Unnamed: 0,make,model,year,engine fuel type,engine hp,engine cylinders,transmission type,driven_wheels,number of doors,market category,vehicle size,vehicle style,highway mpg,city mpg,popularity,price
0,15.096756,23.067500,2011,3.880438,335.0,6.0,4.716647,2.690938,2.0,7.225191,1.000000,5.507340,26,19,3916,46135
1,15.096756,18.926562,2011,3.880438,300.0,6.0,4.716647,2.690938,2.0,3.783921,1.000000,5.659647,28,19,3916,40650
2,15.096756,18.926562,2011,3.880438,300.0,6.0,4.716647,2.690938,2.0,7.210302,1.000000,5.507340,28,20,3916,36350
3,15.096756,18.926562,2011,3.880438,230.0,6.0,4.716647,2.690938,2.0,3.783921,1.000000,5.507340,28,18,3916,29450
4,15.096756,18.926562,2011,3.880438,230.0,6.0,4.716647,2.690938,2.0,2.649633,1.000000,5.659647,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8167,8.557548,25.531429,2012,3.880438,300.0,6.0,5.950325,2.374099,4.0,4.146657,1.037504,1.329218,23,16,204,46120
8168,8.557548,25.531429,2012,3.880438,300.0,6.0,5.950325,2.374099,4.0,4.146657,1.037504,1.329218,23,16,204,56670
8169,8.557548,25.531429,2012,3.880438,300.0,6.0,5.950325,2.374099,4.0,4.146657,1.037504,1.329218,23,16,204,50620
8170,8.557548,25.531429,2013,1.560714,300.0,6.0,5.950325,2.374099,4.0,4.146657,1.037504,1.329218,23,16,204,50920


In [None]:
# Average price per make, most expensive first.
# NOTE(review): this cell and the ones after it repeat by hand what
# calculate_mean_price_ratio does for 'make' — confirm they are still needed.
make_price = (
    df_cars
    .groupby('make')['price']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
make_price

In [None]:
# Express every make's mean price as a multiple of the cheapest make's mean.
# The baseline is read from the 'price' column by name: a positional
# iloc[-1, -1] would pick the 'ratio' column once it exists, so re-running
# the cell would silently produce different numbers.
make_price['ratio'] = make_price['price'] / make_price['price'].min()
make_price

In [None]:
# Index make_price by make. The guard keeps the cell re-runnable: once 'make'
# has become the index it is no longer a column and set_index would raise.
if "make" in make_price.columns:
    make_price = make_price.set_index("make")

# {make: ratio} lookup taken straight from the indexed column; this avoids a
# per-label .loc lookup, which raises KeyError for missing (NaN) makes.
result = make_price['ratio'].to_dict()

In [None]:
# Replace each value in 'make' with the ratio stored for it in `result`, then
# display the frame.
# NOTE(review): values without a key in `result` become NaN, so re-running
# this cell after the column has been encoded depends on the keys still
# matching — confirm.
df_cars['make'] = df_cars['make'].map(result)
df_cars

In [None]:
# Separate the value to predict ('price') from the predictor columns.
target = df_cars['price']
features = df_cars.drop(columns='price')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# List the object-dtype (categorical) columns of the training split.
# Derived directly from X_train because X_train_cat is only created in a
# later cell, which would raise NameError on a fresh Restart & Run All.
X_train.select_dtypes(include=[object]).columns

In [None]:
# Number of distinct 'make' values present in the test split.
X_test['make'].nunique()

In [None]:
# One-hot encode the object-dtype columns of the training split.
# NOTE(review): calculate_mean_price_ratio above already converts the listed
# categorical columns to floats — confirm that object columns remain here.

# Take the category list of every object column from the full feature table
# so the encoder also knows labels that appear only in the test split.
X_cat = features.select_dtypes('object')
cat = [list(X_cat[col].unique()) for col in X_cat.columns]

# Initiate OneHotEncoder (dense output so it can be wrapped in a DataFrame)
ohe = OneHotEncoder(sparse_output=False, categories=cat)

# Split train set into categorical and numerical variables
X_train_cat = X_train.select_dtypes(include=[object])

# TODO: further split X_train_cat = X_train_cat_nom + X_train_cat_ord

# Drop the categorical columns by name to keep only the numerical ones
X_train_num = X_train.drop(columns=X_train_cat.columns)

# Fit OHE with categorical variables and transform them into numerical variables
ohe.fit(X_train_cat)
X_train_trans_np = ohe.transform(X_train_cat)

# Convert the array to a DataFrame that shares the row index of X_train
X_train_trans_df = pd.DataFrame(
    X_train_trans_np,
    columns=ohe.get_feature_names_out(),
    index=X_train.index,
)

# Concatenate the transformed train dataframe with the numerical train df
df_train = pd.concat([X_train_trans_df, X_train_num], axis=1)
df_train.head()

In [None]:
# Encode the test split with the encoder fitted on the training split
# (transform only, no refitting).

# Object-dtype columns go through the encoder; all remaining columns are kept.
X_test_cat = X_test.select_dtypes(include=[object])
X_test_num = X_test.drop(columns=X_test_cat.columns)

# One-hot encode the categorical part
X_test_trans_np = ohe.transform(X_test_cat)

# Wrap the encoded array in a DataFrame aligned with the rows of X_test
X_test_trans_df = pd.DataFrame(
    X_test_trans_np,
    index=X_test.index,
    columns=ohe.get_feature_names_out(),
)

# Put the encoded and the numerical columns side by side
df_test = pd.concat([X_test_trans_df, X_test_num], axis=1)
df_test.head()