In [1]:
#Importing essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so library deprecation/convergence warnings raised by later cells are hidden too.
warnings.filterwarnings('ignore')
# Show up to 40 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 40)

In [None]:
#Load the car price data, using the car_ID column as the row index
cars = pd.read_csv(
    'CarPrice_Assignment.csv',
    index_col='car_ID',
)
#Preview the first rows
cars.head()

In [3]:
#Normalise every column label to lower case
cars.columns = cars.columns.map(str.lower)

In [None]:
#Count fully duplicated rows and report the total
n_duplicates = cars.duplicated().sum()
print(f'There are {n_duplicates} number of duplicates')
#Missing values per column
cars.isnull().sum()

In [None]:
#Statistical summary of the data, using DataFrame.describe() with its default arguments
cars.describe()

In [6]:
#Split each car name on spaces/punctuation and keep the first token (the manufacturer)
pat = r"[ ,.!?-]+"
carnames = cars['carname'].str.split(pat)
carlist = [tokens[0] for tokens in carnames]

#Overwrite carname with the lower-cased manufacturer names
cars['carname'] = pd.Series(carlist, index=cars.index).str.lower()

In [7]:
#Cleaning the name column by mapping misspelt/abbreviated manufacturer names to one canonical spelling
# NOTE(review): 'porcshce' and 'vw' are added on the assumption that the raw data contains
# these variants — confirm with cars['carname'].unique(). Keys that never occur are harmless.
replacements = {
    'toyouta': 'toyota',
    'porcshe': 'porsche',
    'porcshce': 'porsche',
    'maxda': 'mazda',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
}
cars['carname'] = cars['carname'].replace(replacements)

In [8]:
#Word -> integer lookups for the two count columns that are stored as text
door_words = {'two': 2, 'four': 4}
cylinder_words = {
    'four': 4, 'six': 6, 'five': 5, 'three': 3,
    'twelve': 12, 'two': 2, 'eight': 8,
}

#Replace the number words in the door and cylinder columns with their integer values
cars['doornumber'] = cars['doornumber'].replace(door_words)
cars['cylindernumber'] = cars['cylindernumber'].replace(cylinder_words)

In [9]:
#Creating a function for EDA
def viz_plot(df, column, target= 'price'):
    """
    Plot a column with charts chosen from its data type.

    Numeric columns get a three-panel figure: a histogram with KDE, a boxplot,
    and a scatterplot with ``column`` on the y-axis and ``target`` on the x-axis.
    Object, string and categorical columns get a countplot.
    Any other dtype only prints a message.

    Parameters
    ----------
    df : DataFrame
            The dataframe containing the data
    column : str
            The column to be plotted
    target : str, default 'price'
            Column drawn on the x-axis of the scatterplot (numeric columns only)

    Returns
    -------
    None
        Displays the plot
    """
    series = df[column]
    dtypes = pd.api.types

    #For numerical columns
    if dtypes.is_numeric_dtype(series):
        fig, ax = plt.subplots(1, 3, figsize=(12, 5))

        #histogram plot
        sns.histplot(series, kde=True, ax=ax[0])
        ax[0].set_title(f'Histogram of {column}')

        #boxplot
        sns.boxplot(series, ax=ax[1])
        ax[1].set_title(f'Boxplot of {column}')

        #scatterplot: column is on the y-axis, so the title reads "<column> against <target>"
        sns.scatterplot(data=df, x=target, y=column, alpha=0.5, ax=ax[2])
        ax[2].set_title(f'Scatterplot of {column} against {target}')

        plt.tight_layout()
        plt.show()
    #For non-numerical columns: object, dedicated string dtype, or categorical
    elif (
        dtypes.is_object_dtype(series)
        or dtypes.is_string_dtype(series)
        or isinstance(series.dtype, pd.CategoricalDtype)
    ):
        plt.figure(figsize=(8, 6))
        sns.countplot(x=series)
        plt.title(f'Countplot of {column}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print('Unsupported column data type')

In [10]:
#Creating function to filter for outliers
def outliers(df, column):
    """
    Flag the rows of a column that lie inside the 1.5 * IQR fences.

    Parameters
    ----------
    df : DataFrame
            The dataframe containing the data
    column : str
            The column to be filtered on

    Returns
    -------
    flr : Series of bool
            True where the value lies strictly between the lower and upper
            boundary (i.e. the row is NOT an outlier). Missing values are
            flagged False. Use ``df[flr]`` to obtain the filtered dataframe.
    upp_bd : float
            Upper boundary, 75th percentile + 1.5 * IQR
    lower_bd : float
            Lower boundary, 25th percentile - 1.5 * IQR
    """
    values = df[column]
    #calculating the 25th and 75th percentile; nanpercentile skips missing values
    #so a single NaN does not turn both boundaries into NaN
    q25, q75 = np.nanpercentile(values, [25, 75])
    #calculating iqr
    iqr = q75 - q25
    #defining upper and lower boundary
    upp_bd = q75 + (iqr * 1.5)
    lower_bd = q25 - (iqr * 1.5)
    #boolean mask of the rows that fall strictly inside the boundaries
    flr = (values < upp_bd) & (values > lower_bd)

    return flr, upp_bd, lower_bd

In [None]:
#Correlation of every numeric feature with price, strongest positive first
corr_matrix = cars.corr(numeric_only=True)
price_corr = corr_matrix['price'].sort_values(ascending=False)
price_corr

In [None]:
# Number of rows in the data for each manufacturer name
plt.figure(figsize=(10,5))
sns.countplot(data=cars, x='carname', palette='viridis')
plt.title('Count per car')
plt.xlabel('Car Names')
plt.xticks(rotation=45)
# tight_layout runs after the tick labels are rotated so the rotated labels
# are taken into account when the margins are computed
plt.tight_layout()
plt.show()

In [None]:
#Relationship between engine size and price
plt.figure(figsize=(8,5))
# No palette here: there is no hue variable, so a palette has nothing to colour by
sns.scatterplot(data=cars, x='price', y='enginesize', alpha=0.5)
plt.xlabel('Price')
plt.ylabel('Engine size')
plt.title('Relationship between engine size and price')
plt.show()

In [None]:
#Number of rows for each car body type
fig, ax = plt.subplots(figsize=(8,5))
sns.countplot(data=cars, x='carbody', palette='viridis', ax=ax)
ax.set_xlabel('Car Body')
ax.set_title('Count per Car body')
plt.show()

In [None]:
#Number of rows for each fuel type
fig, ax = plt.subplots(figsize=(8,5))
sns.countplot(data=cars, x='fueltype', palette='viridis', ax=ax)
ax.set_xlabel('Fuel Type')
ax.set_title('Count by Fuel type')
plt.show()

In [None]:
#Engine size against price, with points coloured by car body type
fig, ax = plt.subplots(figsize=(8,5))
sns.scatterplot(data=cars, x='price', y='enginesize', hue='carbody', alpha=0.5, ax=ax)
ax.set_xlabel('Price')
ax.set_ylabel('Engine Size')
ax.set_title('Relationship Between Engine Size and Price per Car Body Type')
plt.show()

In [17]:
#Feature Engineering
#Power to weight ratio
cars['powerweightratio'] = cars['horsepower'] / cars['curbweight']

#Car volume: length x width x height (the bounding box of the car)
cars['carvolume'] = cars['carlength'] * cars['carwidth'] * cars['carheight']

#Average mpg: Takes the average of the city and highway mpg to get a balanced fuel consumption
cars['averagempg'] = (cars['highwaympg'] + cars['citympg']) / 2

#Compactness ratio: a lower ratio indicates a more compact design
cars['compactnessratio'] = cars['carlength'] / cars['carwidth']


In [18]:
#Columns removed before modelling: the raw mpg and dimension columns,
#plus symboling, peakrpm, doornumber, stroke and compressionratio
dropped_columns = [
    'citympg', 'highwaympg', 'symboling', 'peakrpm', 'doornumber',
    'carwidth', 'carlength', 'carheight', 'stroke', 'compressionratio',
]
cars_c = cars.drop(columns=dropped_columns)

In [19]:
#One-hot encode the categorical columns as integer 0/1 columns, dropping the first level of each
categorical_columns = [
    'fueltype', 'aspiration', 'carbody', 'drivewheel',
    'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem',
]
cars_encoded = pd.get_dummies(
    cars_c,
    columns=categorical_columns,
    drop_first=True,
    sparse=False,
    dtype='int',
)
#Copy of the encoded frame without the manufacturer name column
new_cars = cars_encoded.drop(columns='carname')

In [20]:
#importing model libraries
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

#Instantiating the scaler and the two linear models, all with their default settings
scaler= StandardScaler()
ridge= Ridge()
lasso= Lasso()

In [None]:
#Splitting data: features are every encoded column except the target and the manufacturer name
x = cars_encoded.drop(['price', 'carname'], axis=1)
#Target is taken from the same frame as the features so both always share one index
y = cars_encoded['price']
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, random_state=42, shuffle=True, test_size=0.1)

#Scaling data: statistics are learned on the training split only, then applied to the test split
# NOTE(review): the scaler is fitted before cross-validation, so every CV fold is scaled with
# statistics computed from the whole training split; wrap scaler + model in a Pipeline if
# fold-level isolation is required.
x_train = scaler.fit_transform(x_tr)
x_test = scaler.transform(x_ts)

#Search space and fold definition shared by both models
parameter = {'alpha': np.linspace(0.001, 1)}
kfd = KFold(n_splits=10, shuffle=True, random_state=42)


def _search_and_score(estimator):
    """
    Tune ``alpha`` for one estimator and evaluate it on the held-out test split.

    Parameters
    ----------
    estimator : estimator object
            Model passed to RandomizedSearchCV (here ``lasso`` or ``ridge``)

    Returns
    -------
    tuple
            (fitted search object, test predictions, search.score on the test
            split, test MSE, test RMSE)
    """
    #random_state makes the sampled alpha candidates reproducible between runs
    search = RandomizedSearchCV(estimator, param_distributions=parameter, cv=kfd, random_state=42)
    search.fit(x_train, y_tr)
    predictions = search.predict(x_test)
    score = search.score(x_test, y_ts)
    mse_value = mean_squared_error(y_ts, predictions)
    return search, predictions, score, mse_value, np.sqrt(mse_value)


#Model cross validation, fitting and metrics for Lasso model
lasso_cv, l_pred, lr2_score, lmse, lrmse = _search_and_score(lasso)

#Model cross validation, fitting and metrics for Ridge model
#(`cv` ends up holding the Ridge search, as it did before)
cv, r_pred, R2_score, mse, rmse = _search_and_score(ridge)

print(f'Lasso R-squared: {np.round(lr2_score, 2)}')
print(f'Lasso MSE: {np.round(lmse, 2)}')
print(f'Lasso RMSE: {np.round(lrmse, 2)}\n')
print(f'Ridge R-squared: {np.round(R2_score, 2)}')
print(f'Ridge MSE: {np.round(mse, 2)}')
print(f'Ridge RMSE: {np.round(rmse, 2)}')