In [None]:
This is my first Kaggle notebook. The primary goal here is to analyse the data, explore various data preprocessing techniques and regression models, and build a model that predicts the prices.

In [None]:
!pip install fancyimpute

In [None]:
!pip install fasteda

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from scipy.special import boxcox1p
import plotly.express as px
# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.metrics import r2_score
#RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn import ensemble
from sklearn.model_selection import cross_val_score,GridSearchCV,RepeatedStratifiedKFold,StratifiedKFold,KFold
import seaborn as sns
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder, CatBoostEncoder
from imblearn.under_sampling import RandomUnderSampler
from fasteda import fast_eda
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Import libraries for Hypertuning
import optuna

# Import libraries for gradient boosting
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Load the raw sports-car price table from the Kaggle input directory
DATA_PATH = "/kaggle/input/sports-car-prices-dataset/Sport car price.csv"
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Quick overview of the raw frame (columns, dtypes, non-null counts) before any cleaning
dataset.info()

In [None]:
# Map the verbose CSV headers to short snake_case names that are easier to reference
COLUMN_RENAMES = {
    'Car Make': 'car_make',
    'Car Model': 'car_model',
    'Year': 'year',
    'Engine Size (L)': 'engine_size_L',
    'Horsepower': 'horsepower',
    'Torque (lb-ft)': 'torque',
    '0-60 MPH Time (seconds)': 'acceleration_seconds',
    'Price (in USD)': 'price_usd',
}
dataset = dataset.rename(columns=COLUMN_RENAMES)

In [None]:
# Almost the entire dataset is read as the `object` dtype, so the numeric
# features are parsed here before any analysis is done.
import re

# A plain (optionally negative) decimal number; values that do not match are
# treated as missing.
pattern = r'^-?\d+(?:\.\d+)?$'


def _parse_number(value, regex, cast):
    """Return `cast(value)` when `value` matches `regex`, otherwise None.

    Thousands separators and surrounding whitespace are removed before
    matching, so a value such as "1,500" is parsed as 1500 instead of being
    silently discarded. This mirrors how the price column is handled.
    """
    text = str(value).replace(",", "").strip()
    return cast(text) if re.search(regex, text) else None


# `str(X)` keeps this line working when the cell is re-run after the price
# column has already been converted to integers.
dataset['price_usd'] = [int(str(X).replace(",", "")) for X in dataset['price_usd']]
dataset['engine_size_L'] = [_parse_number(X, pattern, float) for X in dataset['engine_size_L']]
dataset['horsepower'] = [_parse_number(X, r'^[0-9]+$', int) for X in dataset['horsepower']]
dataset['torque'] = [_parse_number(X, r'^[0-9]+$', int) for X in dataset['torque']]
dataset['acceleration_seconds'] = [_parse_number(X, pattern, float) for X in dataset['acceleration_seconds']]

In [None]:
# Summary statistics of the target after it has been converted to integers
dataset['price_usd'].describe()

In [None]:
# Group by car make and count the number of rows (cars) per make
car_counts = dataset.groupby('car_make').size().reset_index(name='counts')

# Optional filter: keep only makes with at least 10 cars
#car_counts = car_counts[car_counts['counts'] >= 10]

# Create the interactive bar chart. The keys of `labels` must be column names
# of `car_counts` ('car_make', 'counts') for plotly to apply the axis labels.
fig = px.bar(car_counts, x='car_make', y='counts', title='Number of cars produced by Carmake',
             labels={'car_make': 'Carmake', 'counts': 'Number of cars'})

# Show the plot
fig.show()

The `fast_eda` function from the `fasteda` package provides a quick and efficient way to perform exploratory data analysis (EDA) on large datasets. It is designed to simplify the EDA process by providing a set of functions and tools that allow users to quickly generate descriptive statistics, visualize data, and identify potential issues in the data.

In [None]:
# Run a fast exploratory data analysis on the cleaned dataset
fast_eda(dataset)

In [None]:
# Keep the column names as a NumPy array so they can be sliced by position later
columns = dataset.columns.to_numpy(copy=True)

In [None]:
# Plot a regression plot of each feature against the price to understand the
# relationship between the features and the target variable.
# `columns[2:]` skips the first two columns (presumably car_make and
# car_model -- verify against the CSV column order). The target itself is
# excluded as well, because plotting price_usd against price_usd carries no
# information.
feature_columns = [col for col in columns[2:] if col != 'price_usd']
for col in feature_columns:
    sns.regplot(x=col, y='price_usd', color = 'navy',data =dataset)
    plt.show()

In [None]:
import category_encoders as ce

# Target encoding is used for the categorical variables instead of label or
# one-hot encoding because it gave better results in this notebook.
# NOTE(review): the encoder is fitted on the whole dataset together with the
# full target, and the train/test split only happens in a later cell, so the
# encoded values of the test rows are computed from their own prices
# (target leakage). Consider fitting the encoder on the training split only.
target = dataset[['price_usd']]
train = dataset.drop(columns=['price_usd'])

# Every column still stored as `object` is treated as categorical
cat_features = dataset.select_dtypes(include=['object']).columns
encoders = ce.TargetEncoder(cols=cat_features)
encoded_features = encoders.fit_transform(train, target)

In [None]:
from scipy.stats import skew

# Bring strongly skewed numeric features closer to a normal distribution by
# applying scipy's boxcox1p transform to them.
SKEW_THRESHOLD = 0.75   # features with |skew| above this value are transformed
BOXCOX_LAMBDA = 0.15    # lambda argument passed to boxcox1p

feature_dtypes = encoded_features.dtypes
numeric_features = feature_dtypes[feature_dtypes != "object"].index

# Skewness is computed per column on the non-missing values only
skewed_feats = (
    encoded_features[numeric_features]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness = skewness[skewness['Skew'].abs() > SKEW_THRESHOLD]
skewed_features = skewness.index

for feature in skewed_features:
    encoded_features[feature] = boxcox1p(encoded_features[feature], BOXCOX_LAMBDA)

In [None]:
# Impute the missing values using Multiple Imputation by Chained Equations (MICE).
# NOTE(review): the imputer is fitted on all rows before the train/test split
# performed in a later cell, so test rows influence the imputation model.
from fancyimpute import IterativeImputer
# Fixed random_state keeps the imputation reproducible between runs
imputer = IterativeImputer(random_state = 4)
encoded_imputed_features =  imputer.fit_transform(encoded_features)

In [None]:
# Wrap the imputed values back into a labelled DataFrame. Column names and
# index are taken from the frame that was fed to the imputer, so the labels
# stay correct regardless of where the target column sat in the original data.
dataset = pd.DataFrame(data = encoded_imputed_features,
                       columns = encoded_features.columns,
                       index = encoded_features.index)

In [None]:
# Hold out 20% of the rows as a test set; the seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset,
    target,
    test_size=0.2,
    random_state=4,
)

In [None]:
# Fit a plain linear regression on all training features
lm = LinearRegression()
lm.fit(X_train,Y_train)
# Recursive feature elimination wrapped around the linear model.
# NOTE(review): the fitted `rfe` object is not inspected in the cells shown
# here; the features dropped later are chosen from the OLS p-values instead.
rfe = RFE(lm)
rfe= rfe.fit(X_train,Y_train)

In [None]:
# Inspect the torque column after encoding, transformation and imputation
dataset.torque

In [None]:
def build_model(X,y):
    """Fit an OLS regression of `y` on `X` with an intercept and print its summary.

    Parameters
    ----------
    X : feature matrix; a constant column is added with `sm.add_constant`.
    y : response values.

    Returns
    -------
    The feature matrix including the added constant column. Note that the
    fitted model itself is not returned, only printed.
    """
    design_matrix = sm.add_constant(X)
    ols_results = sm.OLS(y, design_matrix).fit()
    print(ols_results.summary())
    return design_matrix
    
def checkVIF(X):
    """Return the variance inflation factor of every column of `X`.

    Parameters
    ----------
    X : DataFrame of features; `X.values` is passed to
        `variance_inflation_factor` once per column index.

    Returns
    -------
    DataFrame with the columns 'Features' and 'VIF' (rounded to two decimals),
    sorted by 'VIF' in descending order.
    """
    scores = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
    table = pd.DataFrame({'Features': X.columns, 'VIF': scores})
    table['VIF'] = table['VIF'].round(2)
    return table.sort_values(by="VIF", ascending=False)

In [None]:
# Fit and print an OLS summary using all training features. build_model
# returns the feature matrix with the added constant column, which is what
# X_train_new holds here (it is reassigned in a later cell).
X_train_new= build_model(X_train,Y_train)

With those results, the non-significant columns are removed from the linear regression model to check whether the R-squared stays almost the same with fewer variables. The following criterion was applied to choose the variables:

A numerical column is kept only if its p-value is less than 0.05.

After the analysis, the model is trained again:

In [None]:
# Remove the predictors judged non-significant from both splits
insignificant_features = ['year', 'torque']
X_train_new = X_train.drop(columns=insignificant_features)
X_test = X_test.drop(columns=insignificant_features)

In [None]:
# Refit and print the OLS summary on the reduced feature set. `stats` receives
# the feature matrix with the added constant (build_model's return value),
# not the fitted regression results.
stats = build_model(X_train_new,Y_train)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor configured with Huber loss, depth-2 trees and
# row subsampling; the seed keeps the fit reproducible.
gbr_params = dict(
    loss='huber',
    max_depth=2,
    n_estimators=4000,
    random_state=1,
    subsample=0.75,
)
model = GradientBoostingRegressor(**gbr_params)

In [None]:
# Fit on the reduced feature set. Y_train is a single-column DataFrame, so it
# is flattened into the 1-D target that scikit-learn regressors expect.
model.fit(X_train_new, Y_train.values.ravel())

In [None]:
# Evaluate the fitted `model` on the training and the test split
Y_pred_train = model.predict(X_train_new)
Y_pred = model.predict(X_test)

# Root mean squared error (same unit as the price)
rmse_train = np.sqrt(mean_squared_error(Y_train, Y_pred_train))
rmse_test = np.sqrt(mean_squared_error(Y_test, Y_pred))
print('RMSE train data: %.3f, RMSE test data: %.3f' % (rmse_train, rmse_test))

# Coefficient of determination
r2_train = r2_score(Y_train, Y_pred_train)
r2_test = r2_score(Y_test, Y_pred)
print('R2 train data: %.3f, R2 test data: %.3f' % (r2_train, r2_test))

In [None]:
from sklearn.linear_model import LassoCV,ElasticNet

# Lasso regression with cross-validation (cv=10) on the reduced feature set.
# Y_train is a single-column DataFrame, so it is flattened into a 1-D target.
lasso = LassoCV(cv=10, random_state=0).fit(X_train_new, Y_train.values.ravel())

# Evaluate model on train and test data
Y_pred = lasso.predict(X_test)
Y_pred_train = lasso.predict(X_train_new)

# The square root of the mean squared error is printed, so the metric is
# labelled RMSE (not MSE).
print('RMSE train data: %.3f, RMSE test data: %.3f' % (
np.sqrt(mean_squared_error(Y_train,Y_pred_train)),
np.sqrt(mean_squared_error(Y_test,Y_pred))))
print('R2 train data: %.3f, R2 test data: %.3f' % (
r2_score(Y_train,Y_pred_train),
r2_score(Y_test,Y_pred)))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees, trained on all available cores; the seed
# keeps the fit reproducible.
forest = RandomForestRegressor(n_estimators = 100,
                              random_state = 1,
                              n_jobs = -1)
# Y_train is a single-column DataFrame, so it is flattened into a 1-D target
forest.fit(X_train_new, Y_train.values.ravel())
forest_train_pred = forest.predict(X_train_new)
forest_test_pred = forest.predict(X_test)

# The square root of the mean squared error is printed, so the metric is
# labelled RMSE (not MSE).
print('RMSE train data: %.3f, RMSE test data: %.3f' % (
np.sqrt(mean_squared_error(Y_train,forest_train_pred)),
np.sqrt(mean_squared_error(Y_test,forest_test_pred))))
print('R2 train data: %.3f, R2 test data: %.3f' % (
r2_score(Y_train,forest_train_pred),
r2_score(Y_test,forest_test_pred)))