In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import pickle

%matplotlib inline
# Apply the 'ggplot' style sheet to the figures drawn in this notebook.
mpl.style.use('ggplot')

import warnings
# NOTE(review): with no category given, this silences every warning for the rest of
# the session (including deprecation notices) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw listings; the path is relative to the current working directory.
car=pd.read_csv('cotationprice.csv')

In [None]:
# Preview the first five rows of the raw frame.
car.head()

In [None]:
# (rows, columns) of the raw data.
car.shape

In [None]:
# Column dtypes and non-null counts of the raw data.
car.info()

##### Check for null values

In [None]:
# Number of missing values per column (isna is the same check as isnull).
car.isna().sum()

## Cleaning Data

#### year is an object. Change to integer

In [None]:
# Store the model year as a 64-bit integer column.
car = car.astype({'year': np.int64})

#### Check for null values

#### Strip non-digit characters from price, convert it to int64, and drop rows whose price is zero

In [None]:
# Normalise the raw price text into int64 values and discard unusable rows.
# The text clean-up only runs while the column is still non-numeric, so the cell
# can be re-run after the conversion without failing on the .str accessor.
if not pd.api.types.is_numeric_dtype(car['price']):
    # Keep only the digits (drops currency symbols, spaces and separators).
    price_digits = car['price'].str.replace(r'[^0-9]+', '', regex=True)
    # Entries left without any digit (or missing) become NaN instead of making
    # the integer cast raise.
    car['price'] = pd.to_numeric(price_digits, errors='coerce')

# Drop rows whose price is missing/unparseable or equal to zero.
car = car[car['price'].notna() & (car['price'] != 0)].copy()
car['price'] = car['price'].astype(np.int64)


In [None]:
# Re-check dtypes and non-null counts after the year/price conversions.
car.info()

#### gear_box, trim and energy have NaN values — drop those rows

In [None]:
# Keep only rows where gear_box, trim and energy are all present.
car = car.dropna(subset=['gear_box', 'trim', 'energy'])

In [None]:
# (rows, columns) remaining after the rows with missing values were removed.
car.shape

### check null values

In [None]:
# A cell only renders its last expression, so the null counts are shown
# explicitly; otherwise only the describe() table would appear.
display(car.isnull().sum())
car.describe()

#### Resetting the index of the final cleaned data

In [None]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column.
car=car.reset_index(drop=True)

## Cleaned Data

In [None]:
# Display the cleaned frame.
car

In [None]:
# Persist the cleaned data. The index was just reset to a plain 0..n-1 range and
# carries no information, so it is not written (avoids an extra unnamed column
# when the file is read back).
car.to_csv('Cleaned_Car_data.csv', index=False)

In [None]:
# Dtypes and non-null counts of the cleaned frame.
car.info()

In [None]:
# Missing values per column in the cleaned frame.
car.isna().sum()

### Checking relationship of Brand with Price

In [None]:
# Distinct values of each categorical column. Only the last expression of a cell
# is rendered automatically, so the first two are displayed explicitly.
display(car['brand'].unique())
display(car['model'].unique())
car['trim'].unique()

In [None]:
# Price distribution per brand, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(data=car, x='brand', y='price', ax=ax)
# Tilt the brand labels so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

In [None]:
# Disabled exploratory plot: swarm plot of price by year.
# plt.subplots(figsize=(20,10))
# ax=sns.swarmplot(x='year',y='price',data=car)
# ax.set_xticklabels(ax.get_xticklabels(),rotation=40,ha='right')
# plt.show()

In [None]:
# Disabled exploratory plot: price against gear_box.
# sns.relplot(x='gear_box',y='price',data=car,height=7,aspect=1.5)

In [None]:
# Disabled exploratory plot: box plot of price per gear_box.
# plt.subplots(figsize=(14,7))
# sns.boxplot(x='gear_box',y='price',data=car)

In [None]:
# Disabled exploratory plot: price per brand, coloured by gear_box and sized by year.
# ax=sns.relplot(x='brand',y='price',data=car,hue='gear_box',size='year',height=7,aspect=2)
# ax.set_xticklabels(rotation=40,ha='right')

### Extracting Training Data

In [None]:
# Predictor columns (in the order the pipeline will see them) and the target.
feature_cols = ['model', 'brand', 'gear_box', 'year', 'trim', 'energy']
X = car.loc[:, feature_cols]
y = car.loc[:, 'price']

In [None]:
# Display the feature frame.
X

In [None]:
# Number of target values.
y.shape

### Applying Train Test Split

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split — and the
# score computed from it — is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Creating an OneHotEncoder object to contain all the possible categories

In [None]:
# Fit an encoder on the full feature frame (not just the training split) so that
# its categories_ lists every value present in the data.
categorical_cols = ['model', 'brand', 'gear_box', 'trim', 'energy']
ohe = OneHotEncoder()
ohe.fit(X[categorical_cols])

#### Creating a column transformer to transform categorical columns

In [None]:
# One-hot encode the categorical columns using the category lists learned by `ohe`;
# any other column is passed through unchanged.
categorical_encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (categorical_encoder, ['model', 'brand', 'gear_box', 'trim', 'energy']),
    remainder='passthrough',
)

#### Linear Regression Model

In [None]:
# Linear regression estimator used as the final pipeline step.
lr=LinearRegression()

#### Making a pipeline

In [None]:
# Chain the column transformer and the regressor into a single estimator.
pipe=make_pipeline(column_trans,lr)

#### Fitting the model

In [None]:
# Fit the transformer and regressor on the training split.
pipe.fit(X_train,y_train)

In [None]:
# Predicted prices for the held-out rows.
y_pred=pipe.predict(X_test)

#### Checking R2 Score

In [None]:
# R² of the predictions against the true test prices.
r2_score(y_test,y_pred)

#### Trying several random states for train_test_split and recording the r2_score of each, to find the split with the highest score

In [None]:
# Refit the pipeline on five different 90/10 splits (seeds 0-4) and collect the
# test R² of each. The split variables, `lr`, `pipe` and `y_pred` are left holding
# the values from the last seed once the loop ends.
scores = []
for seed in range(5):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=seed
    )
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr).fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    scores += [r2_score(y_test, y_pred)]

In [None]:
# Position of the highest score; the loop used seeds 0..4, so this is also the seed.
np.argmax(scores)

In [None]:
# Highest R² among the tried splits.
scores[np.argmax(scores)]

In [None]:
# Column order expected by the pipeline.
X_test.columns

In [377]:
# Predict the price of one hand-written listing.
# The row is built from a list so each value keeps its own type; wrapping the mixed
# values in np.array() would coerce the integer year to a string.
sample = pd.DataFrame(
    [['308', 'PEUGEOT', 'Automatique', 2022,
      '1.5 BlueHDi 130ch S&S Active Business EAT6', 'Diesel']],
    columns=X_test.columns,
)
pipe.predict(sample)

array([24282.63665619])

#### Refit the model on the split whose random state gave the highest r2_score

In [378]:
# Rebuild the split that scored best in the seed search and refit on it.
best_state = np.argmax(scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=best_state
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
# R² of the refitted model on its own test split.
r2_score(y_test, y_pred)

0.9335474955171816

In [379]:
# Serialise the fitted pipeline. The context manager closes the file handle (and
# flushes it to disk) even if pickling raises.
with open('CarPricePredict.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [380]:
# Sanity-check the saved pipeline on one hand-written listing.
# The row is built from a list so each value keeps its own type; wrapping the mixed
# values in np.array() would coerce the integer year to a string.
sample = pd.DataFrame(
    [['308', 'PEUGEOT', 'Automatique', 2022,
      '1.5 BlueHDi 130ch S&S Active Business EAT6', 'Diesel']],
    columns=X_test.columns,
)
pipe.predict(sample)

array([24282.63665619])