In [None]:
import numpy as np 
import pandas as pd

# Visualization Libraries
import matplotlib.pyplot as plt
import matplotlib.patches as pcs
import seaborn as sns
import plotly.express as px


# ML Libraries
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the car listings from the Kaggle input directory (the path is Kaggle-specific).
data = pd.read_csv("/kaggle/input/cardata/cardata.csv")
# read_csv already returns a DataFrame; this wraps it in a second DataFrame object named df.
df = pd.DataFrame(data)
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for every column that is not of object dtype.
df.describe(exclude="object")

In [None]:
# Summary (count / unique / top / freq) for the object-dtype columns.
df.describe(include="object")

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not listed).
df[df.duplicated()]

In [None]:
# Show all "ertiga" (2016) and "fortuner" (2015) listings side by side
# (presumably the models surfaced by the duplicated() check -- confirm against its output).
dup1 = df[(df["Car_Name"] == "ertiga") & (df["Year"] == 2016)]
dup2 = df[(df["Car_Name"] == "fortuner") & (df["Year"] == 2015)]
# pd.concat is the public API; DataFrame._append is private and may change without notice.
duplicates = pd.concat([dup1, dup2], ignore_index=True).style.background_gradient(axis=0)
duplicates

In [None]:
# Keep only the first occurrence of each row and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)

# Summary of the non-object columns, transposed so each column is a row.
df.describe(exclude="object").T

In [None]:
# Summary of the object-dtype columns, transposed so each column is a row.
df.describe(include="object").T

In [None]:
# Selling price against each numeric feature, one panel per feature.
fig1, ax1 = plt.subplots(1, 3, figsize=(20, 6), dpi=80)
numeric_features = ['Year', 'Present_Price', 'Kms_Driven']
for panel, feature in zip(ax1, numeric_features):
    sns.scatterplot(data=df, x=feature, y='Selling_Price', ax=panel)

# Red ellipses around hand-picked points: (panel index, centre, width, height).
highlights = [
    (0, (2010, 35), 1, 2),
    (1, (93, 35), 6, 2),
    (2, (500000, 0), 35000, 2),
    (2, (78000, 35), 35000, 2),
]
for panel_idx, centre, width, height in highlights:
    ax1[panel_idx].add_patch(
        pcs.Ellipse(centre, width, height, color='red', fill=False, linewidth=2)
    )

# Shared look: grid drawn behind the data on a light-blue background.
for ax in ax1:
    ax.grid(True)
    ax.set_axisbelow(True)
    ax.set_facecolor('#e5eefd')

In [None]:
# Selling price against Fuel_Type, Seller_Type, Transmission and Owner, one panel each.
fig2, ax2 = plt.subplots(1,4 , figsize=(20, 5), dpi = 80)

sns.scatterplot(data=df, x='Fuel_Type', y= 'Selling_Price', ax=ax2[0])
sns.scatterplot(data=df, x='Seller_Type', y= 'Selling_Price', ax=ax2[1])
sns.scatterplot(data=df, x='Transmission', y= 'Selling_Price', ax=ax2[2])
sns.scatterplot(data=df, x='Owner', y= 'Selling_Price', ax=ax2[3])

#add Circles
# NOTE(review): each ellipse centre uses a one-element list holding the category label as
# its x coordinate; this presumably relies on matplotlib's categorical unit conversion and
# on the scatterplot having been drawn first -- confirm it renders where intended.
ax2[0].add_patch(pcs.Ellipse((["Diesel"], 34), 0.2,6,color='red',fill = False, linewidth=2))
ax2[1].add_patch(pcs.Ellipse((["Individual"], 16), 0.1,3,color='red',fill = False, linewidth=2))
ax2[2].add_patch(pcs.Ellipse((["Manual"], 35),0.1,3,color='red',fill = False, linewidth=2))

# Shared look: grid drawn behind the data on a light-blue background.
for ax in ax2:
    ax.grid(True)
    ax.set_axisbelow(True)
    ax.set_facecolor('#e5eefd')

In [None]:
# Look up the 2010 car priced above 30 -- the point circled in the Year scatter panel.
# NOTE(review): the Kms_Driven > 30 condition looks like it filters out almost nothing;
# confirm the intended threshold.
df[(df["Year"] == 2010) & (df["Selling_Price"] > 30) & (df["Kms_Driven"] >30)]

In [None]:
# How many rows fall in each level of Fuel_Type, Seller_Type, Transmission and Owner.
fig3, ax3 = plt.subplots(1, 4, figsize=(20, 5), dpi=80)

count_features = ['Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']
for ax, feature in zip(ax3, count_features):
    sns.countplot(data=df, x=feature, ax=ax)
    # Grid drawn behind the bars on a light-blue background.
    ax.grid(True)
    ax.set_axisbelow(True)
    ax.set_facecolor('#e5eefd')

In [None]:
# Print the frequency table of every column from position 5 onward, separated by a dashed rule.
# A plain loop replaces DataFrame.apply here: apply was used only for its print side effect
# and left a Series of None values as the cell's output.
for _, column in df.iloc[:, 5:].items():
    print(f'{column.value_counts()}', "\n", "-"*20)

In [None]:
# Car age relative to the newest model year in the data; the +1 makes the newest cars Age 1, not 0.
df["Age"] = ( df["Year"].max() - df["Year"] ) + 1

In [None]:
# Modelling frame: the features in a fixed order with the target (Selling_Price) last.
# .copy() makes Df independent of df, so the column re-assignments done on Df later
# operate on a real frame instead of a slice (avoids SettingWithCopy problems).
Df = df[['Age','Present_Price', 'Kms_Driven','Fuel_Type',
         'Seller_Type', 'Transmission', 'Owner','Selling_Price']].copy().reset_index(drop=True)
Df

In [None]:
# Label-encode the three categorical columns as small integers.
# Bracket assignment is used because attribute assignment (Df.col = ...) can silently
# create a plain attribute instead of a column. The explicit int cast keeps the dtype
# numeric regardless of how replace() handles object columns, and fails loudly if a
# category is missing from the mapping. Re-running the cell is a no-op.

# Fuel_Type
Df["Fuel_Type"] = Df["Fuel_Type"].replace({"Petrol": 2, "Diesel": 3, "CNG": 4}).astype(int)

# Seller_Type
Df["Seller_Type"] = Df["Seller_Type"].replace({"Dealer": 2, "Individual": 3}).astype(int)

# Transmission
Df["Transmission"] = Df["Transmission"].replace({"Manual": 2, "Automatic": 3}).astype(int)

In [None]:
# Preview the first rows of the modelling frame.
Df.head()

In [None]:
# Heatmap of the pairwise correlations between all Df columns, with each value printed in its cell.
fig4 = px.imshow(Df.corr(), text_auto=True,  aspect="auto")
fig4.show()

In [None]:
# Empty running tables of train/test metrics; error_score() adds one row to each per model fit.
MAE = pd.DataFrame(columns=["MAE_train","MAE_test"])
MSE = pd.DataFrame(columns=["MSE_train","MSE_test"])
R2 = pd.DataFrame(columns=["R2_train","R2_test"])

In [None]:
def _append_metric_row(table, row):
    """Return ``table`` with the dict ``row`` added as a new last row.

    Uses the public ``pd.concat`` instead of the private ``DataFrame._append``.
    An empty ``table`` is replaced by the one-row frame directly, so no empty
    frame is ever handed to ``pd.concat``.
    """
    new_row = pd.DataFrame([row], columns=table.columns)
    if table.empty:
        return new_row
    return pd.concat([table, new_row], ignore_index=True)


def error_score(y_test,y_train,y_predict_test,y_predict_train):
    """Report train/test MAE, MSE and R2 for one model fit and track them over time.

    Parameters
    ----------
    y_test, y_train : array-like
        True target values of the test and train partitions.
    y_predict_test, y_predict_train : array-like
        Model predictions for the corresponding partitions.

    Side effects
    ------------
    * Prints the six metrics, each rounded to 3 decimals.
    * Adds one row to each of the module-level tables ``MAE``, ``MSE`` and ``R2``.
    * Draws a three-panel line plot of those tables, i.e. the history of every
      call made since the tables were last reset.
    """
    global MAE
    global MSE
    global R2

    #calculate the errors and scores
    MAE_train = np.round( metrics.mean_absolute_error(y_train,y_predict_train), 3 )
    MAE_test = np.round( metrics.mean_absolute_error(y_test,y_predict_test), 3 )
    MSE_train = np.round( metrics.mean_squared_error(y_train,y_predict_train), 3 )
    MSE_test = np.round( metrics.mean_squared_error(y_test,y_predict_test), 3 )
    R2_train = np.round( metrics.r2_score(y_train,y_predict_train), 3 )
    R2_test = np.round( metrics.r2_score(y_test,y_predict_test), 3 )

    print("MAE train : " , MAE_train)
    print("MSE train : " , MSE_train)
    print("R2 Score train : " , R2_train,"\n")

    print("MAE test : " , MAE_test)
    print("MSE test : " , MSE_test)
    print("R2 Score test : " , R2_test,"\n")

    # Record this run in the running history tables.
    MAE = _append_metric_row(MAE, {"MAE_train": MAE_train , "MAE_test" : MAE_test})
    MSE = _append_metric_row(MSE, {"MSE_train": MSE_train , "MSE_test" : MSE_test})
    R2 = _append_metric_row(R2, {"R2_train": R2_train , "R2_test" : R2_test})

    #plot
    fig_1, ax_1 = plt.subplots(1,3, figsize=(12, 3), dpi = 80)
    for panel, table, label in zip(ax_1, (MAE, MSE, R2), ('MAE', 'MSE', 'R2')):
        sns.lineplot(data=table, ax=panel, markers=True)
        panel.grid(True)
        # Applied to all three panels; the third panel previously got a second
        # grid(True) call here instead, so its grid was drawn over the lines.
        panel.set_axisbelow(True)
        panel.set_facecolor('#e5eefd')
        panel.set_xlabel(label)
    plt.show()

In [None]:
def LinearRegressionModel(df,testSize,k,target="Selling_Price"):
    """Fit, cross-validate and evaluate a linear regression on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns plus the target column.
    testSize : float or int
        Passed to ``train_test_split`` as ``test_size``.
    k : int
        Number of folds for ``KFold`` cross-validation on the training part.
    target : str, default "Selling_Price"
        Name of the target column. Every other column is used as a feature,
        wherever the target sits in the frame.

    Side effects
    ------------
    * Sets the module-level ``X`` (scaled feature frame) and ``Y`` (target as
      an ``(n, 1)`` array).
    * Prints the cross-validation scores, the intercept and the coefficients.
    * Draws actual-vs-predicted scatter plots for train and test.
    * Calls ``error_score`` to print and record the metrics.
    """
    #normalizing the data
    global X
    global Y

    # Features are selected by name. The previous positional "all but the last
    # column" rule would silently use the target as a feature if it was not last.
    col_list = [col for col in df.columns if col != target]
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling range.
    scaler = MinMaxScaler(feature_range=(2,5))
    norm = pd.DataFrame(scaler.fit_transform(df[col_list]), columns=col_list)

    X = norm
    Y = df[target].values.reshape(-1,1)

    #create model
    x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=testSize, random_state=0)
    model = LinearRegression()
    kfold_val = KFold(k)
    cv_score = cross_val_score(model, x_train,y_train, cv=kfold_val)
    print("cross validation score: ", cv_score,"\n")

    #predict
    model.fit(x_train, y_train)
    print("intercept: ",model.intercept_,"\nweight: ",model.coef_,"\n")

    y_predict_train = model.predict(x_train)
    y_predict_test = model.predict(x_test)
    compare_test = pd.DataFrame({"Actual_test": y_test.flatten() , "Prediction_test" : y_predict_test.flatten()})
    compare_train = pd.DataFrame({"Actual_train": y_train.flatten() , "Prediction_train" : y_predict_train.flatten()})

    #plot
    fig_2, ax_2 = plt.subplots(1,2, figsize=(11,4), dpi = 80)
    panels = (
        (ax_2[0], compare_train, 'Actual_train', 'Prediction_train', 'Actual train', 'Predicted train'),
        (ax_2[1], compare_test, 'Actual_test', 'Prediction_test', 'Actual test', 'Predicted test'),
    )
    for panel, frame, x_col, y_col, x_label, y_label in panels:
        sns.scatterplot(data=frame, x=x_col, y=y_col, ax=panel)
        panel.grid(True)
        panel.set_axisbelow(True)
        panel.set_facecolor('#e5eefd')
        panel.set_xlabel(x_label)
        panel.set_ylabel(y_label)
    plt.show()

    error_score(y_test,y_train,y_predict_test,y_predict_train)

In [None]:
# Work on a copy so the columns added in the following cells do not touch Df.
Df1 = Df.copy()

In [None]:
# Baseline on the original features: test_size 0.1, 6-fold cross-validation.
LinearRegressionModel(Df1,0.1,6)

In [None]:
# Add the first term : Kms_Driven * Fuel_Type
# Guarded so that re-running the cell does not fail: DataFrame.insert raises
# ValueError when the column already exists.
if 'Driven_Fuel' not in Df1.columns:
    Df1.insert(0,'Driven_Fuel',Df1["Kms_Driven"]*Df1['Fuel_Type'])

LinearRegressionModel(Df1,0.1,6)

In [None]:
# Add the second term : Present_Price * Age
# Guarded so that re-running the cell does not fail: DataFrame.insert raises
# ValueError when the column already exists.
if 'PPrice_Age' not in Df1.columns:
    Df1.insert(0,'PPrice_Age',Df1["Present_Price"]*Df1['Age'])

LinearRegressionModel(Df1,0.1,6)

In [None]:
# Add third term : Age**2
# Guarded so that re-running the cell does not fail: DataFrame.insert raises
# ValueError when the column already exists.
if "Age2" not in Df1.columns:
    Df1.insert(0, "Age2",Df1["Age"]**2)

LinearRegressionModel(Df1,0.1,6)

In [None]:
# Same frame, test_size raised to 0.2.
LinearRegressionModel(Df1,0.2,6)

In [None]:
# Same frame, test_size raised to 0.3.
LinearRegressionModel(Df1,0.3,6)

In [None]:
# Reset the running metric tables so the runs on Dff below are tracked from a clean slate.
MAE = pd.DataFrame(columns=["MAE_train","MAE_test"])
MSE = pd.DataFrame(columns=["MSE_train","MSE_test"])
R2 = pd.DataFrame(columns=["R2_train","R2_test"])

In [None]:
# Df.drop returns a new frame, so Df itself is untouched and no prior copy is needed
# (the earlier Df.copy() assignment was immediately overwritten).
# NOTE(review): 85 is a hard-coded index label -- confirm it still points at the intended
# row if the upstream cleaning steps change.
Dff = Df.drop(85).reset_index(drop=True) # It is identified as an outlier in the preprocessing part.

In [None]:
# Baseline on the original features without row 85: test_size 0.1, 6-fold cross-validation.
LinearRegressionModel(Dff,0.1,6)

In [None]:
# Add the first term : Kms_Driven * Fuel_Type
# Guarded so that re-running the cell does not fail: DataFrame.insert raises
# ValueError when the column already exists.
if 'Driven_Fuel' not in Dff.columns:
    Dff.insert(0,'Driven_Fuel',Dff["Kms_Driven"]*Dff['Fuel_Type'])

LinearRegressionModel(Dff,0.1,6)


In [None]:
# Add the second term : Present_Price * Age
# Guarded so that re-running the cell does not fail: DataFrame.insert raises
# ValueError when the column already exists.
if 'PPrice_Age' not in Dff.columns:
    Dff.insert(0,'PPrice_Age',Dff["Present_Price"]*Dff['Age'])

LinearRegressionModel(Dff,0.1,6)

In [None]:
# Add third term : Age**2
# Guarded so that re-running the cell does not fail: DataFrame.insert raises
# ValueError when the column already exists.
if "Age2" not in Dff.columns:
    Dff.insert(0, "Age2",Dff["Age"]**2)

LinearRegressionModel(Dff,0.1,6)

In [None]:
# Same frame, test_size raised to 0.2.
LinearRegressionModel(Dff,0.2,6)

In [None]:
# Same frame, test_size raised to 0.3.
LinearRegressionModel(Dff,0.3,6)

In [None]:
# Nine hand-written cars to score, with the categorical columns given as integer codes.
# Selling_Price holds a constant placeholder (1) in every row.
new_cars = {
    'Age':           [8, 5, 1, 13, 6, 2, 12, 10, 10],
    'Present_Price': [8.75, 10.35, 17.85, 25.25, 13.14, 10, 24.24, 18.85, 11.23],
    'Kms_Driven':    [7000, 10000, 13000, 25000, 20000, 20000, 20000, 85000, 42000],
    'Fuel_Type':     [2, 3, 2, 3, 2, 2, 2, 3, 4],
    'Seller_Type':   [3, 2, 2, 3, 3, 2, 2, 2, 3],
    'Transmission':  [3, 2, 3, 2, 3, 3, 3, 2, 2],
    'Owner':         [2, 3, 2, 4, 3, 1, 3, 2, 2],
    'Selling_Price': [1] * 9,
}
data = pd.DataFrame(new_cars)

# Derived columns, each pushed to the front in turn, so the final column order
# starts with Age2, PPrice_Age, Driven_Fuel.
engineered_terms = [
    ('Driven_Fuel', data["Kms_Driven"] * data['Fuel_Type']),
    ('PPrice_Age', data["Present_Price"] * data['Age']),
    ('Age2', data["Age"] ** 2),
]
for term_name, term_values in engineered_terms:
    data.insert(0, term_name, term_values)

In [None]:
# Stack the training rows (Dff) on top of the new samples so that both are
# rescaled by one and the same MinMaxScaler fit.
finall_df = Dff.copy()
final_df = pd.concat([finall_df, data], axis=0).reset_index(drop=True)

# Every column except the last one is treated as a feature.
colsList = final_df.columns[:-1].tolist()
scaler = MinMaxScaler(feature_range=(2,5))
norm = pd.DataFrame(
    scaler.fit_transform(final_df[colsList]),
    columns=colsList,
).reset_index(drop=True)

In [None]:
# norm holds the scaled Dff rows first, followed by the new samples, so the boundary
# between them is len(Dff). Deriving it replaces the hard-coded 298, which would
# misalign features and targets as soon as the cleaning steps change the row count.
n_train = len(Dff)
x_train=norm[:n_train]
y_train=Dff['Selling_Price'].values.reshape(-1,1)
x_test=norm[n_train:]

# Model
FinallModel = LinearRegression()
FinallModel.fit(x_train, y_train)
print("intercept: ",FinallModel.intercept_,"\nweight: ",FinallModel.coef_,"\n")

In [None]:
# Predicting Selling_Price
y_test = FinallModel.predict(x_test)

# Inplacing predicted Selling_Price into the samples dataframe
y_test = np.round(y_test,2)
y_test = pd.DataFrame(y_test,columns=["Selling_Price"])
# Assign by position (.values) so the result does not depend on the two frames
# happening to share the same index labels.
data['Selling_Price'] = y_test["Selling_Price"].values

# Return the data to the original form
# errors="ignore" keeps the cell re-runnable: on a second run the engineered columns
# are already gone and a plain drop would raise KeyError.
data = data.drop(columns=["Age2","PPrice_Age","Driven_Fuel"], errors="ignore")
# Bracket assignment always sets the column; decoding already-decoded labels is a no-op.
data["Fuel_Type"] = data["Fuel_Type"].replace({2:"Petrol",3: "Diesel",4: "CNG"})
data["Seller_Type"] = data["Seller_Type"].replace({2: "Dealer",3: "Individual"})
data["Transmission"] = data["Transmission"].replace({2: "Manual", 3: "Automatic"})

data