In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Load the car-price dataset; the relative path means the CSV must sit in the
# notebook's working directory.
data = pd.read_csv("CarPrice_Assignment.csv")

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Count of missing values per column.
data.isnull().sum()

In [None]:
# Column names as a plain list, for reference when selecting features below.
data.columns.to_list()

In [None]:
# Derive the manufacturer from the first space-separated token of CarName,
# place it at column position 3, and retire the original CarName column.
# NOTE: this cell mutates `data` in place, so it can only be run once per load.
CompanyName = data['CarName'].map(lambda full_name: full_name.split(' ')[0])
data.insert(3, "CompanyName", CompanyName)
data.drop(columns=['CarName'], inplace=True)
data.head()

In [None]:
# Frequency of each manufacturer label; useful for spotting misspelled brands.
data["CompanyName"].value_counts()

In [None]:
# Normalise manufacturer names to lower case so case variants collapse together.
data['CompanyName'] = data['CompanyName'].str.lower()

def replace_name(a,b):
    """Replace every CompanyName equal to `a` with `b` in the module-level `data` frame.

    Parameters
    ----------
    a : value to look for in ``data['CompanyName']``.
    b : value written in its place.

    The result is assigned back to the column instead of calling
    ``replace(..., inplace=True)`` on the column selection: the chained in-place
    form operates on an intermediate object and is not guaranteed to update
    `data` (it warns in recent pandas and is a no-op under Copy-on-Write).
    """
    data['CompanyName'] = data['CompanyName'].replace(a, b)

# Misspelled or abbreviated brand labels paired with their canonical spelling.
corrections = [
    ('maxda', 'mazda'),
    ('porcshce', 'porsche'),
    ('toyouta', 'toyota'),
    ('vokswagen', 'volkswagen'),
    ('vw', 'volkswagen'),
]
for misspelt, canonical in corrections:
    replace_name(misspelt, canonical)

# Show the cleaned set of manufacturers.
data.CompanyName.unique()

In [None]:
# Distinct values of the engine-location category.
data['enginelocation'].unique()

In [None]:
# Column dtypes, to tell numeric predictors from categorical ones.
data.dtypes

In [None]:
# Object-dtype (string) columns. Here `cat_cols` is a DataFrame; the plotting cell
# below rebinds the same name to a plain list of column names.
cat_cols = data.select_dtypes(include="object")
cat_cols.columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Categorical columns to compare against price.
cat_cols = ['CompanyName', 'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem']

# One figure per categorical column: bar height is seaborn's aggregate of price
# for each category level.
for feature in cat_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=data, x=feature, y='price', ax=ax)
    ax.set_title(f'Bar Plot of {feature} vs. Price')
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    plt.xticks(rotation=45)  # tilt labels so long category names stay legible
    plt.show()


In [None]:
# Integer and float columns, i.e. the numeric candidates for the model.
num_cols=data.select_dtypes(include=['int64','float64'])
num_cols.columns

In [None]:
def pp(x,y,z):
    """Draw price against three columns of the module-level `data` as one row of scatter panels.

    x, y, z are column names of `data`; each becomes one panel with 'price' on
    the vertical axis. The figure is shown immediately.
    """
    predictors = [x, y, z]
    sns.pairplot(data, x_vars=predictors, y_vars='price', kind='scatter', height=4, aspect=1)
    plt.show()

# Numeric predictors, three per row, each plotted against price.
predictor_rows = [
    ('boreratio', 'enginesize', 'stroke'),
    ('compressionratio', 'horsepower', 'peakrpm'),
    ('wheelbase', 'citympg', 'highwaympg'),
]
for first, second, third in predictor_rows:
    pp(first, second, third)

In [None]:

def scatter(x,fig):
    """Scatter column `x` of the module-level `data` against price in one grid slot.

    x   : column name in `data`.
    fig : 1-based position within a 5x2 subplot grid on the current figure.
    """
    ax = plt.subplot(5, 2, fig)
    ax.scatter(data[x], data['price'])
    ax.set_title(x+' vs Price')
    ax.set_ylabel('Price')
    ax.set_xlabel(x)

# Dimension and weight columns against price, filling the first four slots of
# the 5x2 grid that scatter() lays out.
plt.figure(figsize=(10,20))

for position, feature in enumerate(['carlength', 'carwidth', 'carheight', 'curbweight'], start=1):
    scatter(feature, position)

plt.tight_layout()

In [None]:
# Keep only the columns carried into modelling: numeric predictors, the target
# (price) and the categorical predictors to be one-hot encoded.
d1=data[['symboling', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'horsepower', 'citympg', 'highwaympg',
       'price','fueltype','aspiration','carbody','drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem']]
varlist=['fueltype','aspiration','carbody','drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem']
# dtype=int is requested explicitly: newer pandas emits bool dummies by default,
# and bool columns next to float columns turn the design matrix into an object
# array, which statsmodels' OLS and variance_inflation_factor cannot process.
dummies = pd.get_dummies(d1[varlist], dtype=int)
# Swap the raw categorical columns for their indicator columns.
d1 = pd.concat([d1.drop(varlist, axis=1), dummies], axis=1)

d1.head()

In [None]:
from sklearn.model_selection import train_test_split

np.random.seed(0)
# 70/30 split with a fixed random_state for reproducibility. Each part is copied
# explicitly so the in-place column assignments made on df_train/df_test further
# down (the scaling steps) act on independent frames rather than on selections
# taken from d1, which is what triggers pandas' SettingWithCopyWarning.
df_train, df_test = (
    part.copy()
    for part in train_test_split(d1, train_size = 0.7, test_size = 0.3, random_state = 100)
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the continuous columns, target included. The scaler is fitted on
# the training rows only and reused unchanged for the test rows later.
num_vars = ['symboling', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'horsepower', 'citympg', 'highwaympg','price']
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(df_train[num_vars])
df_train[num_vars] = scaled_train

In [None]:
# Preview the training rows after scaling.
df_train.head()


In [None]:
# Summary statistics of the training frame after scaling.
df_train.describe()

In [None]:
# Pairwise correlations across every training column (dummies included), drawn
# large enough for the per-cell annotations to stay readable.
corr_matrix = df_train.corr()
plt.figure(figsize=(30, 25))
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
plt.show()

In [None]:
# Separate the target from the predictors. pop() removes 'price' from df_train
# in place, so X_train is the very same object as df_train, now without the
# target; re-running this cell without rebuilding df_train raises KeyError.
y_train = df_train.pop('price')
X_train = df_train

In [None]:
# (rows, predictor columns) available to feature selection.
X_train.shape

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Recursive feature elimination with a linear regression as the ranking
# estimator, keeping 20 predictors.
lm = LinearRegression()
lm.fit(X_train,y_train)
# n_features_to_select must be passed by keyword: current scikit-learn declares
# it keyword-only, so the positional form RFE(lm, 20) raises TypeError.
rfe = RFE(lm, n_features_to_select=20)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Display the fitted selector's configuration.
rfe

In [None]:
# Names of the predictors RFE kept (support_ is a boolean mask over the columns).
X_train.columns[rfe.support_]

In [None]:
# Restrict the training predictors to the columns RFE selected.
selected_columns = X_train.columns[rfe.support_]
X_train_rfe = X_train[selected_columns]
X_train_rfe.head()

In [None]:
def build_model(X,y):
    """Fit an OLS regression of `y` on `X` plus an intercept and print its summary.

    Parameters
    ----------
    X : predictor frame; a constant column is added via ``sm.add_constant``.
    y : response aligned with `X`.

    Returns
    -------
    The design matrix that was actually fitted (i.e. `X` after
    ``sm.add_constant``), NOT the fitted model; the model is only printed.
    """
    design = sm.add_constant(X)
    fitted = sm.OLS(y, design).fit()
    print(fitted.summary())
    return design
    
def checkVIF(X):
    """Tabulate the variance inflation factor of every column of `X`.

    Parameters
    ----------
    X : DataFrame of predictors. Every column is scored, so a constant column
        present in `X` gets a row as well.

    Returns
    -------
    DataFrame with columns 'Features' and 'VIF' (rounded to 2 decimals),
    sorted from highest to lowest VIF.
    """
    design = X.values
    scores = [variance_inflation_factor(design, idx) for idx in range(X.shape[1])]
    table = pd.DataFrame({'Features': X.columns, 'VIF': scores})
    table['VIF'] = table['VIF'].round(2)
    return table.sort_values(by="VIF", ascending=False)

In [None]:
# First fit on the RFE-selected predictors. build_model returns the design
# matrix with the intercept column added, which becomes X_train_new.
X_train_new = build_model(X_train_rfe,y_train)

In [None]:
# VIF table for the current design matrix (the intercept column is scored too).
checkVIF(X_train_new)

In [None]:
# Eliminate `curbweight` from the candidate predictors (presumably flagged by the
# preceding summary/VIF output; confirm against the cell results).
X_train_new = X_train_new.drop(columns=["curbweight"])


In [None]:
# Refit on the reduced predictor set and print the new summary.
X_train_new = build_model(X_train_new,y_train)

In [None]:
# Recompute VIFs after the removal.
checkVIF(X_train_new)

In [None]:
# Eliminate `carbody_convertible` (presumably flagged by the preceding
# summary/VIF output; confirm against the cell results).
X_train_new = X_train_new.drop(columns=["carbody_convertible"])


In [None]:
# Refit on the reduced predictor set and print the new summary.
X_train_new = build_model(X_train_new,y_train)

In [None]:
# Eliminate `enginetype_dohcv` (presumably flagged by the preceding summary;
# confirm against the cell results).
X_train_new = X_train_new.drop(columns=["enginetype_dohcv"])

In [None]:
# Refit on the reduced predictor set and print the new summary.
X_train_new = build_model(X_train_new,y_train)

In [None]:
# Recompute VIFs after the removal.
checkVIF(X_train_new)

In [None]:
# Eliminate `enginelocation_rear` (reason not visible here; see the preceding
# summary/VIF output) and refit.
X_train_new = X_train_new.drop(columns=["enginelocation_rear"])
X_train_new = build_model(X_train_new, y_train)

In [None]:
# Recompute VIFs after the removal.
checkVIF(X_train_new)

In [None]:
# Eliminate `enginetype_rotor`, refit, and show the updated VIF table.
X_train_new = X_train_new.drop(columns=["enginetype_rotor"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Eliminate `enginesize`, refit, and show the updated VIF table.
X_train_new = X_train_new.drop(columns=["enginesize"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Eliminate `enginetype_ohcv`, refit, and show the updated VIF table.
X_train_new = X_train_new.drop(columns=["enginetype_ohcv"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Eliminate `cylindernumber_twelve`, refit, and show the updated VIF table.
X_train_new = X_train_new.drop(columns=["cylindernumber_twelve"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Eliminate `aspiration_turbo`, refit, and show the updated VIF table.
X_train_new = X_train_new.drop(columns=["aspiration_turbo"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Eliminate `cylindernumber_three`, refit, and show the updated VIF table.
X_train_new = X_train_new.drop(columns=["cylindernumber_three"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Eliminate `cylindernumber_two`, refit, and show the updated VIF table.
X_train_new = X_train_new.drop(columns=["cylindernumber_two"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Eliminate `fuelsystem_spdi`, refit, and show the updated VIF table. This is the
# last elimination step; X_train_new is the final design matrix afterwards.
X_train_new = X_train_new.drop(columns=["fuelsystem_spdi"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Refit OLS on the final design matrix and keep the fitted model this time
# (build_model only prints it). X_train_new still carries the 'const' column
# that build_model added, so no add_constant call is needed here.
lm = sm.OLS(y_train,X_train_new).fit()
# In-sample predictions, used for the residual plot below.
y_train_price = lm.predict(X_train_new)

In [None]:
# Distribution of training residuals (actual minus predicted scaled price).
residuals = y_train - y_train_price
fig = plt.figure()
sns.distplot(residuals, bins=20)
fig.suptitle('Error Terms', fontsize=20)
plt.xlabel('Errors', fontsize=18)

In [None]:
# Apply the scaler fitted on the training rows to the same columns of the test
# rows (transform only, no refit). The assignment is in place, so running this
# cell twice would scale df_test twice.
num_vars = ['symboling', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'horsepower', 'citympg', 'highwaympg','price']
scaled_test = scaler.transform(df_test[num_vars])
df_test[num_vars] = scaled_test

In [None]:
# Summary statistics of the test frame after scaling.
df_test.describe()

In [None]:
# Separate the test target. pop() removes 'price' from df_test in place, so
# X_test is the same object as df_test, now without the target.
y_test = df_test.pop('price')
X_test = df_test

In [None]:
# Build the test design matrix from exactly the predictors of the final model.
# errors='ignore' keeps this cell re-runnable after 'const' has been removed once.
X_train_new = X_train_new.drop(columns='const', errors='ignore')
X_test_new = X_test[X_train_new.columns]
# has_constant='add' always prepends the intercept column. With the default
# ('skip'), add_constant returns the frame unchanged when some test column
# happens to be a non-zero constant, leaving the matrix one column short of what
# the fitted model expects.
X_test_new = sm.add_constant(X_test_new, has_constant='add')

In [None]:
# Predictions for the held-out rows, on the same scaled price scale as y_test.
y_pred = lm.predict(X_test_new)

In [None]:
from sklearn.metrics import r2_score 
# Coefficient of determination on the test split.
r2_score(y_test, y_pred)

In [None]:
# Actual vs. predicted scaled price on the test split.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20)
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('y_pred', fontsize=16)

In [None]:
# Full OLS summary of the final model (fitted on the training split).
print(lm.summary())

The model's R-squared value is 0.840.