In [None]:
import numpy as np
import pandas as pd
# import visualizations package
import matplotlib.pyplot as plt
import seaborn as sns
# import profiling package
import ydata_profiling as pp

from scipy import stats
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import category_encoders as ce
# PowerTransformer: a family of power transformations used in statistics and data analysis to stabilize variance, make the data more normally distributed, or make the relationship between variables more linear
from sklearn.preprocessing import PowerTransformer 
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import het_white
import statsmodels.stats.api as sms
from statsmodels.stats import diagnostic as diag
from statsmodels.compat import lzip
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# Read the training CSV (path is relative to the notebook's working directory)
# and preview its first rows.
data=pd.read_csv('train-data.csv')
data.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Summary statistics of the raw frame's numeric columns.
data.describe()

In [None]:
# Show the five rows with the smallest 'Seats' values (ascending sort).
data.sort_values('Seats', ascending=True).head(5)

In [7]:
# Drop rows the notebook treats as invalid, using one combined boolean mask:
#   - odometer reading above 400000
#   - mileage recorded as the literal string '0.0 kmpl'
#   - zero seats
# Each condition is negated (rather than rewritten as <= / !=) so rows with
# missing values in these columns are kept, exactly as before.
over_driven = data['Kilometers_Driven'] > 400000
zero_mileage = data['Mileage'] == '0.0 kmpl'
zero_seats = data['Seats'] == 0.0
data = data[~over_driven & ~zero_mileage & ~zero_seats]

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Remove the first column (selected by position) together with 'Name' and
# 'New_Price', then preview the remaining columns.
unused_columns = [data.columns[0], 'Name', 'New_Price']
data = data.drop(columns=unused_columns)
data.head()

In [10]:
# Discard every row that still has a missing value, then take the last column
# of the cleaned frame as the regression target.
data = data.dropna()
y_train = data.iloc[:, -1]

In [None]:
# Categorical / ordinal feature columns, selected by position in `data`
# (positions 0, 1, 3, 4, 5, 9 -- presumably Location, Year, Fuel_Type,
# Transmission, Owner_Type and Seats, the names later cells reference; verify
# against data.columns).
# An explicit copy is taken because later cells assign into data_cat; modifying
# a selection of `data` can raise SettingWithCopyWarning and makes it unclear
# which object is actually being changed.
data_cat = data.iloc[:, [0, 1, 3, 4, 5, 9]].copy()
data_cat

In [12]:
# Numeric feature columns, selected by position in `data` (positions 2, 6, 7, 8).
# Copied explicitly because later cells overwrite its columns while parsing the
# unit-suffixed strings; writing into a bare slice of `data` can raise
# SettingWithCopyWarning.
data_num = data.iloc[:, [2, 6, 7, 8]].copy()

# `data` already holds the target as its last column (y_train was taken from
# data.iloc[:, -1]), so the target is deliberately NOT concatenated onto it
# again: doing so created two columns with the same name, and selecting that
# name after a groupby then returned both copies.

In [None]:
# For every categorical column, print the mean 'Price' of each of its levels.
for cat_col in data_cat.columns:
    mean_price = data.groupby(cat_col)['Price'].mean()
    print(mean_price)

In [None]:
# Display data_num; Mileage, Engine and Power are parsed into numbers in later cells.
data_num

In [None]:
# Each of these columns is a string of the form "<value> <unit>". Split on
# whitespace, keep the second token (the unit) and print how often each unit occurs.
for unit_col in ('Mileage', 'Engine', 'Power'):
    print(data_num[unit_col].str.split().str[1].value_counts())

In [16]:
# Multiplier this notebook applies to 'km/kg' readings so they share a scale
# with the 'kmpl' ones.
KM_PER_KG_FACTOR = 1.33


def _mileage_to_float(raw):
    """Strip the unit suffix from a mileage string and return it as a float.

    Values ending in 'km/kg' lose their last 6 characters and are multiplied by
    KM_PER_KG_FACTOR; every other value loses its last 5 characters
    (the ' kmpl' suffix) and is converted unchanged.
    """
    if str(raw).endswith('km/kg'):
        return float(raw[:-6]) * KM_PER_KG_FACTOR
    return float(raw[:-5])


# Parsed mileage values, in the same row order as data_num.
mil = [_mileage_to_float(raw) for raw in data_num.Mileage]

In [17]:
# Overwrite the raw Mileage strings with the parsed float values collected in `mil`.
data_num['Mileage']=mil

In [18]:
# Keep only the leading token of "<value> <unit>" strings, i.e. drop the unit.
for unit_col in ('Engine', 'Power'):
    data_num[unit_col] = data_num[unit_col].str.split().str[0]

In [19]:
# Some Power entries are the literal string 'null'. Coerce the column to
# numbers (unparseable entries become NaN) only to compute the mean, then
# substitute that mean for every 'null' entry.
power_as_number = pd.to_numeric(data_num['Power'], errors='coerce')
mean_power = np.mean(power_as_number)
data_num['Power'] = data_num['Power'].replace('null', mean_power)

In [20]:
# Engine and Power still hold strings at this point; cast both to float in one call.
data_num = data_num.astype({'Engine': float, 'Power': float})

In [None]:
# Check dtypes and non-null counts after the numeric conversions above.
data_num.info()

In [None]:
# Grid of six vertically stacked subplots, one per categorical column of
# data_cat, each showing how often every level of that column occurs.
fig, axes = plt.subplots(nrows=6, figsize=(25, 60))
for ax, cat_col in zip(axes, data_cat.columns):
    sns.countplot(x=data_cat[cat_col], ax=ax)

In [None]:
# Distribution of the target within each level of every categorical column,
# one violin plot per column on six vertically stacked axes.
fig, axes = plt.subplots(nrows=6, figsize=(25, 60))
for ax, cat_col in zip(axes, data_cat.columns):
    sns.violinplot(x=data_cat[cat_col], y=y_train, ax=ax)

In [None]:
# Scatter plot with a fitted regression line of the target against each
# numeric feature, one feature per row (four rows).
fig, axes = plt.subplots(nrows=4, figsize=(10, 30))
for ax, num_col in zip(axes, data_num.columns):
    sns.regplot(x=data_num[num_col], y=y_train, scatter_kws={'s': 10}, ax=ax)

In [25]:
# Merge the 'CNG' and 'LPG' fuel types into a single 'CNG/LPG' category.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the in-place form is
# chained assignment, which raises a FutureWarning in pandas 2.2 and does not
# update data_cat when copy-on-write is enabled.
data_cat['Fuel_Type'] = data_cat['Fuel_Type'].replace(to_replace=['CNG', 'LPG'], value='CNG/LPG')

In [None]:
# Ordinal code for each model year: 1998 -> 1, 1999 -> 2, ..., 2019 -> 22.
year_dict = {year: rank for rank, year in enumerate(range(1998, 2020), start=1)}
print(year_dict)

In [27]:
# Explicit value -> integer code tables for the ordered categorical columns.
owner_type_codes = {'First': 1, 'Second': 2, 'Third': 3, 'Fourth & Above': 4}
seat_codes = {0.0: 0, 2.0: 2, 4.0: 4, 5.0: 5, 6.0: 6, 7.0: 7, 8.0: 8, 9.0: 9, 10.0: 10}

ordinal_cols_mapping = [
    {"col": "Owner_Type", "mapping": owner_type_codes},
    {"col": "Seats", "mapping": seat_codes},
    {"col": "Year", "mapping": year_dict},
]

# Apply the mappings with category_encoders' OrdinalEncoder; columns without a
# mapping entry are handled by the encoder's defaults.
# NOTE(review): values absent from a mapping get the encoder's "unknown" code --
# confirm the tables cover every value present in the data.
encoder = ce.ordinal.OrdinalEncoder(mapping=ordinal_cols_mapping, return_df=True)
data_cat = encoder.fit_transform(data_cat)

In [None]:
# One-hot encoding, with the generated feature names used as column labels.
categoryVariableList = ['Location', 'Fuel_Type', 'Transmission']

# drop='first' removes one level per variable so the dummy columns are not
# perfectly collinear with the intercept.
ohe = OneHotEncoder(categories='auto', drop='first', handle_unknown='ignore')
ohe.fit(data_cat[categoryVariableList])

feature_labels = ohe.get_feature_names_out(categoryVariableList)
feature_arr = ohe.transform(data_cat[categoryVariableList]).toarray()

# Dense frame of dummy columns; it carries a fresh default index.
features = pd.DataFrame(data=feature_arr, columns=feature_labels)
features

In [None]:
# Display data_cat after the ordinal encoding step.
data_cat

In [30]:
# Put the dummy columns next to the remaining categorical features (after
# resetting data_cat's index so rows line up with `features`), then drop the
# original string columns they replace.
data_cat_reindexed = data_cat.reset_index(drop=True)
data_cat = pd.concat([data_cat_reindexed, features], axis=1).drop(columns=categoryVariableList)

In [None]:
# Display data_cat after the one-hot columns were attached.
data_cat

In [None]:
# Displays data_num with a fresh default index. The result is not assigned,
# so data_num itself keeps its original index.
data_num.reset_index(drop=True)

In [33]:
# Apply a Box-Cox transformation to the numeric features.
# Box-Cox is a family of power transformations that generalizes both the square
# root and the logarithm; the power parameter (lambda) of each column is
# estimated by maximum likelihood.
# NOTE(review): the transformer is fitted on all rows before the train/test
# split performed later, so held-out rows influence the fitted lambdas.

# Box-Cox requires strictly positive input; this small offset is added to every
# value so that zeros do not make the fit fail.
BOX_COX_OFFSET = 0.0000001

pt = PowerTransformer(method='box-cox')
data_num2 = pt.fit_transform(data_num + BOX_COX_OFFSET)
data_num_trans = pd.DataFrame(data_num2, columns=data_num.columns)

# Kept as the cell's final expression so the fitted lambdas are actually
# displayed; previously this table was built mid-cell and silently discarded.
pd.DataFrame({'cols': data_num.columns, 'box_cox_lambda': pt.lambdas_})

In [34]:
# Line the transformed numeric features, the encoded categorical features and
# the target up on a shared default index, then join them column-wise.
Y_train = y_train.reset_index(drop=True)
feature_frames = [frame.reset_index(drop=True) for frame in (data_num_trans, data_cat)]
X_train = pd.concat(feature_frames, axis=1)
model_data = pd.concat([X_train, Y_train], axis=1)

In [35]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# NOTE: X_train / Y_train are rebound to the training subset here, so re-running
# this cell on its own splits the already-split training data again.
X_train, X_test, Y_train, Y_test = train_test_split(X_train,Y_train,test_size=0.2,random_state=42)

In [None]:
# For every training feature draw three side-by-side diagnostics:
#   1. its distribution (density histogram with a KDE overlay),
#   2. a normal Q-Q plot,
#   3. a scatter plot with regression line against the target.
for col in X_train.columns:
    fig, (ax_dist, ax_qq, ax_reg) = plt.subplots(ncols=3, figsize=(18, 4))

    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # is its documented replacement for a histogram plus density curve.
    sns.histplot(X_train[col], kde=True, stat='density', ax=ax_dist)
    ax_dist.set_title(col)

    # probplot sets its own title, so the column name is applied afterwards.
    stats.probplot(X_train[col], dist='norm', plot=ax_qq)
    ax_qq.set_title(col)

    sns.regplot(x=X_train[col], y=Y_train, scatter_kws={'s': 10}, ax=ax_reg)
    ax_reg.set_title(col)

    plt.show()
    # Release the figure so that looping over many columns does not accumulate
    # open figures in memory.
    plt.close(fig)

In [None]:
# Display the training feature matrix produced by the split.
X_train

In [38]:
# Feature subset used for the statsmodels OLS fit: every training column
# except the four listed here.
excluded_features = ['Seats', 'Year', 'Fuel_Type_Diesel', 'Engine']
b = X_train.drop(columns=excluded_features)
a = b.columns

In [39]:
# Unlike sklearn's LinearRegression, which fits an intercept by default, statsmodels' OLS does not add one for us.
# Remember, we want to predict the price from our features.
# X_train holds our predictor variables, and y the variable being predicted.
# We therefore need to add the intercept (constant) column manually.

In [None]:
 # The add_constant function from statsmodels adds a constant term (intercept) to the exogenous variables (independent variables). 
 # This is required when performing linear regression using the Ordinary Least Squares (OLS) method.
# Design matrix: the selected features plus a constant (intercept) column.
# NOTE: despite its name, X_endog holds the exogenous (independent) variables;
# the name is kept unchanged because other cells refer to it.
X_endog = sm.add_constant(b)

# sm.OLS initializes an ordinary least squares model; nothing is estimated yet.
# The target is passed as a plain NumPy array. Series.to_numpy() is used
# because Series.ravel() is deprecated in pandas 2.2.
res = sm.OLS(Y_train.to_numpy(), X_endog)

# fit() estimates the coefficients. cov_type='HC1' requests
# heteroskedasticity-consistent (robust) standard errors, which remain valid
# when the error variance is not constant.
model = res.fit(cov_type='HC1')
model.summary()

In [None]:
# Annotated correlation matrix of the OLS feature subset.
fig, ax = plt.subplots(figsize=(18, 18))
feature_corr = b.corr()
sns.heatmap(feature_corr, annot=True, cmap='RdYlGn', ax=ax)
plt.show()

In [None]:
# Variance Inflation Factor (VIF) of each feature in `b`; VIF is used to detect
# multicollinearity in regression analysis.
#
# variance_inflation_factor regresses one column on all the others exactly as
# given and does not add an intercept itself. Without a constant column the
# auxiliary regressions use an uncentred R-squared and the reported VIFs are
# inflated, so the constant is added here and then left out of the report.
vif_exog = sm.add_constant(b)

vif_data = pd.DataFrame()
vif_data["feature"] = b.columns

# Columns are looked up by name so the result does not depend on where (or
# whether) add_constant inserted the constant column.
vif_data["VIF"] = [
    variance_inflation_factor(vif_exog.values, vif_exog.columns.get_loc(feature))
    for feature in b.columns
]

vif_data.sort_values('VIF', ascending=False)

In [None]:
# Display the feature subset used for the OLS fit.
b

In [44]:
# Residuals of the fitted OLS results object, reused by the diagnostics below.
residuals=model.resid

In [None]:
# Average residual of the fitted model, printed for inspection.
mean_residuals = np.mean(residuals)
print("Mean of Residuals {}".format(mean_residuals))

In [46]:
# The het_breuschpagan function conducts the Breusch-Pagan test, a test for
# heteroskedasticity in the residuals of a regression model.
# Heteroskedasticity occurs when the variance of the errors is not constant
# across all levels of the independent variables.
#
# The residuals are tested against the design matrix the model was actually
# fitted on (model.model.exog, constant included). Previously a matrix built
# from the full X_train was used, which contains columns that were excluded
# from the model and is inconsistent with the White test.
bp_test = het_breuschpagan(model.resid, model.model.exog)

In [47]:
# White's heteroskedasticity test on the fitted model's residuals, using the
# design matrix stored on the model (model.model.exog).
white_test = het_white(model.resid,  model.model.exog)

In [None]:
# Print each test's results as a dict keyed by statistic name:
# Breusch-Pagan first, then White.
labels = ['LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
for test_result in (bp_test, white_test):
    print(dict(zip(labels, test_result)))

In [None]:
# Goldfeld-Quandt test for heteroskedasticity.
# The regressors passed are the fitted model's own design matrix
# (model.model.exog) rather than the full X_train, which has no constant column
# and includes features that were dropped from the model.
# het_goldfeldquandt returns more values than there are names here; lzip pairs
# only the first two (F statistic and p-value).
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, model.model.exog)
lzip(name, test)

In [None]:
# Distribution of the residuals: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# documented replacement.
sns.histplot(residuals, kde=True, stat='density')

In [None]:
# Autocorrelation of the residuals for lags up to 40.
sm.graphics.tsa.plot_acf(residuals, lags=40)
plt.show()

In [None]:
# Partial autocorrelation of the residuals for lags up to 40.
sm.graphics.tsa.plot_pacf(residuals, lags=40)
plt.show()

In [None]:
# Smallest Ljung-Box p-value over the tested lags (lags=40); a small value
# indicates autocorrelation in the residuals at some lag.
min(diag.acorr_ljungbox(residuals , lags = 40)['lb_pvalue'])

In [None]:
# Display the full training feature matrix that the sklearn model below is fitted on.
X_train

In [55]:
# scikit-learn linear regression with default settings.
lr = LinearRegression()

In [None]:
# Fit on all training features (not only the subset `b` used for the statsmodels OLS).
lr.fit(X_train,Y_train)

In [57]:
# Predictions on the held-out split and on the training split, for comparison.
Y_pred = lr.predict(X_test)
Y_pred_train = lr.predict(X_train)

In [None]:
# Report the coefficient of determination (R^2) on both splits. The labels say
# "R^2" rather than "accuracy": r2_score is a regression goodness-of-fit
# measure, not a classification accuracy.
print("Test R^2:", r2_score(Y_test, Y_pred))
print("Train R^2:", r2_score(Y_train, Y_pred_train))

In [None]:
# Fitted coefficients, in the same order as the columns of X_train.
lr.coef_