In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt, seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split, KFold, cross_val_score,\
RandomizedSearchCV, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Collect the CSV files to load.
# - Only files ending in ".csv" are kept, because the loading cell passes every
#   entry to pd.read_csv and strips a 4-character extension from the name.
# - The list is sorted: os.walk yields entries in filesystem order, which is not
#   guaranteed to be stable, and the order decides both csv[0] and the row order
#   of the combined frame (and therefore the seeded train/test split).
# - The two "unclean" files are excluded explicitly.
# NOTE: the name `csv` is kept because later cells read it; it would shadow the
# stdlib `csv` module if that were ever imported.
excluded_files = {
    '/kaggle/input/used-car-dataset-ford-and-mercedes/unclean cclass.csv',
    '/kaggle/input/used-car-dataset-ford-and-mercedes/unclean focus.csv',
}

csv = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
    if filename.lower().endswith('.csv')
)
csv = [path for path in csv if path not in excluded_files]

csv

## Combining all the datasets into one, plus other data manipulations

In [None]:
# Read every file once, tag its rows with the file name (without extension) in a
# 'company' column, then concatenate in a single call.
# - One pd.concat over a list avoids re-copying the growing frame on every
#   iteration.
# - ignore_index=True gives the combined frame a unique 0..n-1 index; without it
#   each file contributes its own 0..k labels and the index contains duplicates.
frames = []
for path in csv:
    part = pd.read_csv(path)
    part['company'] = os.path.splitext(os.path.basename(path))[0]
    frames.append(part)

df = pd.concat(frames, axis=0, ignore_index=True)

df.head()

In [None]:
# Percentage of missing values in each column
missing_pct = df.isna().sum() * 100 / len(df)
missing_pct

In [None]:
# Fold 'tax(£)' into 'tax' before dropping it, so rows that only carry the value
# under the alternative header are not later filled with the median instead.
# NOTE(review): assumes 'tax(£)' holds the same quantity as 'tax' under a
# different column header - confirm against the source files.
# np.where works positionally, so it is unaffected by duplicated index labels.
# The membership check keeps the cell safe to re-run after the column is gone.
if 'tax(£)' in df.columns:
    df['tax'] = np.where(df['tax'].isna(), df['tax(£)'], df['tax'])
    df = df.drop(columns='tax(£)')

In [None]:
# Re-check the share of missing values per column after the drop
missing_pct = df.isna().sum() * 100 / len(df)
missing_pct

In [None]:
# Missing value imputation: fill 'tax' and 'mpg' with their column medians.
# NOTE(review): the medians are taken over the full frame, before the
# train/test split.
medians = {col: df[col].median() for col in ['tax', 'mpg']}
df = df.fillna(value=medians)

# Remaining percentage of missing values per column
df.isna().sum() * 100 / len(df)

In [None]:
# Categorical features: every object-dtype column, plus 'year', which is
# treated as a category here.
object_columns = df.select_dtypes(include=['object']).columns
cat_vars = [*object_columns, 'year']
cat_vars

## Skewness Treatment (dropping rare categories)

In [None]:
# Show the frequency of every level of each categorical feature
for col in cat_vars:
    level_counts = df[col].value_counts()
    print(level_counts, '\n\n')

In [None]:
# For each categorical feature, drop the rows whose level is rare.
# A level counts as rare when its frequency is below 0.3 * (number of distinct
# levels of that feature).
# NOTE(review): the cut-off scales with the number of levels, not the number of
# rows - confirm this is the intended rule.
for col in cat_vars:
    level_counts = df[col].value_counts()
    rare_levels = level_counts[level_counts < 0.3 * len(level_counts)].index
    keep_rows = ~df[col].isin(rare_levels)
    df = df[keep_rows]

In [None]:
# Level frequencies again, after the rare levels were removed
for col in cat_vars:
    level_counts = df[col].value_counts()
    print(level_counts, '\n\n')

In [None]:
# Dummy variable creation: one-hot encode every categorical feature, dropping
# the first level of each to avoid a redundant column.
df = pd.get_dummies(
    df,
    columns=cat_vars,
    prefix=cat_vars,
    drop_first=True,
)
df.head()

In [None]:
# Columns stored as int64 / float64 are treated as the numeric features
numeric_part = df.select_dtypes(include=['int64', 'float64'])
num_vars = numeric_part.columns
num_vars

## Treating outliers

In [None]:
# One box plot per numeric column, arranged on a 2x3 grid
numeric_frame = df[num_vars]
numeric_frame.plot.box(subplots=True, layout=(2, 3), figsize=(20, 8))
plt.show()

In [None]:
# Tukey-fence filtering: for each numeric column keep only the rows inside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The fences are computed on the frame as already
# filtered by the previous columns, so the outcome depends on column order.
for col in num_vars:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    df = df[df[col].between(lower, upper)]

# Box plots again to inspect the effect of the filtering
df[num_vars].plot.box(subplots=True, layout=(2, 3), figsize=(20, 8))
plt.show()

In [None]:
# Correlation heatmap of the numeric columns, showing only the lower triangle
# and the diagonal.
# The mask is built from booleans rather than from the correlation values, so a
# cell above the diagonal is hidden even when its correlation is exactly 0.
# The correlation matrix is computed once and reused.
corr_matrix = df[num_vars].corr()
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
sns.heatmap(corr_matrix, mask=mask, annot=True)
plt.show()

### There is no particularly high multicollinearity

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal)
numeric_frame = df[num_vars]
sns.pairplot(data=numeric_frame)
plt.show()

## Scaling and splitting of data

In [None]:
# 70% of the rows go to training, the rest to testing; the fixed seed keeps the
# split repeatable for a given row order.
split_frames = train_test_split(df, random_state=100, train_size=0.7)
train, test = split_frames

In [None]:
# Work on explicit copies: the frames returned by train_test_split are derived
# from df, and assigning columns into them directly triggers pandas'
# chained-assignment warning (currently hidden by the global warning filter).
train = train.copy()
test = test.copy()

# Fit the min-max scaler on the training rows only and reuse it for the test
# rows, so no test-set statistics leak into the scaling.
# NOTE(review): if 'price' is among num_vars, the target is scaled as well and
# every downstream error metric is expressed in scaled units - confirm intended.
scaler = MinMaxScaler()
train[num_vars] = scaler.fit_transform(train[num_vars])
test[num_vars] = scaler.transform(test[num_vars])

In [None]:
# Separate the predictors from the 'price' target for both splits
X_train, y_train = train.drop(columns='price'), train['price']
X_test, y_test = test.drop(columns='price'), test['price']

## Model Building
## Statsmodels Linear Regression

In [None]:
# Ordinary least squares with an explicit intercept column.
# The design matrix is cast to float first: with pandas >= 2.0, get_dummies
# produces bool columns, and a frame mixing bool and float columns converts to
# an object array, which statsmodels rejects with
# "Pandas data cast to numpy dtype of object".
X_add_const = sm.add_constant(X_train.astype(float))
model = sm.OLS(y_train, X_add_const).fit()
model.summary()

## Linear Regression

In [None]:
# 5-fold splitter (shuffled, fixed seed); the later cross-validation cells
# reuse this same object.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# Mean cross-validated R2 of a plain linear regression
lr = LinearRegression()
cv_score = cross_val_score(estimator=lr, X=X_train, y=y_train, scoring='r2', cv=folds)
np.mean(cv_score)

## Ridge Regression

In [None]:
# Mean cross-validated R2 of ridge regression with its default settings
ridge = Ridge()
cv_score = cross_val_score(estimator=ridge, X=X_train, y=y_train, scoring='r2', cv=folds)
np.mean(cv_score)

In [None]:
# Exhaustive search over the ridge penalty strength, scored by R2 on the shared
# folds; train scores are kept so they can be compared with the CV scores.
hyp = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 0.7],
}
grid_ridge = GridSearchCV(
    ridge,
    hyp,
    scoring='r2',
    cv=folds,
    n_jobs=-1,
    verbose=3,
    return_train_score=True,
)
grid_ridge.fit(X_train, y_train)

In [None]:
# Train vs. cross-validation R2 for every alpha tried by the grid search.
# The curves are labelled so they can be told apart, the axes are named, and
# alpha is drawn on a log scale because the grid spans several orders of
# magnitude (1e-4 .. 0.7).
grid_df = pd.DataFrame(grid_ridge.cv_results_)
alphas = grid_df['param_alpha'].astype(float)

plt.plot(alphas, grid_df['mean_train_score'], marker='o', label='mean train R2')
plt.plot(alphas, grid_df['mean_test_score'], marker='o', label='mean CV R2')
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('R2')
plt.legend()
plt.grid(alpha=0.7)
plt.show()

In [None]:
# Best mean CV score found by the search, and the estimator that achieved it
(grid_ridge.best_score_, grid_ridge.best_estimator_)

## Random Forest

In [None]:
# Mean cross-validated R2 of a random forest.
# random_state pins the bootstrap sampling and feature sub-sampling so the
# score (and the stacked model that reuses `rf`) is reproducible across runs;
# n_jobs=-1 only parallelises tree building and does not change the result.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
cv_score = cross_val_score(rf, X_train, y_train, cv=folds, scoring='r2')
cv_score.mean()

## Stacking Regressor

In [None]:
# Stack the tuned ridge model and the random forest; a linear regression
# combines their predictions as the final estimator.
lr = LinearRegression()
base_names = ('ridge', 'rf')
base_estimators = (grid_ridge.best_estimator_, rf)
models = list(zip(base_names, base_estimators))
st = StackingRegressor(estimators=models, final_estimator=lr)

In [None]:
# Mean cross-validated R2 of the stacked model on the shared folds
cv_score = cross_val_score(estimator=st, X=X_train, y=y_train, scoring='r2', cv=folds)
np.mean(cv_score)

In [None]:
# Fit the stacked model on the full training split, predict both splits, and
# report R2 on the held-out rows.
st.fit(X_train, y_train)
y_train_pred, y_test_pred = (st.predict(features) for features in (X_train, X_test))
r2_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Test-set R2 of the stacked model
test_r2 = r2_score(y_test, y_test_pred)
print('R2 =', test_r2)

In [None]:
# Residuals (actual - predicted) for both splits
residual_train = y_train - y_train_pred
residual_test = y_test - y_test_pred

# Overlaid residual distributions. Each histogram is labelled and a legend is
# drawn so the train and test curves can be told apart.
plt.figure(figsize=(20,8))
sns.histplot(residual_train, element='poly', color='crimson', kde=True, label='train residuals')
sns.histplot(residual_test, element='poly', color='green', kde=True, label='test residuals')
plt.title('Errors distribution', fontsize=20)
plt.xlabel('Error', fontsize=16)
plt.legend(fontsize=15)
plt.show()

# Residuals of the first 500 rows of each split plotted against their index
# label; the red line marks zero error.
plt.figure(figsize=(20,8))
plt.axhline(y=0, linewidth=3, color='red')
plt.scatter(residual_train.index[:500], residual_train.iloc[:500], label='train residuals')
plt.scatter(residual_test.index[:500], residual_test.iloc[:500], label='test residuals')
plt.xlabel('Index', fontsize=16)
plt.ylabel('Error', fontsize=16)
plt.legend(fontsize=15)
plt.show()

### Error terms are randomly distributed

In [None]:
# Actual vs. predicted target for the test rows, ordered by original index
# label and then renumbered from 0.
preds = (
    pd.DataFrame({'actual': y_test, 'pred': y_test_pred})
    .sort_index()
    .reset_index(drop=True)
)

# Plot only the first 200 rows to keep the two curves readable
first_rows = preds.iloc[:200]

plt.figure(figsize=(20, 8))
plt.plot(first_rows.index, first_rows['actual'], label='actual values')
plt.plot(first_rows.index, first_rows['pred'], label='predicted values')
plt.xlabel('Index', fontsize=16)
plt.ylabel('Price', fontsize=16)
plt.legend(fontsize=15)
plt.show()

In [None]:
# Root mean squared error on the test split (square root of the MSE)
test_mse = mean_squared_error(y_test, y_test_pred)
RMSE = test_mse ** 0.5
print('RMSE =', RMSE)