# Germany - Cars Price Prediction

## Main Objective

The main objective of this analysis is to **predict** the price of cars based on data gathered in Germany.

## Data and Feature Description

This dataset was downloaded from Kaggle. It contains a list of cars for sale in Germany: 46,405 rows with 9 features.

*   Number of Categorical Features - 5
*   Number of Numeric Features - 4



## Data Exploration and Data Cleaning

1. Dealing with null values
2. Age Feature to years old
3. EDA
4. Feature Correlation
5. Dealing with the most important features outliers (Price, HP and Mileage)
6. One Hot Encoded categorical features

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports and Data Upload

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport

import warnings

warnings.filterwarnings("ignore")

In [None]:
GCARS_PATH = '/kaggle/input/cars-germany'

def load_german_cars(gcars_path=GCARS_PATH):
  """Read the AutoScout24 Germany CSV located in `gcars_path`.

  Args:
    gcars_path: Directory that contains 'autoscout24-germany-dataset.csv'.
      Defaults to GCARS_PATH.

  Returns:
    The DataFrame produced by `pd.read_csv` for that file.
  """
  dataset_file = 'autoscout24-germany-dataset.csv'
  return pd.read_csv(os.path.join(gcars_path, dataset_file))

cars = load_german_cars()

In [None]:
%matplotlib inline

In [None]:
cars.head()

# Data Cleaning, Analysis and Feature Engineering

## Dealing with Null Values

In [None]:
cars.describe()

In [None]:
cars.head()

In [None]:
cars.isnull().values.any()

In [None]:
cars = cars.dropna()

In [None]:
cars.isnull().sum()

In [None]:
cars.shape[0]

In [None]:
cars = cars.drop_duplicates(keep='first')

In [None]:
cars.shape[0]

## Age Feature

In [None]:
from datetime import datetime

In [None]:
# Turn the registration year into the car's age in years, relative to the
# year in which the notebook is run, then discard the original column.
current_year = datetime.now().year
cars['age'] = current_year - cars['year']

cars.drop(columns=['year'], inplace=True)

cars.head()

## EDA

In [None]:
mediana = cars.price.median()
mediana

In [None]:
media = cars.price.mean()
media

In [None]:
# Count how many cars are priced below / above the median and the mean.
# The thresholds are the statistics computed above (`mediana`, `media`)
# rather than hard-coded numbers, so the counts stay correct if the data
# (or the cleaning steps before this cell) change.
abaixo_mediana = cars[cars['price'] < mediana]
q_abaixo_mediana = len(abaixo_mediana)

acima_mediana = cars[cars['price'] > mediana]
q_acima_mediana = len(acima_mediana)

print(f'Mediana = {mediana}')
print('Quantidade de carros com valores ACIMA da mediana')
print(q_acima_mediana)
print('Quantidade de carros com valores ABAIXO da mediana')
print(q_abaixo_mediana)
print('--------------------------------------------')

abaixo_media = cars[cars['price'] < media]
q_abaixo_media = len(abaixo_media)

acima_media = cars[cars['price'] > media]
q_acima_media = len(acima_media)

print(f'Media = {media}')
print('Quantide de carros com valores ACIMA da media')
print(q_acima_media)
print('Quantidade de carros com valores ABAIXO da media')
print(q_abaixo_media)

In [None]:
sns.scatterplot(x=cars['hp'], y=cars['price'])

In [None]:
# 3D scatter of price against age and horsepower on a large (20x15) figure.
fig = plt.figure(figsize=(20,15))
ax = plt.axes(projection='3d')

hp = cars['hp']
age = cars['age']
price = cars['price']

# Points are coloured by age (c=...) and drawn with marker size 1.
ax.scatter3D(age, hp, price, c=cars['age'], s=1)
ax.set_xlabel('Age')
ax.set_ylabel('HP')
ax.set_zlabel('Price')

In [None]:
# Collapse the fuel labels into four integer codes:
# 0 = Diesel, 1 = Gasoline, 2 = electric / hybrid variants, 3 = everything else.
fuel_codes = {'Diesel': 0, 'Gasoline': 1}
fuel_codes.update(dict.fromkeys(['Electric/Gasoline', 'Electric/Diesel', 'Electric'], 2))
fuel_codes.update(dict.fromkeys(['CNG', 'LPG', 'Others', '-/- (Fuel)', 'Ethanol', 'Hydrogen'], 3))
cars['fuel'] = cars['fuel'].replace(fuel_codes)

In [None]:
cars['fuel'].unique()

In [None]:
eletricos = cars.query('fuel == 2')
eletricos.head()

In [None]:
eletricos['price'].mean()

In [None]:
# Same 3D scatter of price against age and horsepower as drawn earlier, this
# time on a default-sized figure and with the default marker size.
fig = plt.figure()
ax = plt.axes(projection='3d')

hp = cars['hp']
age = cars['age']
price = cars['price']

# Points are coloured by age (c=...).
ax.scatter3D(age, hp, price, c=cars['age'])
ax.set_xlabel('Age')
ax.set_ylabel('HP')
ax.set_zlabel('Price')

## Correlation

In [None]:
plt.figure(figsize=(14,7))
# Correlation is only defined for numeric columns, and `cars` still holds
# text columns at this point (e.g. make, model). Select the numeric columns
# explicitly: pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr() and raises instead.
numeric_cars = cars.select_dtypes(include='number')
sns.heatmap(numeric_cars.corr(), annot=True, cmap='coolwarm')

## Outliers

### Price Outliers

In [None]:
sns.scatterplot(x=cars['hp'], y=cars['price'])

In [None]:
min_price, max_price = cars.price.quantile([0.01, 0.99])
min_price, max_price

In [None]:
cars[cars['price']>max_price].value_counts().sum()

In [None]:
cars[cars['price']>max_price]

In [None]:
cars[cars['price']<min_price].value_counts().sum()

In [None]:
cars[cars['price']<min_price]

In [None]:
# Keep only the cars whose price lies strictly between the lower and upper
# price quantiles computed above (min_price, max_price).
pop_cars = cars[(cars.price<max_price) & (cars.price>min_price)]
print('Total number of cars:')
print(cars.shape[0])
print('---------------------')
# Report the bounds that were actually applied instead of hard-coded figures,
# so the message cannot drift away from the filter.
print(f'Number of cars that are above ${min_price:,.0f} and below ${max_price:,.0f}')
print(pop_cars.shape[0])

In [None]:
sns.scatterplot(x=pop_cars['hp'], y=pop_cars['price'])

### Mileage Outliers

In [None]:
sns.scatterplot(x=pop_cars['hp'], y=pop_cars['price'])

In [None]:
sns.scatterplot(x=pop_cars['mileage'], y=pop_cars['price'])

In [None]:
# NOTE: despite their names, from here on `min_price` / `max_price` hold the
# 1st and 99th percentiles of `mileage`, overwriting the price bounds.
# They are computed on the full `cars` frame, not on the already
# price-filtered `pop_cars`.
min_price, max_price = cars.mileage.quantile([0.01, 0.99])
min_price, max_price

In [None]:
pop_cars = pop_cars[(pop_cars.mileage<max_price) & (pop_cars.mileage>min_price)]

In [None]:
sns.scatterplot(x=pop_cars['mileage'], y=pop_cars['price'])

### HP Outliers

In [None]:
sns.scatterplot(x=pop_cars['hp'], y=pop_cars['price'])

In [None]:
# NOTE: despite their names, from here on `min_price` / `max_price` hold
# quantiles of `hp`, computed on the full `cars` frame.
# The upper bound is the 99.9th percentile (0.999), unlike the 0.99 used for
# price and mileage -- NOTE(review): confirm this is intentional.
min_price, max_price = cars.hp.quantile([0.01, 0.999])
min_price, max_price

In [None]:
pop_cars = pop_cars[(pop_cars.hp<max_price) & (pop_cars.hp>min_price)]

In [None]:
sns.scatterplot(x=pop_cars['hp'], y=pop_cars['price'])

## Giving names to the feature Fuel again

In [None]:
# Map the integer fuel codes back to readable labels.
fuel_labels = {0: 'Diesel', 1: 'Gasoline', 2: 'Electric', 3: 'Others'}
pop_cars['fuel'] = pop_cars['fuel'].replace(fuel_labels)

In [None]:
pop_cars = pop_cars.reset_index(drop=True)

## One Hot Encoding and KFolds

In [None]:
# Brand and model names are left out of the feature set.
# (`columns=` already selects the axis, so no `axis` argument is needed.)
unused_columns = ['make', 'model']
pop_cars = pop_cars.drop(columns=unused_columns)
pop_cars.head()

In [None]:
pop_cars.dtypes.value_counts()

In [None]:
# Select the columns stored with the generic `object` dtype (text columns).
# `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# `object` is exactly what it aliased, so the comparison is unchanged.
mask = pop_cars.dtypes == object
categorical = pop_cars.columns[mask]
categorical

In [None]:
# Number of distinct values in each categorical column, largest first.
num_ohc_cols = pop_cars[categorical].nunique().sort_values(ascending=False)
num_ohc_cols

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Work on a copy so `pop_cars` keeps its original categorical columns.
data_ohc = pop_cars.copy()

ohc = OneHotEncoder()

# Replace every categorical column by one indicator column per category.
for col in num_ohc_cols.index:
  # fit_transform returns a sparse matrix (one column per category)
  new_dat = ohc.fit_transform(data_ohc[[col]])
  # drop the original column from the frame
  data_ohc = data_ohc.drop(col, axis=1)
  # categories found by the encoder; one entry because one column was fitted
  cats = ohc.categories_
  # name each indicator column '<feature>_<category>'; str() keeps the join
  # working when a category is not a string (e.g. integer codes)
  new_cols = ['_'.join([col, str(cat)]) for cat in cats[0]]
  # build the indicator frame on the SAME index as data_ohc: concat aligns on
  # the index, so a default 0..n-1 index would misalign rows (and introduce
  # NaNs) whenever data_ohc's index is anything else
  new_df = pd.DataFrame(new_dat.toarray(), columns=new_cols, index=data_ohc.index)
  # append the indicator columns
  data_ohc = pd.concat([data_ohc, new_df], axis=1)

# Split into target and features.
y_col = 'price'

feature_cols = [x for x in data_ohc.columns if x != y_col]

X = data_ohc[feature_cols]
y = data_ohc[y_col]

In [None]:
X.head()

In [None]:
from sklearn.model_selection import KFold

kf = KFold(shuffle=True, random_state=72018, n_splits=3)

kf.split(X)

`kf.split(X)` yields one `(train_index, test_index)` tuple for each of the 3 folds (`n_splits`).

In [None]:
for train_index, test_index in kf.split(X):
  print("Train index:", train_index[:10], len(train_index))
  print("Test index:", test_index[:10], len(test_index))
  print('')

# Modeling

## Linear regressions Without Regularization

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Plain linear regression, scored with R^2 on each held-out fold.
scores = []
lr = LinearRegression()

for train_index, test_index in kf.split(X):
  # KFold yields integer POSITIONS, so X and y are both indexed with .iloc.
  # `y[train_index]` would be a label lookup and only gives the right rows
  # when the index happens to be exactly 0..n-1.
  X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  lr.fit(X_train, y_train)

  y_pred = lr.predict(X_test)

  score = r2_score(y_test.values, y_pred)

  scores.append(score)

scores

## Linear Regression With Feature Scaling

Now let's do the same, but scaling our data as we go through the folds.

In [None]:
from sklearn.preprocessing import StandardScaler

# Linear regression on standardized features, scored with R^2 per fold.
scores = []
lr = LinearRegression()
s = StandardScaler()

for train_index, test_index in kf.split(X):
  # KFold yields integer POSITIONS, so X and y are both indexed with .iloc.
  # `y[train_index]` would be a label lookup and only gives the right rows
  # when the index happens to be exactly 0..n-1.
  X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # fit the scaler on the training fold only ...
  X_train_s = s.fit_transform(X_train)

  lr.fit(X_train_s, y_train)

  # ... and reuse those statistics on the held-out fold (no leakage)
  X_test_s = s.transform(X_test)

  y_pred = lr.predict(X_test_s)

  score = r2_score(y_test.values, y_pred)

  scores.append(score)

scores

## Linear Regression with Scaling, Using a Pipeline and cross_val_predict

Do the same as above, this time using a Pipeline.

In [None]:
from sklearn.pipeline import Pipeline

# Chain the scaler and the linear model so that both are fitted together.
pipeline_steps = [('scaler', s), ('linear_reg', lr)]
estimator = Pipeline(steps=pipeline_steps)

# X_train / y_train / X_test are whatever the last K-fold iteration left behind.
estimator.fit(X_train, y_train)

estimator.predict(X_test)

In [None]:
kf

In [None]:
from sklearn.model_selection import cross_val_predict

predictions = cross_val_predict(estimator, X, y, cv=kf, verbose=100)

In [None]:
len(predictions)

In [None]:
r2_score(y, predictions)

In [None]:
np.mean(scores)

We can see that this is almost the same: plain Linear Regression doesn't change much when the features are scaled.

## Linear Regression with Polynomial Regularization

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold

# Cross-validated R^2 of a linear regression fitted on polynomial expansions
# of the features; one score per degree in `polis`.
polis = [2, 3, 4]
lr1 = LinearRegression()
scores = []

for poli in polis:
  pf = PolynomialFeatures(degree=poli)
  poly_steps = [('make_higher_degree', pf), ('linear_reg', lr1)]
  estimator = Pipeline(steps=poly_steps)

  # out-of-fold predictions for every row, then a single R^2 over all of them
  predictions = cross_val_predict(estimator, X, y, cv=kf, verbose=100)
  score = r2_score(y, predictions)
  scores.append(score)

In [None]:
list(zip(polis, scores))

## Lasso with Regularization

In [None]:
import numpy as np

# Ten regularisation strengths, log-spaced from 1e-9 to 1.
alphas = np.geomspace(1e-9, 1e-0, num=10)
alphas

from sklearn.linear_model import Lasso


scores_lasso = []
coefs = []

# Cross-validated R^2 of a scaled Lasso regression for each alpha.
for alpha in alphas:
  las = Lasso(alpha=alpha, max_iter=100000)

  estimator = Pipeline([('scaler', s), ('lasso_regression', las)])

  predictions = cross_val_predict(estimator, X, y, cv=kf, verbose=100)

  score = r2_score(y, predictions)

  # Collect into this cell's own list. Appending to `scores` would extend the
  # results left over from the previous experiment and misalign them with
  # `alphas`.
  scores_lasso.append(score)

# The following cell zips `alphas` with `scores`, so expose the Lasso results
# under that name as well.
scores = scores_lasso

In [None]:
list(zip(alphas, scores))

In [None]:
Lasso(alpha=1e-6).fit(X,y).coef_

In [None]:
Lasso(alpha=1).fit(X,y).coef_

## Lasso with Polynomial

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold

# Cross-validated R^2 of Lasso on scaled degree-3 polynomial features,
# one score per alpha.
pf = PolynomialFeatures(degree=3)

scores = []
alphas = np.geomspace(9, 12, 4)
# alphas = np.geomspace(0.001, 10, 5)

for alpha in alphas:
  las = Lasso(alpha=alpha, max_iter=10000)
  lasso_steps = [
      ('make_higher_degree', pf),
      ('scaler', s),
      ('lasso_regression', las),
  ]
  estimator = Pipeline(steps=lasso_steps)

  # out-of-fold predictions for every row, then a single R^2 over all of them
  predictions = cross_val_predict(estimator, X, y, cv=kf, verbose=100)
  score = r2_score(y, predictions)
  scores.append(score)

In [None]:
alphas

In [None]:
scores

In [None]:
import matplotlib.pyplot as plt

plt.semilogx(alphas, scores)

Lasso with the best estimator from above

In [None]:
best_estimator = Pipeline([('make_higher_degree', PolynomialFeatures(degree=2)), ('scaler', s), ('lasso_regression', Lasso(alpha=10, max_iter=100000))])

best_estimator.fit(X,y)
best_estimator.score(X,y)

In [None]:
best_estimator.named_steps['lasso_regression'].coef_

In [None]:
plt.figure(figsize=(10,6))
plt.semilogx(alphas, scores, '-o')
plt.xlabel('alphas')
plt.ylabel('Rˆ2')

## Ridge

In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.linear_model import Ridge

# Cross-validated R^2 of Ridge on scaled degree-2 polynomial features,
# for 20 log-spaced alphas between 0.1 and 2.
pf = PolynomialFeatures(degree=2)
alphas = np.geomspace(0.1,2,20)
scores=[]

for alpha in alphas:
  ridge = Ridge(alpha=alpha, max_iter=100000)
  ridge_steps = [
      ('make_higher_degree', pf),
      ('scaler', s),
      ('ridge', ridge),
  ]
  estimator = Pipeline(steps=ridge_steps)

  # out-of-fold predictions for every row, then a single R^2 over all of them
  predictions = cross_val_predict(estimator, X, y, cv=kf, verbose=100)
  score=r2_score(y, predictions)
  scores.append(score)

In [None]:
plt.plot(alphas, scores)

In [None]:
best_estimator = Pipeline([('make_higher_degree', PolynomialFeatures(degree=2, include_bias=False)), ('scaler', s), ('lasso_regression', Lasso(alpha=10))])

best_estimator.fit(X,y)
best_estimator.score(X,y)

In [None]:
# Pair every polynomial feature name with its fitted Lasso coefficient.
poly_step = best_estimator.named_steps['make_higher_degree']
lasso_step = best_estimator.named_steps['lasso_regression']
# `get_feature_names` was removed in scikit-learn 1.2. Use its replacement
# `get_feature_names_out` when it exists and fall back on older versions.
if hasattr(poly_step, 'get_feature_names_out'):
  feature_names = poly_step.get_feature_names_out(X.columns)
else:
  feature_names = poly_step.get_feature_names(input_features=X.columns)
# Column 0 holds the feature name, column 1 its coefficient.
fs_importances = pd.DataFrame(list(zip(feature_names, lasso_step.coef_)))

In [None]:
fs_importances.sort_values(by=1)

## GridSearchCV

To do cross-validation, we used two techniques:



*   use KFolds and manually create a loop to do cross validation
*   Use  cross_val_predict and score in a couple of lines

To do Hyper-parameters tuning, we see a general pattern:

*   use cross_val_predict and score in a manually written loop over hyperparameters, then select the best one

Perhaps not surprisingly, there is a function that does this for us -- 

```
GridSearchCV()
```

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

# Pipeline whose hyper-parameters are searched: polynomial expansion,
# standardisation, then Ridge regression.
grid_steps = [
    ('polynomial_features', PolynomialFeatures()),
    ('scaler', s),
    ('ridge_regression', Ridge()),
]
estimator = Pipeline(steps=grid_steps)

# Grid keys follow the '<step name>__<parameter>' convention.
params = {
    'polynomial_features__degree': [1,2,3],
    'ridge_regression__alpha': np.geomspace(4, 20, 30),
}

grid = GridSearchCV(estimator, param_grid=params, cv=kf)

In [None]:
grid.fit(X,y)

In [None]:
grid.best_score_, grid.best_params_

In [None]:
y_predict = grid.predict(X)

In [None]:
r2_score(y, y_predict)

In [None]:
grid.best_estimator_.named_steps['ridge_regression'].coef_

In [None]:
pd.DataFrame(grid.cv_results_)

## Ridge CV

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(ytrue, ypredict):
  """Return the root mean squared error between `ytrue` and `ypredict`.

  Args:
    ytrue: Observed target values.
    ypredict: Predicted target values.

  Returns:
    The square root of `mean_squared_error(ytrue, ypredict)`.
  """
  mse = mean_squared_error(ytrue, ypredict)
  return np.sqrt(mse)

In [None]:
from sklearn.linear_model import RidgeCV

# Candidate regularisation strengths for RidgeCV's built-in cross-validation.
alphas = [0.001, 0.003, 0.005, 0.05, 0.1, 0.3, 1, 2, 5, 10]

# Fit on the existing train split, then measure RMSE on the test split.
ridgeCV = RidgeCV(alphas=alphas, cv=4)
ridgeCV.fit(X_train, y_train)

ridge_predictions = ridgeCV.predict(X_test)
ridgeCV_rmse = rmse(y_test, ridge_predictions)

print(ridgeCV.alpha_, ridgeCV_rmse)

## Lasso CV

In [None]:
from sklearn.linear_model import LassoCV

# Candidate regularisation strengths for LassoCV's built-in cross-validation.
alphas2 = [0.005, 0.05, 0.1, 0.3, 1, 2, 5, 10, 20, 50, 70, 100, 120]

# Fit on the existing train split, then measure RMSE on the test split.
lassoCV = LassoCV(alphas=alphas2, max_iter=100000, cv=4)
lassoCV.fit(X_train, y_train)

lasso_predictions = lassoCV.predict(X_test)
lassoCV_rmse = rmse(y_test, lasso_predictions)

print(lassoCV.alpha_, lassoCV_rmse)

## ElasticNetCV

In [None]:
from sklearn.linear_model import ElasticNetCV

# Search grid: regularisation strengths and L1/L2 mixing ratios (0.1 .. 0.9).
alphas3 = [0.05, 0.1, 0.3, 1, 2, 5, 10, 20, 50]
l1_ratios = np.linspace(0.1, 0.9, 9)

# Fit on the existing train split, then measure RMSE on the test split.
elasticNetCV = ElasticNetCV(alphas=alphas3, l1_ratio=l1_ratios, max_iter=100000, cv=4)
elasticNetCV.fit(X_train, y_train)

elastic_predictions = elasticNetCV.predict(X_test)
elasticNetCV_rmse = rmse(y_test, elastic_predictions)

print(elasticNetCV.alpha_, elasticNetCV.l1_ratio_, elasticNetCV_rmse)

You would choose between the three models above for different reasons:


*   LassoCV - Shrinks coefficients and performs feature selection
*   RidgeCV - If you want it to run quickly
*   ElasticNetCV - If you want a balance between the two

