Dear readers, this is the latest version of the Mercedes-Benz Greener Manufacturing notebook. You can run the code below in Python without changing anything. I copied this notebook from [Podsyp](http://www.kaggle.com/podsyp/mercedes-benz-greener-manufacturing). It is very well written and easy to understand. I'll try to touch on some points as well. The result is 0.55259.

# 0. Starts

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print one path per line.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Import

In [None]:
import pandas_summary as ps

# Data processing, metrics and modeling
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV, RidgeCV, HuberRegressor, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics


from scipy.stats import norm

# Lgbm
import lightgbm as lgb

# Support warnings
import warnings
warnings.filterwarnings("ignore")

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams

Since the dataset has a large number of features, we have to raise the pandas display limits so that all columns and rows are shown.

# 2. Options

In [None]:
# Raise both pandas display limits to 500 so wide frames are shown in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# Directory that holds the competition's zipped CSV files.
folder = '/kaggle/input/mercedes-benz-greener-manufacturing/'

# 3. Read CSV

In [None]:
# Load the train, test and sample-submission files (read in that order).
train_df, test_df, sub_df = (
    pd.read_csv(folder + file_name)
    for file_name in ('train.csv.zip', 'test.csv.zip', 'sample_submission.csv.zip')
)

# Report the shape of each loaded frame.
for label, frame in (('train_df: ', train_df), ('test_df: ', test_df), ('sub_df: ', sub_df)):
    print(label, frame.shape)

In [None]:
train_df.head()

We need to see which variables are categorical and which are numerical; later we will convert the categorical variables to numerical ones.

In [None]:
# Summarise the training frame and list the column names the summary
# classifies as categorical and as numeric.
dfs_train = ps.DataFrameSummary(train_df)
train_categoricals = dfs_train.categoricals.tolist()
train_numerics = dfs_train.numerics.tolist()
print('categoricals: ', train_categoricals)
print('numerics: ', train_numerics)
dfs_train.summary()

In [None]:
cat_cols = dfs_train.categoricals.tolist()

In [None]:
test_df.head()

In [None]:
# Summarise the test frame and list the column names the summary
# classifies as categorical and as numeric.
dfs_test = ps.DataFrameSummary(test_df)
test_categoricals = dfs_test.categoricals.tolist()
test_numerics = dfs_test.numerics.tolist()
print('categoricals: ', test_categoricals)
print('numerics: ', test_numerics)
dfs_test.summary()

# 4. Get target

In [None]:
ps.DataFrameSummary(train_df[['y']]).summary().T

In [None]:
# Plot the distribution of the raw target y with a fitted normal curve.
plt.figure(figsize=(12, 5))
sns.distplot(train_df['y'], fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train_df['y'])

# Now plot the distribution.
# Raw string: \mu and \sigma are mathtext commands for matplotlib, not Python
# escape sequences (a non-raw '\m' / '\s' is an invalid escape).
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('y distribution');

### Let's take the logarithm of the target y

In [None]:
train_df['y'] = np.log(train_df['y'])

In [None]:
ps.DataFrameSummary(train_df[['y']]).summary().T

In [None]:
# Plot the distribution of the log-transformed target with a fitted normal curve.
plt.figure(figsize=(12, 5))
sns.distplot(train_df['y'], fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train_df['y'])

# Now plot the distribution.
# Raw string: \mu and \sigma are mathtext commands for matplotlib, not Python
# escape sequences (a non-raw '\m' / '\s' is an invalid escape).
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('log(y) distribution');

# 5. Drop outliers by percentile

In [None]:
# Keep only the rows whose target lies strictly between the 0.5th and 99.5th
# percentiles of y (both bounds are computed before any row is removed).
lower_cut, upper_cut = np.percentile(train_df['y'], [0.5, 99.5])
target = train_df['y']
train_df = train_df[(target > lower_cut) & (target < upper_cut)]

In [None]:
# Plot the distribution of the log-transformed target after the percentile
# filter, with a fitted normal curve.
plt.figure(figsize=(12, 5))
sns.distplot(train_df['y'], fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train_df['y'])

# Now plot the distribution.
# Raw string: \mu and \sigma are mathtext commands for matplotlib, not Python
# escape sequences (a non-raw '\m' / '\s' is an invalid escape).
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('log(y) distribution after outlier removal');

In [None]:
# Split the target off the feature frame: pop returns the 'y' column and
# removes it from train_df in place.
y = train_df.pop('y')

In [None]:
# Very bad: as we can see, the distribution is not normal.

We have some categorical features (X0-X8) that we need to convert to numerical values using mean target encoding (MTE). Our target is not a dummy (binary) variable, so we cannot use the dummy option. For more information about MTE, read this [link](http://medium.com/@shailypa/target-encoding-cd3e9c14fcc).

## 6. Make mean target encoding for categorical feature

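For each category $i$ of a feature, the encoded value blends the category's target mean $Mean_i$ with the global target mean $GlobalMean$, where $Size$ is the number of rows in category $i$ and $C$ is the smoothing weight: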

$$ MeanTargetEnc_i = {((GlobalMean * C) + (Mean_i * Size)) \over (C + Size)} $$

Instead of the mean of the target, we can also use the median or other statistics. These are broadly called target encodings.

In [None]:
class MeanEncoding(BaseEstimator):
    """Smoothed mean target encoder for a single categorical column.

    Every category is replaced by a blend of the category's target mean and
    the global target mean::

        (global_mean * C + category_mean * category_size) / (C + category_size)

    Values that have no fitted encoding (categories unseen during ``fit`` and
    missing values) are encoded as the global target mean.

    Parameters
    ----------
    feature : column label
        Name of the column to encode.
    C : float, default=0.1
        Smoothing weight given to the global mean.

    Attributes
    ----------
    global_mean : float
        Mean of the target over the rows used for fitting.
    encoding : pandas.Series
        Encoded value per category, indexed by category.
    """

    def __init__(self, feature, C=0.1):
        self.C = C
        self.feature = feature

    def fit(self, X_train, y_train):
        """Learn the per-category encoding from ``X_train[feature]`` and ``y_train``.

        Rows where either the feature or the target is missing are ignored.

        Returns
        -------
        self
            The fitted encoder, so calls can be chained.
        """
        df = pd.DataFrame({'feature': X_train[self.feature], 'target': y_train}).dropna()

        self.global_mean = df.target.mean()
        per_category = df.groupby('feature').target
        mean = per_category.mean()
        size = per_category.size()

        self.encoding = (self.global_mean * self.C + mean * size) / (self.C + size)
        return self

    def transform(self, X_test):
        """Return a copy of ``X_test`` with the feature column encoded.

        The input frame is left untouched; values without a fitted encoding
        fall back to the global mean.
        """
        encoded = X_test.copy()
        # .values assigns by position, independent of the frame's index.
        encoded[self.feature] = (
            encoded[self.feature].map(self.encoding).fillna(self.global_mean).values
        )
        return encoded

    def fit_transform(self, X_train, y_train):
        """Fit on ``(X_train, y_train)`` and return the encoded copy of ``X_train``."""
        return self.fit(X_train, y_train).transform(X_train)

In [None]:
# Replace each categorical column by its smoothed target mean, learned from
# the training rows and applied to both the training and the test frame.
# NOTE(review): the encoder is fitted on all training rows before the
# train/validation split, so validation scores may be optimistic.
for col in cat_cols:
    encoder = MeanEncoding(col, C=0.99)
    encoder.fit(train_df, y)
    train_df, test_df = encoder.transform(train_df), encoder.transform(test_df)

In [None]:
train_df.head()

# 7. Cluster stratify split

In [None]:
# Cluster the target values into two groups; KMeans needs 2-D input, so the
# target Series is wrapped in a single-column frame.
y_frame = pd.DataFrame(y)
km = KMeans(n_clusters=2, random_state=13)
km.fit(y_frame)
y_clust = km.predict(y_frame)

In [None]:
pd.Series(y_clust).value_counts(normalize=True)

In [None]:
# 75/25 train/validation split of features, target and cluster labels,
# stratified on the cluster labels so both splits keep the same cluster mix.
split_inputs = (train_df, y, pd.Series(y_clust))
(X_train, X_val,
 y_train, y_val,
 y_train_clust, y_val_clust) = train_test_split(
    *split_inputs, test_size=0.25, stratify=y_clust, random_state=777
)

In [None]:
y_train_clust.value_counts(normalize=True)

# 8. Scaling

In [None]:
# Fit the scaler on the training split only, then apply the same scaling to
# the training, validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_sc, X_val_sc, test_df_sc = (
    pd.DataFrame(scaler.transform(frame)) for frame in (X_train, X_val, test_df)
)

# 9. Visualize our dataset

In [None]:
# Project the scaled training features onto two principal components.
pca = PCA(n_components=2).fit(X_train_sc)
train_pca_transformed = pca.transform(X_train_sc)

In [None]:
plt.figure(figsize=(10, 10))
plt.scatter(train_pca_transformed[:, 0], train_pca_transformed[:, 1], c=y_train_clust);

# 10. Models

In [None]:
# Cross-validated Lasso: fit on the scaled training split and report R^2 on
# the training and validation splits.
lasso = LassoCV(max_iter=9999).fit(X_train_sc, y_train)
lasso_train_pred, lasso_val_pred = (
    lasso.predict(features) for features in (X_train_sc, X_val_sc)
)
print(
    'train', metrics.r2_score(y_train, lasso_train_pred),
    'val', metrics.r2_score(y_val, lasso_val_pred),
)

In [None]:
# Cross-validated Ridge: fit on the scaled training split and report R^2 on
# the training and validation splits.
ridge = RidgeCV().fit(X_train_sc, y_train)
ridge_train_pred, ridge_val_pred = (
    ridge.predict(features) for features in (X_train_sc, X_val_sc)
)
print(
    'train', metrics.r2_score(y_train, ridge_train_pred),
    'val', metrics.r2_score(y_val, ridge_val_pred),
)

In [None]:
# Cross-validated ElasticNet: fit on the scaled training split and report R^2
# on the training and validation splits.
enet = ElasticNetCV().fit(X_train_sc, y_train)
enet_train_pred, enet_val_pred = (
    enet.predict(features) for features in (X_train_sc, X_val_sc)
)
print(
    'train', metrics.r2_score(y_train, enet_train_pred),
    'val', metrics.r2_score(y_val, enet_val_pred),
)

In [None]:
# Huber regression: fit on the scaled training split and report R^2 on the
# training and validation splits.
huber = HuberRegressor(alpha=0.05).fit(X_train_sc, y_train)
huber_train_pred, huber_val_pred = (
    huber.predict(features) for features in (X_train_sc, X_val_sc)
)
print(
    'train', metrics.r2_score(y_train, huber_train_pred),
    'val', metrics.r2_score(y_val, huber_val_pred),
)

In [None]:
# Random forest with 5 trees: fit on the scaled training split and report R^2
# on the training and validation splits.
# random_state is fixed so the forest, and therefore the printed scores, are
# reproducible from run to run.
rf = RandomForestRegressor(n_estimators=5, random_state=777)
rf.fit(X_train_sc, y_train)
rf_train_pred = rf.predict(X_train_sc)
rf_val_pred = rf.predict(X_val_sc)
print('train', metrics.r2_score(y_train, rf_train_pred), 'val', metrics.r2_score(y_val, rf_val_pred))

As you can see from the results, the random forest regression model performed better than the others.

# 11. predict

In [None]:
# Predict with the Lasso model, undo the log transform of the target with exp,
# and round to 4 decimals.
sub_df['y'] = np.exp(lasso.predict(test_df_sc)).round(4)

In [None]:
sub_df.head()

In [None]:
sub_df.to_csv('sub.csv', index=False)