# "Using Pipelines to preprocess and create a model"
> "Streamline your model processing"

- toc: true
- branch: master
- badges: true
- comments: true
- author: Hamel Husain & Jeremy Howard
- categories: [fastpages, jupyter]


# Import the libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later dropped the old aliases, so fall back to the new name when the
# original one is no longer shipped.
_plot_style = "seaborn-whitegrid"
if _plot_style not in plt.style.available:
    _plot_style = "seaborn-v0_8-whitegrid"
plt.style.use(_plot_style)

from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.preprocessing import (OrdinalEncoder, StandardScaler, 
                                   MinMaxScaler, PolynomialFeatures,
                                   PowerTransformer)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from scipy import stats
from scipy.stats import norm
from sklearn.linear_model import Lasso, RidgeCV
from sklearn.ensemble import (RandomForestRegressor, AdaBoostRegressor,
                             GradientBoostingRegressor, ExtraTreesRegressor)
from sklearn.ensemble import VotingRegressor

from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.svm import SVC 
from xgboost import XGBRegressor
%matplotlib inline

from bayes_opt import BayesianOptimization
from skopt import BayesSearchCV

import warnings
warnings.filterwarnings('ignore')

### Load the Dataset

In [None]:
# Locations of the competition files.
train_path = "../input/30-days-of-ml/train.csv"
test_path = "../input/30-days-of-ml/test.csv"
sample_path = "../input/30-days-of-ml/sample_submission.csv"

# Train and test frames are indexed by the `id` column; the sample
# submission keeps the default integer index.
X_full = pd.read_csv(train_path, index_col='id')
X_test_full = pd.read_csv(test_path, index_col='id')
Sample_result = pd.read_csv(sample_path)

#### EDA: Exploratory Data Analysis, used to gain better insight into the data

In [None]:
# Fit a normal distribution to the target so its parameters can be shown
# in the legend.
target_values = X_full['target']
mu, sigma = norm.fit(target_values)

plt.figure(figsize=(12, 6))
# Histogram and KDE of the target with the fitted normal curve overlaid.
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm
# the installed version still provides it.
sns.distplot(target_values, hist=True, kde=True, fit=norm)
plt.title('Target Distribution', fontsize=12)
plt.legend([f"mu:{mu}, sigma:{sigma}"], loc="best")
plt.show()

In [None]:
# Shape statistics of the target distribution, as computed by pandas.
target_skew = X_full['target'].skew()
target_kurt = X_full['target'].kurt()
print(f"Skewness: {target_skew}")
print(f"Kurtosis: {target_kurt}")

As a rule of thumb, skewness should lie in the range [-0.5, 0.5] and kurtosis in [-2, 2] for the distribution to be treated as approximately normal.

In [None]:
# Summary statistics for the training frame (rendered as the cell output).
X_full.describe()

### Feature Visualisation

In [None]:
# Plot one histogram per numeric feature.
# The target is excluded by name rather than by position, so the cell still
# selects the right columns if `target` is not the last numeric column or has
# already been removed from X_full by a later cell.
num_types = (X_full.select_dtypes(include=['int64', 'float64'])
             .columns.drop('target', errors='ignore'))

# Size the two-column grid from the number of features instead of
# hard-coding seven rows.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(num_types) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                         figsize=(10,25), squeeze=False)
flat_axes = axes.ravel()  # row-major order: left panel first, then right

for ax, feature in zip(flat_axes, num_types):
    X_full[feature].hist(ax=ax)
    ax.set_title(f"{feature}")

# Hide any panel left without a feature (e.g. an odd feature count).
for ax in flat_axes[len(num_types):]:
    ax.set_visible(False)

### Outlier Detection and Removal 

In [None]:
# Flag target values that lie more than three standard deviations from the mean.
target_col = X_full['target']
mean, std = target_col.mean(), target_col.std()
cut_off = std * 3
lower = mean - cut_off
upper = mean + cut_off

is_outlier = (target_col < lower) | (target_col > upper)
outliers = X_full[is_outlier]

print(f"Orginal Dataset size: {X_full.shape}")
# Removal is currently disabled, so the two sizes printed here are identical.
# To actually drop the flagged rows, re-enable:
#X_full.drop(outliers.index.to_list(), inplace=True)
print(f"Number of outliers: {len(outliers)}")
print(f"New size: {X_full.shape}")

### Feature Engineering
Feature engineering is one of the most important steps in any data science problem.   
It involves transforming the original raw data into a form that helps the model learn the target function.  
It can include dropping redundant features, transforming existing features, or adding new ones.  
To get an idea of how the features relate to the target, two popular tools are:
* Correlation
* Mutual Information  

While **correlation** only captures linear relationships, **mutual information** can be used for any kind of relationship. *Mutual Information* describes how much the presence of a given feature reduces the uncertainty about the target variable. MI is bounded below by 0 and has no upper bound. The variables are independent if their MI is 0.

In [None]:
# features = X_full.drop(['target'], axis=1, inplace=False)
# targets = X_full['target'].copy()

# for cols in features.select_dtypes("object"):
#     features[cols], _ = features[cols].factorize()
    
# discrete_features = features.dtypes == int

# mi_value = mutual_info_regression(features, targets, discrete_features=discrete_features)
# mi_value = pd.Series(mi_value, name="MI", index=features.columns)
# mi_value = mi_value.sort_values(ascending=True)
# width = np.arange(len(mi_value))
# ticks = list(mi_value.index)

# plt.figure(dpi=100, figsize=(8,5))
# plt.barh(width, mi_value)
# plt.yticks(width, ticks)
# plt.title("Mutual Information")

### Extract Features and Target

In [None]:
# List the column names of the training frame (rendered as the cell output).
X_full.columns

In [None]:
# Discard rows that have no target value before separating the label.
X_full.dropna(subset=['target'], inplace=True)
# `pop` returns the target column and removes it from X_full in one step,
# leaving only feature columns behind.
y = X_full.pop('target')

#### Taking the top N features

In [None]:
# Work on copies: later cells assign transformed columns into X / X_test,
# and plain aliases would silently overwrite the raw frames loaded from disk.
X = X_full.copy()
X_test = X_test_full.copy()

### Analyse the Dataset

In [None]:
# Report the training-set shape, then the per-column missing-value counts
# restricted to columns that actually contain missing values.
print(f"Shape of training data: {X.shape}")
missing_values = X.isna().sum()
has_missing = missing_values > 0
print(missing_values[has_missing])

### Preprocess the Input Data 

In [None]:
# Partition the feature columns by dtype: numeric columns are scaled and
# categorical columns are encoded by the preprocessing step further down.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'bool']

num_cols = X.select_dtypes(include=numeric_dtypes).columns
cat_cols = X.select_dtypes(include=categorical_dtypes).columns

#### Transform the skewed features to normal distribution

In [None]:
# Skewness of every numeric training column, most positively skewed first.
column_skew = X[num_cols].apply(lambda x: stats.skew(x)).sort_values(ascending=False)
# Keep only columns whose absolute skewness exceeds 0.75.
skewed_features = column_skew[column_skew.abs() > 0.75]
print(skewed_features)

# Apply log(1 + x) to the selected columns. The selection is made on the
# training data only; the same columns are then transformed in both frames.
for f in skewed_features.index:
    for frame in (X, X_test):
        frame[f] = np.log1p(frame[f])

In [None]:
# Print how many distinct values each categorical column takes.
print("Number of unique category for each categorical Feature")
for name, n_unique in X[cat_cols].nunique().items():
    print(f"{name}: {n_unique}")

In [None]:
# Per-column-group preprocessing: ordinal codes for the categorical columns,
# standard scaling for the numeric ones.
cat_transformer = OrdinalEncoder()
num_transformer = StandardScaler()

column_steps = [
    ('cat', cat_transformer, cat_cols),
    ('num', num_transformer, num_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

### Training

In [None]:
def train(model, n_splits=10):
    """Cross-validate `model` and return its fold-averaged test predictions.

    The model is placed behind the notebook-level `preprocessor` in a
    pipeline, which is refitted on each training fold of a shuffled K-fold
    split of the notebook-level `X` / `y`. Every fitted pipeline is scored on
    its held-out fold (RMSE, printed per fold) and used to predict `X_test`.

    Parameters
    ----------
    model : estimator
        Regressor used as the final pipeline step.
    n_splits : int, default=10
        Number of cross-validation folds. Also the divisor used when
        averaging the fold losses and the test predictions.

    Returns
    -------
    Predictions for `X_test`, averaged over all folds.
    """
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)])

    # Fixed seed so the fold assignment is reproducible between runs.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    ypred = 0
    total_loss = 0
    for train_indx, test_indx in cv.split(X):
        X_train, X_val = X.iloc[train_indx], X.iloc[test_indx]
        y_train, y_val = y.iloc[train_indx], y.iloc[test_indx]
        clf.fit(X_train, y_train)

        yhat = clf.predict(X_val)
        # RMSE is taken explicitly with np.sqrt: the `squared=False` flag of
        # mean_squared_error is deprecated/removed in newer scikit-learn.
        # Arguments follow the documented (y_true, y_pred) order.
        score = np.sqrt(mean_squared_error(y_val, yhat))
        print(f"Loss:{score}")
        # Each fold contributes an equal share to the averaged outputs.
        ypred += clf.predict(X_test) / n_splits
        total_loss += score / n_splits

    print(f"Avg. Loss: {total_loss}")
    return ypred
    

In [None]:
# Hyperparameters for the gradient-boosted tree regressor.
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU runtime -
# confirm before running on a CPU-only machine.
xgb_params = dict(
    n_estimators=10000,
    booster='gbtree',
    tree_method='gpu_hist',
    learning_rate=0.034682894846408095,
    subsample=0.9219010649982458,
    max_depth=3,
    colsample_bytree=0.11807135201147481,
    reg_alpha=36.043214512614476,
    random_state=1,
    reg_lambda=1.224383455634919,
    n_jobs=-1,
    min_child_weight=6,
)
model = XGBRegressor(**xgb_params)

# Cross-validate the model and collect the fold-averaged test predictions.
final_prediction = train(model)

### Final Submission

In [None]:
# Store the predictions in the sample-submission frame's `target` column and
# write it to disk without the pandas row index.
# NOTE(review): assumes the rows of Sample_result are in the same order as
# X_test - confirm against the competition files.
Sample_result['target'] = final_prediction 
Sample_result.to_csv("submission.csv", index=False)