In [1]:
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, f1_score
from sklearn import set_config
from sklearn import datasets
from sklearn import ensemble, neighbors,linear_model,svm
from scipy.stats import zscore
import eli5
set_config(display='diagram')

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

#datasets.load*?

In [2]:
def sklearn_to_df(sklearn_dataset):
    df = pd.DataFrame(sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    df['target'] = pd.Series(sklearn_dataset.target)
    return df

In [3]:
# df = sklearn_to_df(datasets.load_breast_cancer())
# target = 'target'
# drop_columns = []

In [4]:
# Input_df = pd.read_csv('input/pima-indians-diabetes.csv')
# target = 'class'
# drop_columns = ['skin','Pres']
# df = Input_df.drop(drop_columns, axis = 1)

In [5]:
Input_df.head()

Unnamed: 0,Preg,Plas,Pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
if df[target].dtypes == np.object:    
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).drop(target, axis=1).columns
else:
    numeric_features = df.select_dtypes(include=['int64', 'float64']).drop(target, axis=1).columns
    categorical_features = df.select_dtypes(include=['object']).columns
    
X = df.drop((target),axis = 'columns')
X_z_score = pd.DataFrame(zscore(X),columns = X.columns)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X_z_score , y, test_size=0.3,random_state =1)

In [7]:
#ProfileReport(df)

In [8]:
# train_values = pd.read_csv('train_values.csv')
# train_labels = pd.read_csv('train_labels.csv')
# train_data = train_values.merge(train_labels, left_on='building_id', right_on='building_id')

# train_data = train_data.drop('building_id', axis=1)
# numeric_features = train_data.select_dtypes(include=['int64', 'float64']).drop(['damage_grade'], axis=1).columns
# categorical_features = train_data.select_dtypes(include=['object']).columns
# X = train_data.drop('damage_grade', axis=1)
# y = train_data['damage_grade']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures,MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn import ensemble, neighbors,linear_model,svm
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    #('feature_selection',SelectFromModel(svm.LinearSVC(penalty = 'l1',dual = False))),
    #('scaler', StandardScaler()),
    ('scaler',MinMaxScaler()),
    ('poly', PolynomialFeatures(degree = 2,interaction_only = True)),
    ])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='most_frequent')),
    ('one_hot', OneHotEncoder()),
    ])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model',  ensemble.RandomForestRegressor(max_depth = 3))])
    
#model = pipe.fit(X_train, y_train)
pipe

In [10]:
model_list = [#regressors
                linear_model.LinearRegression(),linear_model.Lasso(),linear_model.Ridge(),svm.SVR(),
              
              #ensembling - regressors
                ensemble.RandomForestRegressor(max_depth = 3),ensemble.AdaBoostRegressor(),ensemble.GradientBoostingRegressor(),
    
              #classifiers
                linear_model.LogisticRegression(max_iter = 500), neighbors.KNeighborsClassifier(),svm.SVC(),             
    
              #ensembling - classifiers
                ensemble.RandomForestClassifier(n_estimators=500,max_depth = 3),ensemble.AdaBoostClassifier(learning_rate=0.5),ensemble.GradientBoostingClassifier(n_estimators = 200)

              ]

In [11]:
rejected_models = []
results = []
print('{:30s}Train  /  Test   /  Cross Validation'.format('Model'))
print('-----------------------------------------------------------------')
for model in model_list:
    pipe                 = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])

    try:        
        pipe.fit(X_train,y_train)

        model_name       = type(model).__name__
        train_results    = np.abs(pipe.score(X_train,y_train))
        test_results     = np.abs(pipe.score(X_test,y_test))
        cross_val_scores = np.abs(cross_val_score(pipe, X, y, cv=5)) 
        cval_score       = cross_val_scores.mean()

        results.append([model_name, train_results, test_results,cval_score])
        print('{:30s}{:.4f} / {:.4f}  /  {:.4f} '.format(model_name, train_results, test_results,cval_score))

    except:
        rejected_models.append(type(model).__name__)

df_results = pd.DataFrame(results, columns=["Model", "Train Results","Test Results",'Test Cross Validation Results'])

Model                         Train  /  Test   /  Cross Validation
-----------------------------------------------------------------
LinearRegression              0.3003 / 0.3391  /  0.2552 
Lasso                         0.0000 / 0.0032  /  0.0209 
Ridge                         0.2802 / 0.3659  /  0.2771 
SVR                           0.3055 / 0.3799  /  0.2751 
RandomForestRegressor         0.4064 / 0.3841  /  0.2948 
AdaBoostRegressor             0.3627 / 0.3096  /  0.2053 
GradientBoostingRegressor     0.7543 / 0.3665  /  0.2686 
LogisticRegression            0.7635 / 0.7835  /  0.7709 
KNeighborsClassifier          0.8194 / 0.7706  /  0.7448 
SVC                           0.7765 / 0.7922  /  0.7735 
RandomForestClassifier        0.8063 / 0.8095  /  0.7696 
AdaBoostClassifier            0.8250 / 0.7965  /  0.7631 
GradientBoostingClassifier    0.9944 / 0.8009  /  0.7565 


In [12]:
if rejected_models == []:
    print('No models rejected')
else:    
    print('Rejected Models list: ')
    for rej_models in rejected_models:
        print('  - ',rej_models)

No models rejected


In [14]:
def highlight_range(s):
    '''
    highlight the maximum in a Series green.
    '''
    is_max = s == s.max()
    #threshold = 0.95
    #val = s > threshold
    return ['background-color: lightgreen' if v else '' for v in is_max]

df_results.sort_values(by = 'Test Cross Validation Results',ascending = False)\
.style.apply(highlight_range,subset=pd.IndexSlice[:, ['Train Results', 'Test Results', 'Test Cross Validation Results']]).set_table_styles([{'selector':'','props':[('border','4px solid #7a7')]}])

Unnamed: 0,Model,Train Results,Test Results,Test Cross Validation Results
9,SVC,0.776536,0.792208,0.773491
7,LogisticRegression,0.763501,0.78355,0.770868
10,RandomForestClassifier,0.806331,0.809524,0.769578
11,AdaBoostClassifier,0.824953,0.796537,0.763093
12,GradientBoostingClassifier,0.994413,0.800866,0.75654
8,KNeighborsClassifier,0.819367,0.770563,0.744818
4,RandomForestRegressor,0.406443,0.38411,0.294776
2,Ridge,0.280154,0.36592,0.277095
3,SVR,0.305475,0.379944,0.275139
6,GradientBoostingRegressor,0.754279,0.366493,0.268555


### Feature Selection

In [None]:
# try:
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),('model',  ensemble.RandomForestRegressor(max_depth = 3))])
#     pipe.fit(X_train,y_train)
# except:
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),('model',  ensemble.RandomForestClassifier(max_depth = 3))])
#     pipe.fit(X_train,y_train)

# try:
#     onehot_columns = list(pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['one_hot']
#                           .get_feature_names(input_features=categorical_features))
# except:
#     onehot_columns = []

# numeric_features_list = list(numeric_features)
# numeric_features_list.extend(onehot_columns)
# eli5.explain_weights(pipe.named_steps['model'], top=10, feature_names=numeric_features_list)

### TroubleShoot

In [None]:
# rejected_models = []
# results = []
# print('{:30s}Train  /  Test   /  Cross Validation'.format('Model'))
# print('-----------------------------------------------------------')
# for model in model_list:
#     pipe                 = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])   
#     pipe.fit(X_train,y_train)
#     model_name       = type(model).__name__
#     train_results    = np.abs(pipe.score(X_train,y_train))
#     test_results     = np.abs(pipe.score(X_test,y_test))
#     cross_val_scores = np.abs(cross_val_score(pipe, X, y, cv=5)) 
#     cval_score       = cross_val_scores.mean()

#     print('{:30s}{:.4f} / {:.4f}  /  {:.4f} '.format(model_name, train_results, test_results,cval_score))

In [None]:
#sns.pairplot(df,diag_kind = 'kde',hue = target);

In [None]:
# def highlight_range(s):
#     '''
#     highlight the maximum in a Series green.
#     '''
#     is_max = s == s.max()
#     threshold = 0.1
#     val = s < threshold
#     return ['background-color: lightgreen' if v else '' for v in val]

# df.corr().style.apply(highlight_range)

## PCA

In [None]:
# from sklearn.decomposition import PCA

# X_scaled = StandardScaler(X)
# pca3 = PCA(n_components=6)
# pca3.fit(X)
# #print(pca3.components_)
# print(pca3.explained_variance_ratio_)
# Xpca3 = pca3.transform(X)

In [None]:
# plt.bar(list(range(1,7)),pca3.explained_variance_ratio_,alpha=0.5, align='center')
# plt.ylabel('Variation explained')
# plt.xlabel('eigen Value')
# plt.show()

In [None]:
# plt.step(list(range(1,7)),np.cumsum(pca3.explained_variance_ratio_), where='mid')
# plt.ylabel('Cum of variation explained')
# plt.xlabel('eigen Value')
# plt.show()