In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, f1_score
from sklearn import set_config
from sklearn import datasets
from sklearn import ensemble, neighbors,linear_model,svm
import eli5
set_config(display='diagram')



#datasets.load*?

In [2]:
def sklearn_to_df(sklearn_dataset):
    """Flatten a scikit-learn style dataset object into one DataFrame.

    The object's ``data`` becomes the frame body, with columns named after
    ``feature_names``; its ``target`` values are appended as a ``'target'``
    column.
    """
    features = pd.DataFrame(
        data=sklearn_dataset.data,
        columns=sklearn_dataset.feature_names,
    )
    labels = pd.Series(sklearn_dataset.target)
    return features.assign(target=labels)

In [3]:
# df = sklearn_to_df(datasets.load_breast_cancer())
# target = 'target'
# drop_columns = []

In [4]:
# Load the dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv('input/penguins.csv')
# Name of the column to predict; it is split off as `y` further down.
target = 'species'
# Columns to discard from `df` before modelling (none for this dataset).
drop_columns = []

In [5]:
# Show each column's dtype; the next cell groups feature columns by dtype.
df.dtypes

species               object
island                object
culmen_length_mm     float64
culmen_depth_mm      float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [6]:
# Remove the columns the user asked to discard before building features.
df = df.drop(drop_columns, axis=1)

# Separate the features from the target *first*, so the target can never leak
# into either feature list whatever its dtype is. This replaces the old
# `np.object` dtype check (that alias was removed from NumPy) and the
# per-branch `.drop(target)` calls, which raised KeyError for targets that
# were neither object nor int64/float64.
X = df.drop(target, axis='columns')
y = df[target]

# Column groups routed to the numeric / categorical preprocessing pipelines.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [7]:
#ProfileReport(df)

In [8]:
# train_values = pd.read_csv('train_values.csv')
# train_labels = pd.read_csv('train_labels.csv')
# train_data = train_values.merge(train_labels, left_on='building_id', right_on='building_id')

# train_data = train_data.drop('building_id', axis=1)
# numeric_features = train_data.select_dtypes(include=['int64', 'float64']).drop(['damage_grade'], axis=1).columns
# categorical_features = train_data.select_dtypes(include=['object']).columns
# X = train_data.drop('damage_grade', axis=1)
# y = train_data['damage_grade']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn import ensemble, neighbors,linear_model,svm
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score

# Numeric columns: fill gaps with the median, keep the features selected by an
# L1-penalised LinearSVC, standardise, then add pairwise interaction terms.
# NOTE(review): feature selection runs on unscaled values and L1 models are
# scale-sensitive; consider moving 'scaler' ahead of 'feature_selection'.
# Left in the original order here so the fitted models are not silently changed.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('feature_selection', SelectFromModel(svm.LinearSVC(penalty='l1', dual=False))),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True)),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    # Previously strategy='constant' with fill_value='most_frequent', which
    # inserted the literal string "most_frequent" as a brand-new category.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Ignore categories not seen during fit (e.g. rare levels missing from a
    # CV training fold) instead of raising at transform time.
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Placeholder pipeline, shown only for the diagram below; the evaluation loop
# rebuilds `pipe` with each candidate model.
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('model', ensemble.RandomForestRegressor(max_depth=3))])

#model = pipe.fit(X_train, y_train)
pipe

In [10]:
# Seed shared by every stochastic estimator below so that a fresh
# Restart & Run All reproduces the same scores.
MODEL_RANDOM_STATE = 1

# Candidate estimators. Regressors and classifiers are listed together; the
# evaluation loop keeps whichever ones can be fitted to the current target.
model_list = [
    # regressors
    linear_model.LinearRegression(),
    linear_model.Lasso(),
    linear_model.Ridge(),

    # ensembling - regressors
    ensemble.RandomForestRegressor(max_depth=3, random_state=MODEL_RANDOM_STATE),
    ensemble.AdaBoostRegressor(random_state=MODEL_RANDOM_STATE),
    ensemble.GradientBoostingRegressor(random_state=MODEL_RANDOM_STATE),

    # classifiers
    linear_model.LogisticRegression(max_iter=500),
    neighbors.KNeighborsClassifier(),
    svm.SVC(),

    # ensembling - classifiers
    ensemble.RandomForestClassifier(n_estimators=500, max_depth=3, random_state=MODEL_RANDOM_STATE),
    ensemble.AdaBoostClassifier(learning_rate=0.5, random_state=MODEL_RANDOM_STATE),
    ensemble.GradientBoostingClassifier(random_state=MODEL_RANDOM_STATE),
]

In [11]:
# Fit and score every candidate model with the shared preprocessor.
rejected_models = []     # names of estimators that failed to fit or score
rejection_reasons = {}   # estimator name -> repr() of the exception that rejected it
results = []             # one [name, train, test, cv-mean] row per successful estimator

print('{:30s}Train  /  Test   /  Cross Validation'.format('Model'))
print('-----------------------------------------------------------------')
for model in model_list:
    model_name = type(model).__name__
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    try:
        pipe.fit(X_train, y_train)

        # Scores are reported as-is. They were previously wrapped in np.abs,
        # which turned a negative R^2 (a model worse than predicting the mean)
        # into a seemingly good positive score.
        train_results = pipe.score(X_train, y_train)
        test_results = pipe.score(X_test, y_test)
        cval_score = cross_val_score(pipe, X, y, cv=5).mean()
    except Exception as exc:
        # Estimators that cannot handle this target (e.g. regressors on string
        # labels) are skipped on purpose, but the reason is kept for inspection.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt / SystemExit propagate.
        rejected_models.append(model_name)
        rejection_reasons[model_name] = repr(exc)
        continue

    results.append([model_name, train_results, test_results, cval_score])
    print('{:30s}{:.4f} / {:.4f}  /  {:.4f} '.format(model_name, train_results, test_results, cval_score))

df_results = pd.DataFrame(results, columns=["Model", "Train Results", "Test Results", 'Test Cross Validation Results'])

Model                         Train  /  Test   /  Cross Validation
-----------------------------------------------------------------
LogisticRegression            1.0000 / 1.0000  /  0.9942 
KNeighborsClassifier          0.9958 / 1.0000  /  0.9941 
SVC                           0.9917 / 1.0000  /  0.9942 
RandomForestClassifier        0.9792 / 0.9712  /  0.9796 
AdaBoostClassifier            0.9833 / 0.9904  /  0.9855 
GradientBoostingClassifier    1.0000 / 0.9904  /  0.9738 


In [12]:
# Report which estimators the evaluation loop had to skip, if any.
if rejected_models:
    print('Rejected Models list: ')
    for rej_models in rejected_models:
        print('  - ', rej_models)
else:
    print('No models rejected')

Rejected Models list: 
  -  LinearRegression
  -  Lasso
  -  Ridge
  -  RandomForestRegressor
  -  AdaBoostRegressor
  -  GradientBoostingRegressor


In [13]:
def highlight_range(s):
    """Return one CSS style string per element of ``s``.

    Elements equal to the Series maximum get a light-green background;
    every other element gets an empty style.
    """
    top_value = s.max()
    return [
        'background-color: lightgreen' if is_top else ''
        for is_top in s.eq(top_value)
    ]

# Rank models by cross-validated score and shade the best value in each score column.
score_columns = ['Train Results', 'Test Results', 'Test Cross Validation Results']
ranked_results = df_results.sort_values(by='Test Cross Validation Results', ascending=False)
ranked_results.style.apply(highlight_range, subset=pd.IndexSlice[:, score_columns])

Unnamed: 0,Model,Train Results,Test Results,Test Cross Validation Results
0,LogisticRegression,1.0,1.0,0.99416
2,SVC,0.991667,1.0,0.99416
1,KNeighborsClassifier,0.995833,1.0,0.994118
4,AdaBoostClassifier,0.983333,0.990385,0.985465
3,RandomForestClassifier,0.979167,0.971154,0.979582
5,GradientBoostingClassifier,1.0,0.990385,0.973828


### Feature Selection

In [14]:
# try:
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),('model',  ensemble.RandomForestRegressor(max_depth = 3))])
#     pipe.fit(X_train,y_train)
# except:
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),('model',  ensemble.RandomForestClassifier(max_depth = 3))])
#     pipe.fit(X_train,y_train)

# try:
#     onehot_columns = list(pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['one_hot']
#                           .get_feature_names(input_features=categorical_features))
# except:
#     onehot_columns = []

# numeric_features_list = list(numeric_features)
# numeric_features_list.extend(onehot_columns)
# eli5.explain_weights(pipe.named_steps['model'], top=10, feature_names=numeric_features_list)

### Troubleshooting

In [15]:
# rejected_models = []
# results = []
# print('{:30s}Train  /  Test   /  Cross Validation'.format('Model'))
# print('-----------------------------------------------------------')
# for model in model_list:
#     pipe                 = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])   
#     pipe.fit(X_train,y_train)
#     model_name       = type(model).__name__
#     train_results    = np.abs(pipe.score(X_train,y_train))
#     test_results     = np.abs(pipe.score(X_test,y_test))
#     cross_val_scores = np.abs(cross_val_score(pipe, X, y, cv=5)) 
#     cval_score       = cross_val_scores.mean()

#     print('{:30s}{:.4f} / {:.4f}  /  {:.4f} '.format(model_name, train_results, test_results,cval_score))