In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,roc_curve,precision_recall_curve,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from scipy.sparse import csr_matrix
from eli5.sklearn import PermutationImportance

import eli5
import xgboost as xgbs
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick import classifier, features, regressor
import pickle
import re

In [2]:
datafile_train=r'C:\Users\Priyanshi\Downloads\py_data\Consumer_Complaints_train.csv'
datafile_test=r'C:\Users\Priyanshi\Downloads\py_data\Consumer_Complaints_test_share.csv'
cd_train=pd.read_csv(datafile_train)
cd_test=pd.read_csv(datafile_test)

In [3]:
cd_train.shape,cd_test.shape

((478421, 18), (119606, 17))

In [4]:
cd_train.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [5]:
cd_test.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Complaint ID'],
      dtype='object')

In [6]:
cd_train.nunique()

Date received                     1759
Product                             12
Sub-product                         47
Issue                               95
Sub-issue                           68
Consumer complaint narrative     74019
Company public response             10
Company                           3276
State                               62
ZIP code                         25962
Tags                                 3
Consumer consent provided?           4
Submitted via                        6
Date sent to company              1706
Company response to consumer         7
Timely response?                     2
Consumer disputed?                   2
Complaint ID                    478421
dtype: int64

In [7]:
cd_train.dtypes

Date received                   object
Product                         object
Sub-product                     object
Issue                           object
Sub-issue                       object
Consumer complaint narrative    object
Company public response         object
Company                         object
State                           object
ZIP code                        object
Tags                            object
Consumer consent provided?      object
Submitted via                   object
Date sent to company            object
Company response to consumer    object
Timely response?                object
Consumer disputed?              object
Complaint ID                     int64
dtype: object

In [8]:
cd_train.isnull().sum()

Date received                        0
Product                              0
Sub-product                     138473
Issue                                0
Sub-issue                       292625
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Tags                            411215
Consumer consent provided?      342934
Submitted via                        0
Date sent to company                 0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
Complaint ID                         0
dtype: int64

In [9]:
#substituing space between column names with underscore
cd_train.columns=[re.sub("-|\s","_",item) for item in cd_train.columns]
cd_test.columns=[re.sub("-|\s","_",item) for item in cd_test.columns]

In [10]:
cd_train.head(2)

Unnamed: 0,Date_received,Product,Sub_product,Issue,Sub_issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided?,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response?,Consumer_disputed?,Complaint_ID
0,2014-05-15,Credit card,,Billing statement,,,,Wells Fargo & Company,MI,48342,Older American,,Web,2014-05-16,Closed with explanation,Yes,No,856103
1,2014-09-18,Bank account or service,(CD) Certificate of deposit,"Making/receiving payments, sending money",,,,Santander Bank US,PA,18042,,,Referral,2014-09-24,Closed,Yes,No,1034666


In [11]:
cd_test.head(2)

Unnamed: 0,Date_received,Product,Sub_product,Issue,Sub_issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided?,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response?,Complaint_ID
0,2014-01-18,Bank account or service,Cashing a check without an account,Deposits and withdrawals,,,,Bank of America,CA,95691,,,Web,2014-01-17,Closed with explanation,Yes,675956
1,2016-03-31,Debt collection,Credit card,Cont'd attempts collect debt not owed,Debt was paid,,,"National Credit Adjusters, LLC",FL,32086,,Consent not provided,Web,2016-03-31,Closed with explanation,Yes,1858795


In [12]:
cd_train["Date_received"].value_counts()

2015-08-27    767
2015-08-26    731
2014-06-26    697
2015-07-08    678
2015-08-12    654
             ... 
2016-09-18      6
2016-09-21      4
2016-09-22      4
2014-03-09      3
2016-09-23      1
Name: Date_received, Length: 1759, dtype: int64

In [13]:
#cd_train["Date received"]=pd.to_datetime(cd_train["Date received"],infer_datetime_format=True)

In [14]:
for col in ['Date_received','Date_sent_to_company']:
    cd_train[col]=pd.to_datetime(cd_train[col],infer_datetime_format=True)
    cd_test[col]=pd.to_datetime(cd_test[col],infer_datetime_format=True)

In [17]:
cd_train['day_diff']=(cd_train['Date_sent_to_company']-cd_train['Date_received']).dt.days
cd_test['day_diff']=(cd_test['Date_sent_to_company']-cd_test['Date_received']).dt.days

In [18]:
for col in ['Date_received','Date_sent_to_company']:
    cd_train.drop([col],1,inplace=True)
    cd_test.drop([col],1,inplace=True)

In [19]:
cd_train["day_diff"].value_counts()

0      209750
1       58939
2       38005
3       30711
4       28639
        ...  
571         1
612         1
356         1
573         1
511         1
Name: day_diff, Length: 398, dtype: int64

In [20]:
cd_test["day_diff"].value_counts()

0      52518
1      14598
2       9608
3       7758
4       7121
       ...  
359        1
285        1
250        1
589        1
330        1
Name: day_diff, Length: 290, dtype: int64

In [21]:
cd_train["Consumer_disputed?"].value_counts()

No     376990
Yes    101431
Name: Consumer_disputed?, dtype: int64

In [22]:
cd_train["Consumer_disputed?"]=np.where(cd_train["Consumer_disputed?"]=="Yes",1,0)

In [23]:
cd_train["Consumer_disputed?"].value_counts()

0    376990
1    101431
Name: Consumer_disputed?, dtype: int64

In [24]:
cd_train["Product"].value_counts() #ohe

Mortgage                   156175
Debt collection             86544
Credit reporting            81115
Credit card                 57358
Bank account or service     54403
Consumer Loan               18599
Student loan                14918
Money transfers              3349
Payday loan                  3219
Prepaid card                 2226
Other financial service       507
Virtual currency                8
Name: Product, dtype: int64

In [25]:
probs=round(cd_train.groupby("Product")["Consumer_disputed?"].mean(),2).to_dict()

In [26]:
probs.items()

dict_items([('Bank account or service', 0.2), ('Consumer Loan', 0.24), ('Credit card', 0.22), ('Credit reporting', 0.18), ('Debt collection', 0.2), ('Money transfers', 0.15), ('Mortgage', 0.24), ('Other financial service', 0.23), ('Payday loan', 0.17), ('Prepaid card', 0.15), ('Student loan', 0.21), ('Virtual currency', 0.38)])

In [27]:
def mapping_func(df,x,y,prefix="pro_"):
    probs=round(cd_train.groupby(x)[y].mean(),2).to_dict()
    mapping_dict=dict()
    for k,v in probs.items():
        mapping_dict[k]=prefix+str(v).replace(".","")
    return mapping_dict

In [28]:
cd_train=cd_train.assign(
    Product=cd_train["Product"].map(mapping_func(cd_train,"Product","Consumer_disputed?","prod_")),
    Sub_product=cd_train["Sub_product"].
    map(mapping_func(cd_train,"Sub_product","Consumer_disputed?","subprod_")),
    Issue=cd_train["Issue"].map(mapping_func(cd_train,"Issue","Consumer_disputed?","Issue_")),
    Company_public_response=cd_train["Company_public_response"].
    map(mapping_func(cd_train,"Company_public_response","Consumer_disputed?","Cps_")),
    Sub_issue=cd_train["Sub_issue"].map(mapping_func(cd_train,"Sub_issue","Consumer_disputed?","Sub_iss_")),
    State=cd_train["State"].map(mapping_func(cd_train,"State","Consumer_disputed?","State_")))


In [29]:
cd_train["Product"].value_counts()

prod_024    174774
prod_02     140947
prod_018     81115
prod_022     57358
prod_021     14918
prod_015      5575
prod_017      3219
prod_023       507
prod_038         8
Name: Product, dtype: int64

In [30]:
#cd_train["Sub_product"].value_counts()

In [32]:
cd_train.head(4)

Unnamed: 0,Product,Sub_product,Issue,Sub_issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided?,Submitted_via,Company_response_to_consumer,Timely_response?,Consumer_disputed?,Complaint_ID,day_diff
0,prod_022,,Issue_019,,,,Wells Fargo & Company,State_021,48342,Older American,,Web,Closed with explanation,Yes,0,856103,1
1,prod_02,subprod_027,Issue_021,,,,Santander Bank US,State_021,18042,,,Referral,Closed,Yes,0,1034666,6
2,prod_018,,Issue_016,Sub_iss_014,,,Equifax,State_022,92427,,,Referral,Closed with non-monetary relief,Yes,0,756363,21
3,prod_022,,Issue_019,,"My credit card statement from US Bank, XXXX. X...",Cps_019,U.S. Bancorp,State_021,305XX,Older American,Consent provided,Web,Closed with monetary relief,Yes,0,1474177,0


In [33]:
# for col in cd_train.select_dtypes(['object']).columns:
#     print(col,':',cd_train[col].nunique())
cd_train.nunique(axis=0,dropna=True)

Product                              9
Sub_product                         23
Issue                               24
Sub_issue                           23
Consumer_complaint_narrative     74019
Company_public_response              8
Company                           3276
State                               15
ZIP_code                         25962
Tags                                 3
Consumer_consent_provided?           4
Submitted_via                        6
Company_response_to_consumer         7
Timely_response?                     2
Consumer_disputed?                   2
Complaint_ID                    478421
day_diff                           398
dtype: int64

In [34]:
cd_train["Tags"].value_counts() #ohe

Older American                   39064
Servicemember                    22592
Older American, Servicemember     5550
Name: Tags, dtype: int64

In [35]:
cd_train["Submitted_via"].value_counts() #ohe

Web            313916
Referral        91352
Phone           34417
Postal mail     31448
Fax              7032
Email             256
Name: Submitted_via, dtype: int64

In [36]:
cd_train["Consumer_consent_provided?"].value_counts()

Consent provided        75095
Consent not provided    56147
Other                    4242
Consent withdrawn           3
Name: Consumer_consent_provided?, dtype: int64

In [37]:
cd_train["Consumer_consent_provided?"]=cd_train["Consumer_consent_provided?"].str.replace(" ","_")

In [38]:
cd_train["Consumer_consent_provided?"].value_counts() #ohe

Consent_provided        75095
Consent_not_provided    56147
Other                    4242
Consent_withdrawn           3
Name: Consumer_consent_provided?, dtype: int64

In [39]:
cd_train["Company_response_to_consumer"]=cd_train["Company_response_to_consumer"].str.replace(" ","_")
cd_train["Company_response_to_consumer"].value_counts() #ohe

Closed_with_explanation            354310
Closed_with_non-monetary_relief     61491
Closed_with_monetary_relief         32925
Closed_without_relief               14145
Closed                              11365
Closed_with_relief                   4184
Untimely_response                       1
Name: Company_response_to_consumer, dtype: int64

In [40]:
cd_train["Timely_response?"].value_counts() #ohe

Yes    470277
No       8144
Name: Timely_response?, dtype: int64

In [41]:
#dropping variables that have many unique values
cd_train.drop(["Consumer_complaint_narrative","Company","ZIP_code","Complaint_ID"],1,inplace=True)

In [42]:
cd_train.head(2)

Unnamed: 0,Product,Sub_product,Issue,Sub_issue,Company_public_response,State,Tags,Consumer_consent_provided?,Submitted_via,Company_response_to_consumer,Timely_response?,Consumer_disputed?,day_diff
0,prod_022,,Issue_019,,,State_021,Older American,,Web,Closed_with_explanation,Yes,0,1
1,prod_02,subprod_027,Issue_021,,,State_021,,,Referral,Closed,Yes,0,6


In [43]:
cd_train.isnull().sum() #SimpleImputer

Product                              0
Sub_product                     138473
Issue                                0
Sub_issue                       292625
Company_public_response         388029
State                             3839
Tags                            411215
Consumer_consent_provided?      342934
Submitted_via                        0
Company_response_to_consumer         0
Timely_response?                     0
Consumer_disputed?                   0
day_diff                             0
dtype: int64

In [44]:
cd_train.isnull().sum()*100/len(cd_train) #percentage

Product                          0.000000
Sub_product                     28.943755
Issue                            0.000000
Sub_issue                       61.164748
Company_public_response         81.106181
State                            0.802431
Tags                            85.952540
Consumer_consent_provided?      71.680382
Submitted_via                    0.000000
Company_response_to_consumer     0.000000
Timely_response?                 0.000000
Consumer_disputed?               0.000000
day_diff                         0.000000
dtype: float64

In [45]:
cd_test.isnull().sum()*100/len(cd_test)

Product                          0.000000
Sub_product                     29.055399
Issue                            0.000000
Sub_issue                       61.083892
Consumer_complaint_narrative    84.484892
Company_public_response         80.957477
Company                          0.000000
State                            0.773373
ZIP_code                         0.774209
Tags                            85.894520
Consumer_consent_provided?      71.687039
Submitted_via                    0.000836
Company_response_to_consumer     0.000000
Timely_response?                 0.000000
Complaint_ID                     0.000000
day_diff                         0.000000
dtype: float64

In [48]:
cd_train.dtypes

Product                         object
Sub_product                     object
Issue                           object
Sub_issue                       object
Company_public_response         object
State                           object
Tags                            object
Consumer_consent_provided?      object
Submitted_via                   object
Company_response_to_consumer    object
Timely_response?                object
Consumer_disputed?               int32
day_diff                         int64
dtype: object

Train Test split

In [49]:
x_train,x_test= train_test_split(cd_train,test_size=.2,random_state=1)

In [50]:
x_train1=x_train.drop(["Consumer_disputed?"],1)
y_train1=x_train["Consumer_disputed?"]

In [51]:
x_test1=x_test.drop(["Consumer_disputed?"],1)
y_test1=x_test["Consumer_disputed?"]

In [52]:
num_cols=x_train1.select_dtypes(np.number).columns

In [53]:
char_cols=x_train1.select_dtypes(object).columns

In [54]:
#pipeline works sequentially.. 
pipe_num= make_pipeline(SimpleImputer(strategy="median"),StandardScaler())
pipe_char=make_pipeline(SimpleImputer(strategy="constant",fill_value="Missing"),
                       OneHotEncoder(handle_unknown="ignore"))

In [55]:
#transformer works parallely
ctrans=make_column_transformer((pipe_num,num_cols),(pipe_char,char_cols))

In [56]:
ctrans.fit_transform(x_train1)

<382736x131 sparse matrix of type '<class 'numpy.float64'>'
	with 4592832 stored elements in Compressed Sparse Row format>

In [57]:
ctrans.transform(cd_test)

<119606x131 sparse matrix of type '<class 'numpy.float64'>'
	with 773708 stored elements in Compressed Sparse Row format>

### Logistic Regression

In [58]:
logreg=LogisticRegression(
   solver="liblinear",
    penalty="l1",
    class_weight="balanced",
    random_state=1,
    max_iter=800,)

In [59]:
logreg

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=800, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
pipe=make_pipeline(ctrans,logreg)

In [61]:
pipe

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

In [62]:
pipe.fit(x_train1,y_train1)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

In [63]:
pipe.predict(x_train1)

array([1, 1, 1, ..., 0, 0, 1])

In [64]:
pipe.predict(x_test1)

array([0, 0, 1, ..., 1, 0, 0])

In [65]:
roc_auc_score(y_train1,pipe.predict_proba(x_train1)[:,1])

0.6209888506242285

In [66]:
roc_auc_score(y_test1,pipe.predict_proba(x_test1)[:,1])

0.6205036820781602

##### Now for whole data roc_auc_score

In [67]:
roc_auc_score(y_train1,pipe.predict_proba(x_train)[:,1])

0.6209888506242285

In [68]:
roc_auc_score(y_test1,pipe.predict_proba(x_test)[:,1])

0.6205036820781602

In [69]:
pipe.predict(cd_test)

array([1, 1, 1, ..., 0, 0, 1])

In [70]:
submission=pipe.predict(cd_test)

In [72]:
submission=pd.DataFrame(data=submission)

In [73]:
type(submission)

pandas.core.frame.DataFrame

In [74]:
submission

Unnamed: 0,0
0,1
1,1
2,1
3,0
4,1
...,...
119601,1
119602,1
119603,0
119604,0


In [75]:
submission.columns=["Consumer disputed?"]

In [76]:
submission["Consumer disputed?"]=np.where(submission["Consumer disputed?"]==1,"Yes","No")

In [77]:
submission["Complaint ID"]=cd_test["Complaint_ID"]

In [78]:
submission=submission[["Complaint ID","Consumer disputed?"]]

In [79]:
submission

Unnamed: 0,Complaint ID,Consumer disputed?
0,675956,Yes
1,1858795,Yes
2,32637,Yes
3,1731374,No
4,501487,Yes
...,...,...
119601,153482,Yes
119602,2023523,Yes
119603,1413678,No
119604,732458,No


In [80]:
submission.to_csv("submission1.csv",index=False)

In [None]:
#cross_val_score(pipe, x_train1, y_train1, scoring="accuracy")

### Grid Search Cv

In [81]:
params={}
params["logisticregression__penalty"]=["l1","l2"]
params["logisticregression__C"]=[.1,.01,1,10]

In [82]:
logreg=LogisticRegression(solver="liblinear",random_state=1)
pipe=make_pipeline(ctrans,logreg)

In [83]:
grid=GridSearchCV(pipe,params,cv=5,scoring="accuracy")

In [84]:
grid

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                  

In [85]:
grid.fit(x_train1,y_train1)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                  

In [86]:
roc_auc_score(y_train1,grid.predict_proba(x_train1)[:,1])

0.6210998587592601

In [87]:
roc_auc_score(y_test1,grid.predict_proba(x_test1)[:,1])

0.6206961740333149

In [88]:
dfs=pd.DataFrame(grid.cv_results_)

In [89]:
dfs

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_logisticregression__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,144.837716,18.865736,0.394867,0.036591,0.1,l1,"{'logisticregression__C': 0.1, 'logisticregres...",0.787846,0.787856,0.787843,0.787843,0.787843,0.787846,5e-06,4
1,5.702942,0.306806,0.365245,0.01503,0.1,l2,"{'logisticregression__C': 0.1, 'logisticregres...",0.787859,0.787856,0.787843,0.787843,0.787843,0.787849,7e-06,3
2,45.554518,4.161865,0.413077,0.024523,0.01,l1,"{'logisticregression__C': 0.01, 'logisticregre...",0.787846,0.787856,0.787843,0.787843,0.787843,0.787846,5e-06,4
3,4.85544,0.48611,0.436092,0.054742,0.01,l2,"{'logisticregression__C': 0.01, 'logisticregre...",0.787846,0.787856,0.787843,0.787843,0.787843,0.787846,5e-06,4
4,134.958645,15.782655,0.368447,0.010329,1.0,l1,"{'logisticregression__C': 1, 'logisticregressi...",0.787885,0.787869,0.787856,0.78783,0.787882,0.787864,2e-05,1
5,8.758462,0.75954,0.361168,0.015217,1.0,l2,"{'logisticregression__C': 1, 'logisticregressi...",0.787872,0.787843,0.787856,0.787817,0.787882,0.787854,2.3e-05,2
6,6.208883,0.226527,0.358011,0.011017,10.0,l1,"{'logisticregression__C': 10, 'logisticregress...",0.787872,0.787817,0.787856,0.787791,0.787882,0.787843,3.5e-05,8
7,15.498917,2.684697,0.368123,0.020827,10.0,l2,"{'logisticregression__C': 10, 'logisticregress...",0.787872,0.78783,0.787856,0.787791,0.787882,0.787846,3.3e-05,7


RandomForest

In [46]:
strings = 'randomforestclassifier__'

params_dict = {'n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)],
'max_features' : ['auto', 'sqrt'],
'max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)],
'min_samples_split' : [ 10 , 20, 25],
'min_samples_leaf' : [ 4, 8, 10],
'bootstrap' : [True, False]}

In [47]:
params_dict.items()

dict_items([('n_estimators', [200, 288, 377, 466, 555, 644, 733, 822, 911, 1000]), ('max_features', ['auto', 'sqrt']), ('max_depth', [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]), ('min_samples_split', [10, 20, 25]), ('min_samples_leaf', [4, 8, 10]), ('bootstrap', [True, False])])

In [48]:
params_dict ={strings +k :v for k,v in params_dict.items()}

In [49]:
params_dict

{'randomforestclassifier__n_estimators': [200,
  288,
  377,
  466,
  555,
  644,
  733,
  822,
  911,
  1000],
 'randomforestclassifier__max_features': ['auto', 'sqrt'],
 'randomforestclassifier__max_depth': [10,
  20,
  30,
  40,
  50,
  60,
  70,
  80,
  90,
  100,
  110],
 'randomforestclassifier__min_samples_split': [10, 20, 25],
 'randomforestclassifier__min_samples_leaf': [4, 8, 10],
 'randomforestclassifier__bootstrap': [True, False]}

In [50]:
rf=RandomForestClassifier(random_state=1)
pipe=make_pipeline(ctrans,rf)

In [51]:
ctrans.fit_transform(x_train1)

<382736x131 sparse matrix of type '<class 'numpy.float64'>'
	with 4592832 stored elements in Compressed Sparse Row format>

In [52]:
grid=RandomizedSearchCV(pipe,params_dict,cv=5,scoring="accuracy")

In [None]:
grid.fit(x_train1,y_train1)

In [None]:
roc_auc_score(y_train1,grid.predict_proba(x_train1)[:,1])

In [None]:
roc_auc_score(y_test1,grid.predict_proba(x_test1)[:,1])

In [None]:
#random forest takes too much time.. we ll use xgboost

XGBOOST

In [90]:
parameters = {
    'max_depth':range(2,10,1),
    'n_estimators': [40,60,100,150],
    'learning_rate': [0.1, 0.01,0.05], #shrinkage.. reduces overfitting
    'reg_lambda':[0.1,0.01,1,10],
    'reg_alpha':[0.1,0.01,1,10],
}

In [91]:
parameters.items()

dict_items([('max_depth', range(2, 10)), ('n_estimators', [40, 60, 100, 150]), ('learning_rate', [0.1, 0.01, 0.05]), ('reg_lambda', [0.1, 0.01, 1, 10]), ('reg_alpha', [0.1, 0.01, 1, 10])])

In [92]:
strings="xgb__"

xgb_params={}
for k,v in parameters.items():
    xgb_params[strings+k]=v

In [93]:
xgb_params

{'xgb__max_depth': range(2, 10),
 'xgb__n_estimators': [40, 60, 100, 150],
 'xgb__learning_rate': [0.1, 0.01, 0.05],
 'xgb__reg_lambda': [0.1, 0.01, 1, 10],
 'xgb__reg_alpha': [0.1, 0.01, 1, 10]}

In [94]:
xgb=xgbs.XGBClassifier()
pipe=Pipeline([("columntransfer",ctrans),("xgb",xgb)])

In [95]:
grid=RandomizedSearchCV(pipe,xgb_params,cv=5,scoring="accuracy")

In [96]:
grid.fit(x_train1,y_train1)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('columntransfer',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('simpleimputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                      

In [97]:
roc_auc_score(y_train1,grid.predict_proba(x_train1)[:,1])

0.6214266876127915

In [98]:
roc_auc_score(y_test1,grid.predict_proba(x_test1)[:,1])

0.6189434338409635

In [99]:
roc_auc_score(y_train1,grid.predict_proba(x_train)[:,1])

0.6214266876127915

In [100]:
roc_auc_score(y_test1,grid.predict_proba(x_test)[:,1])

0.6189434338409635

In [101]:
grid.best_params_

{'xgb__reg_lambda': 10,
 'xgb__reg_alpha': 10,
 'xgb__n_estimators': 60,
 'xgb__max_depth': 8,
 'xgb__learning_rate': 0.01}

## default importance

In [103]:
grid.best_estimator_["xgb"].feature_importances_[0:10]

array([7.58195471e-04, 1.78985833e-03, 1.51755648e-05, 0.00000000e+00,
       2.97765923e-03, 2.18038005e-03, 3.51556321e-03, 0.00000000e+00,
       1.28275575e-02, 0.00000000e+00], dtype=float32)

In [105]:
onehotcols=ctrans.named_transformers_['pipeline-2']['onehotencoder'].get_feature_names()

In [108]:
all_cols=list(num_cols)+list(onehotcols)

In [117]:
df=pd.DataFrame(grid.best_estimator_["xgb"].feature_importances_,index=all_cols).reset_index(drop=False)

In [118]:
df

Unnamed: 0,index,0
0,day_diff,0.000758
1,x0_prod_015,0.001790
2,x0_prod_017,0.000015
3,x0_prod_018,0.000000
4,x0_prod_02,0.002978
...,...,...
126,x9_Closed_with_relief,0.000000
127,x9_Closed_without_relief,0.147216
128,x9_Untimely_response,0.000000
129,x10_No,0.000114
