In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np    

from sklearn.neighbors import KNeighborsClassifier
from lightgbm import  LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from scipy.stats import chi2_contingency 

from preprocess import separate, extra_features
from pipelines import preprocessor, preprocessor2

In [2]:
def chi(data):
    col_drop=[]
    
    for col in data.columns:
        
        ct=pd.crosstab(y, data[col])
        chi2, p, _, _ =chi2_contingency(ct)
        
        if p >= 0.05:
            col_drop.append(col)
        elif chi2 <= 3000:
            col_drop.append(col)
            
    return col_drop

In [3]:
train= pd.read_csv("Datasets\Train.csv") 
train=extra_features(train)
X=train.drop('cost_category', axis=1)
y=train['cost_category']
col_drop=chi(train)
X.drop(columns=col_drop, inplace=True)
print(f"X: {X.shape}\ny: {y.shape}")

X: (18506, 16)
y: (18506,)


In [4]:
X1=preprocessor(X)
X2=preprocessor2(X)
print(f"X1: {X1.shape}\nX2: {X2.shape}")

X1: (18506, 177)
X2: (18506, 144)


In [5]:
le=LabelEncoder()
y=le.fit_transform(y)
y.shape

(18506,)

In [7]:
models= {
    "KNN": KNeighborsClassifier(n_neighbors=5, weights="uniform", algorithm="auto"),
    "SVM": SVC(gamma='auto', C= 100),
    "GBM": LGBMClassifier(n_estimators=200, learning_rate=0.01),
    "XGB": XGBClassifier(n_estimators=200, learning_rate=0.01),
    "Cat": CatBoostClassifier(iterations=200,max_depth=8,learning_rate=0.1),
    "Forest": RandomForestClassifier(max_depth=8),
    "Tree": DecisionTreeClassifier(max_depth=8),
}


samples={
    "X1": X1,
    "X2": X2,
}

In [8]:
kfolds=StratifiedKFold(n_splits=10,shuffle=True)

def strat(modell, sample, y, kfold):

    results= {}

    results['model_name']=modell+"_"+sample
    
    
    model=models[modell]
    X= samples[sample]
    
    for fold, (train_idx, val_idx) in enumerate(kfolds.split(X,y)):
        X_train, X_test=X[train_idx], X[val_idx]
        y_train, y_test=y[train_idx], y[val_idx]
    
        model.fit(X_train, y_train)
    
        y_pred=model.predict(X_test)
        
        results["Fold_{}".format(fold+1)]=f1_score(y_pred, y_test, average='weighted')
        #results['mean']=results.mean(axis=1)
    
    #df= pd.DataFrame(results)   
    return results

In [None]:
all_models=[]

for model in models:
    for X in samples:
        all_models.append(strat(model, X, y, kfolds))
        print(f"Model: {model}_{X}......\n")

Model: KNN_X1......

Model: KNN_X2......

Model: SVM_X1......

Model: SVM_X2......

Model: GBM_X1......

Model: GBM_X2......

Model: XGB_X1......

Model: XGB_X2......

0:	learn: 1.6754449	total: 196ms	remaining: 39s
1:	learn: 1.5868437	total: 247ms	remaining: 24.4s
2:	learn: 1.5175240	total: 299ms	remaining: 19.7s
3:	learn: 1.4629949	total: 352ms	remaining: 17.2s
4:	learn: 1.4180155	total: 401ms	remaining: 15.7s
5:	learn: 1.3790351	total: 463ms	remaining: 15s
6:	learn: 1.3461250	total: 525ms	remaining: 14.5s
7:	learn: 1.3178903	total: 577ms	remaining: 13.9s
8:	learn: 1.2920741	total: 628ms	remaining: 13.3s
9:	learn: 1.2706046	total: 681ms	remaining: 12.9s
10:	learn: 1.2512105	total: 735ms	remaining: 12.6s
11:	learn: 1.2361391	total: 809ms	remaining: 12.7s
12:	learn: 1.2216854	total: 862ms	remaining: 12.4s
13:	learn: 1.2081527	total: 932ms	remaining: 12.4s
14:	learn: 1.1959390	total: 1.01s	remaining: 12.5s
15:	learn: 1.1844188	total: 1.06s	remaining: 12.2s
16:	learn: 1.1762472	total: 1.

159:	learn: 0.9868640	total: 8.65s	remaining: 2.16s
160:	learn: 0.9863343	total: 8.71s	remaining: 2.11s
161:	learn: 0.9859722	total: 8.79s	remaining: 2.06s
162:	learn: 0.9855687	total: 8.85s	remaining: 2.01s
163:	learn: 0.9846303	total: 8.9s	remaining: 1.95s
164:	learn: 0.9839473	total: 8.95s	remaining: 1.9s
165:	learn: 0.9833261	total: 9.01s	remaining: 1.84s
166:	learn: 0.9826896	total: 9.06s	remaining: 1.79s
167:	learn: 0.9820978	total: 9.13s	remaining: 1.74s
168:	learn: 0.9811845	total: 9.19s	remaining: 1.69s
169:	learn: 0.9806947	total: 9.26s	remaining: 1.63s
170:	learn: 0.9803136	total: 9.33s	remaining: 1.58s
171:	learn: 0.9793040	total: 9.41s	remaining: 1.53s
172:	learn: 0.9789506	total: 9.46s	remaining: 1.48s
173:	learn: 0.9780851	total: 9.51s	remaining: 1.42s
174:	learn: 0.9779468	total: 9.56s	remaining: 1.36s
175:	learn: 0.9773143	total: 9.6s	remaining: 1.31s
176:	learn: 0.9770026	total: 9.65s	remaining: 1.25s
177:	learn: 0.9764614	total: 9.71s	remaining: 1.2s
178:	learn: 0.97

121:	learn: 1.0124584	total: 6.82s	remaining: 4.36s
122:	learn: 1.0119753	total: 6.87s	remaining: 4.3s
123:	learn: 1.0117654	total: 6.92s	remaining: 4.24s
124:	learn: 1.0111578	total: 6.97s	remaining: 4.18s
125:	learn: 1.0107108	total: 7.02s	remaining: 4.12s
126:	learn: 1.0102224	total: 7.06s	remaining: 4.06s
127:	learn: 1.0098228	total: 7.11s	remaining: 4s
128:	learn: 1.0092452	total: 7.15s	remaining: 3.94s
129:	learn: 1.0087372	total: 7.2s	remaining: 3.88s
130:	learn: 1.0080583	total: 7.25s	remaining: 3.82s
131:	learn: 1.0072621	total: 7.3s	remaining: 3.76s
132:	learn: 1.0065483	total: 7.35s	remaining: 3.7s
133:	learn: 1.0060349	total: 7.39s	remaining: 3.64s
134:	learn: 1.0057253	total: 7.45s	remaining: 3.58s
135:	learn: 1.0054420	total: 7.49s	remaining: 3.52s
136:	learn: 1.0041439	total: 7.54s	remaining: 3.46s
137:	learn: 1.0038090	total: 7.58s	remaining: 3.4s
138:	learn: 1.0029594	total: 7.63s	remaining: 3.35s
139:	learn: 1.0022891	total: 7.67s	remaining: 3.29s
140:	learn: 1.001545

83:	learn: 1.0371764	total: 4.57s	remaining: 6.31s
84:	learn: 1.0365454	total: 4.63s	remaining: 6.26s
85:	learn: 1.0361330	total: 4.68s	remaining: 6.2s
86:	learn: 1.0353947	total: 4.73s	remaining: 6.14s
87:	learn: 1.0345851	total: 4.78s	remaining: 6.08s
88:	learn: 1.0336023	total: 4.84s	remaining: 6.04s
89:	learn: 1.0325731	total: 4.89s	remaining: 5.98s
90:	learn: 1.0322479	total: 4.92s	remaining: 5.89s
91:	learn: 1.0317326	total: 4.97s	remaining: 5.84s
92:	learn: 1.0304078	total: 5.02s	remaining: 5.78s
93:	learn: 1.0299055	total: 5.08s	remaining: 5.72s
94:	learn: 1.0294391	total: 5.12s	remaining: 5.66s
95:	learn: 1.0289574	total: 5.17s	remaining: 5.61s
96:	learn: 1.0284017	total: 5.23s	remaining: 5.55s
97:	learn: 1.0278241	total: 5.29s	remaining: 5.5s
98:	learn: 1.0269881	total: 5.34s	remaining: 5.45s
99:	learn: 1.0256310	total: 5.4s	remaining: 5.4s
100:	learn: 1.0245707	total: 5.46s	remaining: 5.35s
101:	learn: 1.0235954	total: 5.51s	remaining: 5.29s
102:	learn: 1.0230310	total: 5.57

45:	learn: 1.0797723	total: 2.13s	remaining: 7.14s
46:	learn: 1.0790588	total: 2.18s	remaining: 7.09s
47:	learn: 1.0771058	total: 2.23s	remaining: 7.05s
48:	learn: 1.0757721	total: 2.27s	remaining: 7s
49:	learn: 1.0744790	total: 2.32s	remaining: 6.96s
50:	learn: 1.0730698	total: 2.37s	remaining: 6.91s
51:	learn: 1.0714534	total: 2.41s	remaining: 6.86s
52:	learn: 1.0701360	total: 2.45s	remaining: 6.81s
53:	learn: 1.0693106	total: 2.5s	remaining: 6.76s
54:	learn: 1.0684286	total: 2.54s	remaining: 6.7s
55:	learn: 1.0675587	total: 2.59s	remaining: 6.65s
56:	learn: 1.0662666	total: 2.63s	remaining: 6.61s
57:	learn: 1.0647091	total: 2.68s	remaining: 6.56s
58:	learn: 1.0640915	total: 2.73s	remaining: 6.51s
59:	learn: 1.0632515	total: 2.77s	remaining: 6.46s
60:	learn: 1.0625349	total: 2.82s	remaining: 6.42s
61:	learn: 1.0617265	total: 2.86s	remaining: 6.37s
62:	learn: 1.0607835	total: 2.91s	remaining: 6.32s
63:	learn: 1.0596630	total: 2.95s	remaining: 6.27s
64:	learn: 1.0586772	total: 3s	remai

5:	learn: 1.3791000	total: 327ms	remaining: 10.6s
6:	learn: 1.3459104	total: 385ms	remaining: 10.6s
7:	learn: 1.3171399	total: 443ms	remaining: 10.6s
8:	learn: 1.2921256	total: 510ms	remaining: 10.8s
9:	learn: 1.2713000	total: 568ms	remaining: 10.8s
10:	learn: 1.2507090	total: 621ms	remaining: 10.7s
11:	learn: 1.2344179	total: 673ms	remaining: 10.5s
12:	learn: 1.2193104	total: 728ms	remaining: 10.5s
13:	learn: 1.2055460	total: 782ms	remaining: 10.4s
14:	learn: 1.1927026	total: 836ms	remaining: 10.3s
15:	learn: 1.1832364	total: 890ms	remaining: 10.2s
16:	learn: 1.1742676	total: 942ms	remaining: 10.1s
17:	learn: 1.1644874	total: 997ms	remaining: 10.1s
18:	learn: 1.1571386	total: 1.05s	remaining: 10s
19:	learn: 1.1504923	total: 1.11s	remaining: 9.97s
20:	learn: 1.1436768	total: 1.17s	remaining: 9.94s
21:	learn: 1.1377893	total: 1.22s	remaining: 9.9s
22:	learn: 1.1320067	total: 1.27s	remaining: 9.82s
23:	learn: 1.1269145	total: 1.33s	remaining: 9.75s
24:	learn: 1.1226604	total: 1.38s	remai

168:	learn: 0.9853382	total: 8.6s	remaining: 1.58s
169:	learn: 0.9847157	total: 8.65s	remaining: 1.53s
170:	learn: 0.9845518	total: 8.7s	remaining: 1.47s
171:	learn: 0.9839899	total: 8.74s	remaining: 1.42s
172:	learn: 0.9831715	total: 8.78s	remaining: 1.37s
173:	learn: 0.9823127	total: 8.83s	remaining: 1.32s
174:	learn: 0.9817619	total: 8.87s	remaining: 1.27s
175:	learn: 0.9813453	total: 8.93s	remaining: 1.22s
176:	learn: 0.9806230	total: 9.03s	remaining: 1.17s
177:	learn: 0.9799142	total: 9.1s	remaining: 1.12s
178:	learn: 0.9795740	total: 9.17s	remaining: 1.07s
179:	learn: 0.9789791	total: 9.23s	remaining: 1.02s
180:	learn: 0.9782962	total: 9.28s	remaining: 975ms
181:	learn: 0.9773722	total: 9.34s	remaining: 924ms
182:	learn: 0.9769180	total: 9.39s	remaining: 872ms
183:	learn: 0.9766613	total: 9.45s	remaining: 821ms
184:	learn: 0.9761476	total: 9.5s	remaining: 770ms
185:	learn: 0.9756517	total: 9.57s	remaining: 721ms
186:	learn: 0.9754722	total: 9.65s	remaining: 671ms
187:	learn: 0.97

132:	learn: 1.0055165	total: 7.66s	remaining: 3.86s
133:	learn: 1.0050138	total: 7.71s	remaining: 3.8s
134:	learn: 1.0039855	total: 7.77s	remaining: 3.74s
135:	learn: 1.0034534	total: 7.82s	remaining: 3.68s
136:	learn: 1.0029160	total: 7.87s	remaining: 3.62s
137:	learn: 1.0022158	total: 7.92s	remaining: 3.56s
138:	learn: 1.0019041	total: 7.97s	remaining: 3.5s
139:	learn: 1.0013083	total: 8.03s	remaining: 3.44s
140:	learn: 1.0007680	total: 8.08s	remaining: 3.38s
141:	learn: 1.0001021	total: 8.16s	remaining: 3.33s
142:	learn: 0.9991876	total: 8.22s	remaining: 3.28s
143:	learn: 0.9987949	total: 8.29s	remaining: 3.22s
144:	learn: 0.9978848	total: 8.35s	remaining: 3.17s
145:	learn: 0.9976267	total: 8.42s	remaining: 3.11s
146:	learn: 0.9973738	total: 8.49s	remaining: 3.06s
147:	learn: 0.9967977	total: 8.57s	remaining: 3.01s
148:	learn: 0.9964509	total: 8.62s	remaining: 2.95s
149:	learn: 0.9960713	total: 8.68s	remaining: 2.89s
150:	learn: 0.9954068	total: 8.73s	remaining: 2.83s
151:	learn: 0.

97:	learn: 1.0300258	total: 5.61s	remaining: 5.83s
98:	learn: 1.0293458	total: 5.66s	remaining: 5.77s
99:	learn: 1.0281346	total: 5.71s	remaining: 5.71s
100:	learn: 1.0278343	total: 5.76s	remaining: 5.65s
101:	learn: 1.0272830	total: 5.82s	remaining: 5.59s
102:	learn: 1.0266914	total: 5.87s	remaining: 5.52s
103:	learn: 1.0258588	total: 5.92s	remaining: 5.46s
104:	learn: 1.0249730	total: 5.97s	remaining: 5.4s
105:	learn: 1.0239426	total: 6.06s	remaining: 5.37s
106:	learn: 1.0234065	total: 6.12s	remaining: 5.32s
107:	learn: 1.0227333	total: 6.17s	remaining: 5.26s
108:	learn: 1.0220618	total: 6.23s	remaining: 5.2s
109:	learn: 1.0212024	total: 6.29s	remaining: 5.14s
110:	learn: 1.0205630	total: 6.34s	remaining: 5.08s
111:	learn: 1.0198590	total: 6.39s	remaining: 5.02s
112:	learn: 1.0191005	total: 6.45s	remaining: 4.96s
113:	learn: 1.0184612	total: 6.5s	remaining: 4.91s
114:	learn: 1.0180747	total: 6.56s	remaining: 4.85s
115:	learn: 1.0169352	total: 6.62s	remaining: 4.79s
116:	learn: 1.0160

61:	learn: 1.0585616	total: 2.88s	remaining: 6.42s
62:	learn: 1.0575920	total: 2.93s	remaining: 6.37s
63:	learn: 1.0555724	total: 2.98s	remaining: 6.32s
64:	learn: 1.0542732	total: 3.02s	remaining: 6.28s
65:	learn: 1.0531968	total: 3.07s	remaining: 6.22s
66:	learn: 1.0514939	total: 3.11s	remaining: 6.18s
67:	learn: 1.0507348	total: 3.16s	remaining: 6.13s
68:	learn: 1.0501678	total: 3.21s	remaining: 6.08s
69:	learn: 1.0495660	total: 3.25s	remaining: 6.04s
70:	learn: 1.0482027	total: 3.3s	remaining: 5.99s
71:	learn: 1.0473369	total: 3.34s	remaining: 5.94s
72:	learn: 1.0468743	total: 3.4s	remaining: 5.91s
73:	learn: 1.0459130	total: 3.46s	remaining: 5.9s
74:	learn: 1.0452233	total: 3.52s	remaining: 5.87s
75:	learn: 1.0449127	total: 3.58s	remaining: 5.84s
76:	learn: 1.0433328	total: 3.63s	remaining: 5.8s
77:	learn: 1.0425936	total: 3.69s	remaining: 5.76s
78:	learn: 1.0418290	total: 3.74s	remaining: 5.72s
79:	learn: 1.0412139	total: 3.79s	remaining: 5.69s
80:	learn: 1.0404806	total: 3.85s	r

22:	learn: 1.1370253	total: 1.43s	remaining: 11s
23:	learn: 1.1315774	total: 1.48s	remaining: 10.9s
24:	learn: 1.1273474	total: 1.53s	remaining: 10.7s
25:	learn: 1.1225606	total: 1.57s	remaining: 10.5s
26:	learn: 1.1193014	total: 1.63s	remaining: 10.4s
27:	learn: 1.1158290	total: 1.67s	remaining: 10.3s
28:	learn: 1.1124040	total: 1.72s	remaining: 10.1s
29:	learn: 1.1091208	total: 1.76s	remaining: 9.98s
30:	learn: 1.1065113	total: 1.8s	remaining: 9.84s
31:	learn: 1.1036939	total: 1.85s	remaining: 9.73s
32:	learn: 1.1018988	total: 1.9s	remaining: 9.62s
33:	learn: 1.1000272	total: 1.95s	remaining: 9.5s
34:	learn: 1.0982218	total: 1.99s	remaining: 9.38s
35:	learn: 1.0966935	total: 2.04s	remaining: 9.28s
36:	learn: 1.0945101	total: 2.08s	remaining: 9.18s
37:	learn: 1.0926598	total: 2.13s	remaining: 9.08s
38:	learn: 1.0905225	total: 2.18s	remaining: 8.98s
39:	learn: 1.0889868	total: 2.22s	remaining: 8.89s
40:	learn: 1.0877396	total: 2.27s	remaining: 8.79s
41:	learn: 1.0856141	total: 2.31s	re

182:	learn: 0.9825347	total: 9.33s	remaining: 867ms
183:	learn: 0.9822161	total: 9.39s	remaining: 817ms
184:	learn: 0.9815378	total: 9.45s	remaining: 766ms
185:	learn: 0.9808337	total: 9.51s	remaining: 716ms
186:	learn: 0.9802633	total: 9.56s	remaining: 665ms
187:	learn: 0.9799863	total: 9.61s	remaining: 613ms
188:	learn: 0.9798165	total: 9.66s	remaining: 562ms
189:	learn: 0.9788911	total: 9.72s	remaining: 511ms
190:	learn: 0.9787886	total: 9.77s	remaining: 460ms
191:	learn: 0.9783833	total: 9.83s	remaining: 409ms
192:	learn: 0.9777898	total: 9.88s	remaining: 358ms
193:	learn: 0.9772994	total: 9.93s	remaining: 307ms
194:	learn: 0.9771286	total: 10s	remaining: 257ms
195:	learn: 0.9765815	total: 10.1s	remaining: 206ms
196:	learn: 0.9762364	total: 10.1s	remaining: 155ms
197:	learn: 0.9755773	total: 10.2s	remaining: 103ms
198:	learn: 0.9752725	total: 10.3s	remaining: 51.8ms
199:	learn: 0.9748386	total: 10.5s	remaining: 0us
0:	learn: 1.6737726	total: 285ms	remaining: 56.7s
1:	learn: 1.58987

145:	learn: 0.9961781	total: 17.6s	remaining: 6.52s
146:	learn: 0.9957518	total: 17.7s	remaining: 6.37s
147:	learn: 0.9953322	total: 17.7s	remaining: 6.22s
148:	learn: 0.9951001	total: 17.8s	remaining: 6.08s
149:	learn: 0.9948399	total: 17.9s	remaining: 5.95s
150:	learn: 0.9940247	total: 18s	remaining: 5.83s
151:	learn: 0.9938480	total: 18s	remaining: 5.7s
152:	learn: 0.9935472	total: 18.1s	remaining: 5.56s
153:	learn: 0.9928536	total: 18.1s	remaining: 5.42s
154:	learn: 0.9923021	total: 18.2s	remaining: 5.29s
155:	learn: 0.9916950	total: 18.3s	remaining: 5.15s
156:	learn: 0.9912739	total: 18.3s	remaining: 5.02s
157:	learn: 0.9908681	total: 18.4s	remaining: 4.88s
158:	learn: 0.9902897	total: 18.4s	remaining: 4.75s
159:	learn: 0.9897039	total: 18.5s	remaining: 4.62s
160:	learn: 0.9895248	total: 18.5s	remaining: 4.49s
161:	learn: 0.9893044	total: 18.6s	remaining: 4.36s
162:	learn: 0.9887378	total: 18.7s	remaining: 4.23s
163:	learn: 0.9879815	total: 18.7s	remaining: 4.1s
164:	learn: 0.9873

In [None]:
models_df=pd.DataFrame(all_models)
models_df.dropna(axis=1, inplace=True)
models_df['av_score']=models_df.loc[:,"Fold_1":"Fold_10"].sum(axis=1)*100/10
models_df

In [None]:
X1.shape

In [None]:
X2.shape

In [None]:
models_df.describe().T

In [None]:
sns.set(style='white', color_codes=True)

ax=sns.catplot(x='model_name',y='av_score', kind='bar', data=models_df)
plt.title("Average KFold F1_Scores of the Baseline Models", weight='bold')
ax.set_xticklabels(rotation=90);