In [3]:
%matplotlib inline
import matplotlib as plt
import seaborn as sns
import pandas as pd
import numpy as np    

from sklearn.neighbors import KNeighborsClassifier
from lightgbm import  LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from scipy.stats import chi2_contingency 

from preprocess import separate, extra_features
from pipelines import preprocessor, preprocessor2

In [39]:
def chi(data):
    col_drop=[]
    
    for col in data.columns:
        
        ct=pd.crosstab(y, data[col])
        chi2, p, _, _ =chi2_contingency(ct)
        
        if p >= 0.05:
            col_drop.append(col)
        elif chi2 <= 3000:
            col_drop.append(col)
            
    return col_drop

In [48]:
train= pd.read_csv("Datasets\Train.csv") 
train=extra_features(train)
X=train.drop('cost_category', axis=1)
y=train['cost_category']
col_drop=chi(train)
X.drop(columns=col_drop, inplace=True)
print(f"X: {X.shape}\ny: {y.shape}")

X: (18506, 16)
y: (18506,)


In [49]:
X1=preprocessor(X)
X2=preprocessor2(X)
print(f"X1: {X1.shape}\nX2: {X2.shape}")

X1: (18506, 177)
X2: (18506, 25)


In [50]:
le=LabelEncoder()
y=le.fit_transform(y)
y.shape

(18506,)

In [51]:
models= {
    "KNN": KNeighborsClassifier(n_neighbors=5, weights="uniform", algorithm="auto"),
    "SVM": SVC(gamma='auto', kernel='rbf', C= 1000),
    "GBM": LGBMClassifier(n_estimators=600, learning_rate=0.01),
    "XGB": XGBClassifier(n_estimators=600, learning_rate=0.01),
    "Forest": RandomForestClassifier(max_depth=8),
    "Tree": DecisionTreeClassifier(max_depth=8),
}


samples={
    "X1": X1,
    "X2": X2,
}

In [60]:
kfolds=StratifiedKFold(n_splits=10,shuffle=True)

def strat(modell, sample, y, kfold):
    results= {"Fold_{}".format(fold): [] for fold in range(10)}
    
    results['model_name']=modell+"_"+sample
    
    model=models[modell]
    X= samples[sample]
    
    for fold, (train_idx, val_idx) in enumerate(kfolds.split(X,y)):
        X_train, X_test=X[train_idx], X[val_idx]
        y_train, y_test=y[train_idx], y[val_idx]
    
        model.fit(X_train, y_train)
    
        y_pred=model.predict(X_test)
        
        
        F1_Score=f1_score(y_pred, y_test, average='weighted')
        results["Fold_{}".format(fold)].append(F1_Score)
        
    df= pd.DataFrame(results)
    df['mean']=df.mean(axis=1)    
    
        
    return df

In [None]:
all_models=[]

for model in models:
    for X in samples:
        all_models.append(strat(model, X, y, kfolds))
        print(f"Model: {model}_{X}......\n")

  df['mean']=df.mean(axis=1)


Model: KNN_X1......



  df['mean']=df.mean(axis=1)


Model: KNN_X2......



  df['mean']=df.mean(axis=1)


Model: SVM_X1......



  df['mean']=df.mean(axis=1)


Model: SVM_X2......



  df['mean']=df.mean(axis=1)


Model: GBM_X1......



  df['mean']=df.mean(axis=1)


Model: GBM_X2......



  df['mean']=df.mean(axis=1)


Model: XGB_X1......

