In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from IPython.display import clear_output
import time
import catboost
import re
import optuna
import json
import sys
sys.path.append('../..')
import main

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

col_names = []
with open('../data/Faults27x7_var','r') as f:
    for line in f:
        col_names.append(line.strip())
        
train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
org_data = pd.read_csv(r'../data/Faults.NNA', delimiter='\\s+', names=col_names)

X = train_org.drop(['id','Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps','Other_Faults'], axis=1)
pastry = train_org['Pastry'].copy()
z_scratch = train_org['Z_Scratch'].copy()
k_scatch = train_org['K_Scatch'].copy()
stains = train_org['Stains'].copy()
dirtiness = train_org['Dirtiness'].copy()
bumps = train_org['Bumps'].copy()
other_faults = train_org['Other_Faults'].copy()

ys = [pastry, z_scratch, k_scatch, stains, dirtiness, bumps, other_faults]
y_names = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

class KMeansTransformer(BaseEstimator,TransformerMixin):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)
        
    def fit(self,X, y=None):
        self.kmeans.fit(X)
        return self
        
    def transform(self,X):
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]
    
class PCA_Transformer(BaseEstimator,TransformerMixin):
    def __init__(self, n_components):
        self.n_components = n_components
        self.pca = PCA(n_components=self.n_components, random_state=0)
        
    def fit(self, X, y=None):
        self.pca.fit(X)
        return self
    
    def transform(self,X):
        cols = self.pca.transform(X)
        return np.c_[X, cols]

# n_clusters(4,5,7)

In [2]:
y_params_dict = {}
for y_nm in y_names:
    y_params_dict[y_nm] = {}
    for i in [4,5,7]:
        if i ==1:
            continue
        y_params_dict[y_nm][i] = None

In [3]:
y_params_dict

{'pastry': {4: None, 5: None, 7: None},
 'z_scratch': {4: None, 5: None, 7: None},
 'k_scatch': {4: None, 5: None, 7: None},
 'stains': {4: None, 5: None, 7: None},
 'dirtiness': {4: None, 5: None, 7: None},
 'bumps': {4: None, 5: None, 7: None},
 'other_faults': {4: None, 5: None, 7: None}}

In [4]:
y_scores = {}
for yns in y_names:
    y_scores[yns] = {}
    for n in [4,5,7]:
        y_scores[yns][n] = None

In [5]:
cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for y,yn in zip(ys,y_names):
    for n,i in enumerate([4,5,7]):
        
        print(f'trial: {yn}(n_cluster={i})')
        
        def objective_xgb(trial):
    
            cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
            params = dict(
                n_estimators = trial.suggest_int('n_estimators',100,500),
                max_depth = trial.suggest_int('max_depth',2,64),
                max_leaves = trial.suggest_int('max_leaves',2,128),
                learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
                gamma = trial.suggest_float('gamma',0.001,10),
                min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
                subsample = trial.suggest_float('subsample', 0.33,0.85),
                colsample_bytree = trial.suggest_float('colsample_bytree',0.33,0.7),
                reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
                reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
            )
            
            xgbc = xgb.XGBClassifier(random_state= 0, objective='binary:logistic', **params)
            
            pipe = Pipeline(
                steps = [
                    ('scaler', StandardScaler()),
                    ('kmeans', KMeansTransformer(n_clusters=i)),
                    ('model', xgbc)
                ]
            )
            
            score = np.mean(cross_val_score(pipe, X,y, scoring='roc_auc', cv= cvo))
            return score
        
        study_lgb = optuna.create_study(direction='maximize')
        
        study_lgb.optimize(objective_xgb, n_trials=1000, n_jobs=-1, show_progress_bar=True)
        
        best_params = study_lgb.best_params
        y_params_dict[yn][i] = best_params
        
        best_score = study_lgb.best_value
        y_scores[yn][i] = best_score
        
        time.sleep(5)
        clear_output()
        
description = 'xgbc experiment with varying n_clusters(4,5,7) along with hpt'
main.saver(y_params_dict,y_scores,description)

In [7]:
with open('../../artifacts/10_Mar_17_05_48.json','r') as file:
    res_dict = json.load(file)

In [10]:
tot_res = 0
for key in res_dict.keys():
    result = 0
    n_c = 0
    for sub_key in res_dict[key]:
        if res_dict[key][sub_key] > result:
            result = res_dict[key][sub_key]
            n_c = int(sub_key)
            
    print(f'{key}, n_cluster:{n_c}, score: {result}')
    tot_res += result
    
print(f'\nfinal_score: {tot_res/7}')      

pastry, n_cluster:4, score: 0.87316342526509
z_scratch, n_cluster:5, score: 0.9615976308667188
k_scatch, n_cluster:7, score: 0.9861330613828573
stains, n_cluster:4, score: 0.9929431745506421
dirtiness, n_cluster:5, score: 0.8965788929674312
bumps, n_cluster:7, score: 0.8117126719857447
other_faults, n_cluster:5, score: 0.7101061980300873

final_score: 0.8903192935783674


In [17]:
pastry_params = y_params_dict['pastry'][4]

pastry_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=4)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**pastry_params))
    ]
)
pastry_m.fit(X,pastry)
pastry_pred = pastry_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [18]:
z_scratch_params = y_params_dict['z_scratch'][5]

z_scratch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic', **z_scratch_params))
    ]
)
z_scratch_m.fit(X,z_scratch)
z_scratch_pred = z_scratch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [19]:
k_scatch_params = y_params_dict['k_scatch'][7]

k_scatch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=7)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**k_scatch_params))
    ]
)
k_scatch_m.fit(X,k_scatch)
k_scatch_pred = k_scatch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [20]:
stains_params = y_params_dict['stains'][4]

stains_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=4)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**stains_params))
    ]
)
stains_m.fit(X,stains)
stains_pred = stains_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [21]:
dirtiness_params = y_params_dict['dirtiness'][5]

dirtiness_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**dirtiness_params))
    ]
)
dirtiness_m.fit(X,dirtiness)
dirtiness_pred = dirtiness_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [22]:
bumps_params = y_params_dict['bumps'][7]

bumps_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=7)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**bumps_params))
    ]
)
bumps_m.fit(X,bumps)
bumps_pred = bumps_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [23]:
other_faults_params = y_params_dict['other_faults'][5]

other_faults_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**other_faults_params))
    ]
)
other_faults_m.fit(X,other_faults)
other_faults_pred = other_faults_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [24]:
sub = pd.DataFrame({'id':test_org['id'].copy(), 'Pastry':pastry_pred, 'Z_Scratch':z_scratch_pred, 'K_Scatch':k_scatch_pred,
                    'Stains':stains_pred, 'Dirtiness':dirtiness_pred, 'Bumps':bumps_pred, 'Other_Faults':other_faults_pred})

In [25]:
sub.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.505621,0.000767,0.005444,0.000113,0.019555,0.167881,0.346796
1,19220,0.250375,0.022483,0.006204,0.000227,0.10361,0.17973,0.322578
2,19221,0.001104,0.049589,0.057161,0.000598,0.005785,0.302705,0.467896
3,19222,0.159196,0.001625,0.000683,0.001228,0.011518,0.317744,0.441201
4,19223,0.002299,0.001828,0.001238,0.008662,0.009405,0.62608,0.384349


In [26]:
# 0.505621+0.000767+0.005444+0.000113+0.019555+0.167881+0.346796

1.0461770000000001

In [27]:
sub.to_csv('../submissions/m10.csv', index=False)

# n_clusters(8)

In [32]:
y_params_dict = {}
for y_nm in y_names:
    y_params_dict[y_nm] = {}
    for i in [8]:
        if i ==1:
            continue
        y_params_dict[y_nm][i] = None

In [33]:
y_params_dict

{'pastry': {8: None},
 'z_scratch': {8: None},
 'k_scatch': {8: None},
 'stains': {8: None},
 'dirtiness': {8: None},
 'bumps': {8: None},
 'other_faults': {8: None}}

In [34]:
y_scores = {}
for yns in y_names:
    y_scores[yns] = {}
    for n in [8]:
        y_scores[yns][n] = None

In [35]:
cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for y,yn in zip(ys,y_names):
    for i in [8]:
        
        print(f'trial: {yn}(n_cluster={i})')
        
        def objective_xgb(trial):
    
            cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
            params = dict(
                n_estimators = trial.suggest_int('n_estimators',100,500),
                max_depth = trial.suggest_int('max_depth',2,64),
                max_leaves = trial.suggest_int('max_leaves',2,128),
                learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
                gamma = trial.suggest_float('gamma',0.001,10),
                min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
                subsample = trial.suggest_float('subsample', 0.33,0.85),
                colsample_bytree = trial.suggest_float('colsample_bytree',0.33,0.7),
                reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
                reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
            )
            
            xgbc = xgb.XGBClassifier(random_state= 0, objective='binary:logistic', **params)
            
            pipe = Pipeline(
                steps = [
                    ('scaler', StandardScaler()),
                    ('kmeans', KMeansTransformer(n_clusters=i)),
                    ('model', xgbc)
                ]
            )
            
            score = np.mean(cross_val_score(pipe, X,y, scoring='roc_auc', cv= cvo))
            return score
        
        study_lgb = optuna.create_study(direction='maximize')
        
        study_lgb.optimize(objective_xgb, n_trials=1000, n_jobs=-1, show_progress_bar=True)
        
        best_params = study_lgb.best_params
        y_params_dict[yn][i] = best_params
        
        best_score = study_lgb.best_value
        y_scores[yn][i] = best_score
        
        time.sleep(5)
        clear_output()
        
description = 'xgbc experiment with varying n_clusters(8) along with hpt'
main.saver(y_params_dict,y_scores,description)

In [39]:
with open('../../artifacts/10_Mar_17_05_44.json','r') as file:
    par_dict = json.load(file)

In [43]:
par_dict['pastry']['4']

{'n_estimators': 307,
 'max_depth': 63,
 'max_leaves': 25,
 'learning_rate': 0.03157661441708786,
 'gamma': 1.0399818955502331,
 'min_child_weight': 0.7267824719999009,
 'subsample': 0.7429283205677135,
 'colsample_bytree': 0.3483354941019946,
 'reg_alpha': 0.05378652837786252,
 'reg_lambda': 0.025436887505121603}

In [44]:
pastry_params = par_dict['pastry']['4']

pastry_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=4)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**pastry_params))
    ]
)
pastry_m.fit(X,pastry)
pastry_pred = pastry_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [45]:
z_scratch_params = par_dict['z_scratch']['5']

z_scratch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic', **z_scratch_params))
    ]
)
z_scratch_m.fit(X,z_scratch)
z_scratch_pred = z_scratch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [46]:
k_scatch_params = y_params_dict['k_scatch'][8]

k_scatch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=7)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**k_scatch_params))
    ]
)
k_scatch_m.fit(X,k_scatch)
k_scatch_pred = k_scatch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [47]:
stains_params = par_dict['stains']['4']

stains_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=4)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**stains_params))
    ]
)
stains_m.fit(X,stains)
stains_pred = stains_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [48]:
dirtiness_params = par_dict['dirtiness']['5']

dirtiness_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**dirtiness_params))
    ]
)
dirtiness_m.fit(X,dirtiness)
dirtiness_pred = dirtiness_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [49]:
bumps_params = par_dict['bumps']['7']

bumps_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=7)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**bumps_params))
    ]
)
bumps_m.fit(X,bumps)
bumps_pred = bumps_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [50]:
other_faults_params = par_dict['other_faults']['5']

other_faults_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**other_faults_params))
    ]
)
other_faults_m.fit(X,other_faults)
other_faults_pred = other_faults_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [51]:
sub = pd.DataFrame({'id':test_org['id'].copy(), 'Pastry':pastry_pred, 'Z_Scratch':z_scratch_pred, 'K_Scatch':k_scatch_pred,
                    'Stains':stains_pred, 'Dirtiness':dirtiness_pred, 'Bumps':bumps_pred, 'Other_Faults':other_faults_pred})

In [52]:
sub.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.505621,0.000767,0.005972,0.000113,0.019555,0.167881,0.346796
1,19220,0.250375,0.022483,0.00636,0.000227,0.10361,0.17973,0.322578
2,19221,0.001104,0.049589,0.040669,0.000598,0.005785,0.302705,0.467896
3,19222,0.159196,0.001625,0.000624,0.001228,0.011518,0.317744,0.441201
4,19223,0.002299,0.001828,0.001015,0.008662,0.009405,0.62608,0.384349


In [None]:
# 0.505621+0.000767+0.005444+0.000113+0.019555+0.167881+0.346796

1.0461770000000001

In [53]:
sub.to_csv('../submissions/m10_1.csv', index=False)