In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from IPython.display import clear_output
import time
import catboost
import re
import optuna
import json
import sys
sys.path.append('../..')
import main

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

col_names = []
with open('../data/Faults27x7_var','r') as f:
    for line in f:
        col_names.append(line.strip())
        
train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
org_data = pd.read_csv(r'../data/Faults.NNA', delimiter='\\s+', names=col_names)

X = train_org.drop(['id','Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps','Other_Faults'], axis=1)
pastry = train_org['Pastry'].copy()
z_scratch = train_org['Z_Scratch'].copy()
k_scatch = train_org['K_Scatch'].copy()
stains = train_org['Stains'].copy()
dirtiness = train_org['Dirtiness'].copy()
bumps = train_org['Bumps'].copy()
other_faults = train_org['Other_Faults'].copy()

ys = [pastry, z_scratch, k_scatch, stains, dirtiness, bumps, other_faults]
y_names = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

class KMeansTransformer(BaseEstimator,TransformerMixin):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)
        
    def fit(self,X, y=None):
        self.kmeans.fit(X)
        return self
        
    def transform(self,X):
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]
    
class PCA_Transformer(BaseEstimator,TransformerMixin):
    def __init__(self, n_components):
        self.n_components = n_components
        self.pca = PCA(n_components=self.n_components, random_state=0)
        
    def fit(self, X, y=None):
        self.pca.fit(X)
        return self
    
    def transform(self,X):
        cols = self.pca.transform(X)
        return np.c_[X, cols]

# n_clusters(2,3)

In [2]:
y_params_dict = {}
for y_nm in y_names:
    y_params_dict[y_nm] = {}
    for i in [2,3]:
        if i ==1:
            continue
        y_params_dict[y_nm][i] = None

In [3]:
y_params_dict

{'pastry': {2: None, 3: None},
 'z_scratch': {2: None, 3: None},
 'k_scatch': {2: None, 3: None},
 'stains': {2: None, 3: None},
 'dirtiness': {2: None, 3: None},
 'bumps': {2: None, 3: None},
 'other_faults': {2: None, 3: None}}

In [4]:
y_scores = {}
for yns in y_names:
    y_scores[yns] = {}
    for n in [2,3]:
        y_scores[yns][n] = None

In [5]:
cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for y,yn in zip(ys,y_names):
    for n,i in enumerate([2,3]):
        
        print(f'trial: {yn}(n_cluster={i})')
        
        def objective_xgb(trial):
    
            cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
            params = dict(
                n_estimators = trial.suggest_int('n_estimators',100,500),
                max_depth = trial.suggest_int('max_depth',2,64),
                max_leaves = trial.suggest_int('max_leaves',2,128),
                learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
                gamma = trial.suggest_float('gamma',0.001,10),
                min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
                subsample = trial.suggest_float('subsample', 0.33,0.85),
                colsample_bytree = trial.suggest_float('colsample_bytree',0.33,0.7),
                reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
                reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
            )
            
            xgbc = xgb.XGBClassifier(random_state= 0, objective='binary:logistic', **params)
            
            pipe = Pipeline(
                steps = [
                    ('scaler', StandardScaler()),
                    ('kmeans', KMeansTransformer(n_clusters=i)),
                    ('model', xgbc)
                ]
            )
            
            score = np.mean(cross_val_score(pipe, X,y, scoring='roc_auc', cv= cvo))
            return score
        
        study_lgb = optuna.create_study(direction='maximize')
        
        study_lgb.optimize(objective_xgb, n_trials=1000, n_jobs=-1, show_progress_bar=True)
        
        best_params = study_lgb.best_params
        y_params_dict[yn][i] = best_params
        
        best_score = study_lgb.best_value
        y_scores[yn][i] = best_score
        
        time.sleep(5)
        clear_output()
        
description = 'xgbc experiment with varying n_clusters(2,3) along with hpt'
main.saver(y_params_dict,y_scores,description)

In [22]:
with open('../../artifacts/10_Mar_17_05_48.json','r') as file:
    res_dict_1 = json.load(file)
    
with open('../../artifacts/10_Mar_21_53_08.json','r') as file:
    res_dict_2 = json.load(file)
    
with open('../../artifacts/11_Mar_14_11_47.json','r') as file:
    res_dict_3 = json.load(file)

In [23]:
for key in res_dict_1.keys():
    for sub_key,value in res_dict_3[key].items():
        res_dict_1[key][sub_key] = value
        
for key in res_dict_1.keys():
    for sub_key,value in res_dict_2[key].items():
        res_dict_1[key][sub_key] = value

In [24]:
res_dict_1

{'pastry': {'4': 0.87316342526509,
  '5': 0.8723859380558331,
  '7': 0.8718382015595232,
  '2': 0.872368673985698,
  '3': 0.8720512972515024,
  '8': 0.8718805289258797},
 'z_scratch': {'4': 0.9613890153078358,
  '5': 0.9615976308667188,
  '7': 0.9612414577306605,
  '2': 0.9614434244435696,
  '3': 0.9612456383409917,
  '8': 0.9610154842078904},
 'k_scatch': {'4': 0.9860858535887431,
  '5': 0.9860992844372543,
  '7': 0.9861330613828573,
  '2': 0.9861501596835595,
  '3': 0.9860706017688351,
  '8': 0.9861393751670559},
 'stains': {'4': 0.9929431745506421,
  '5': 0.9928315474707956,
  '7': 0.992771472558994,
  '2': 0.9927847507110557,
  '3': 0.992721780865689,
  '8': 0.9927694558697684},
 'dirtiness': {'4': 0.896228905082517,
  '5': 0.8965788929674312,
  '7': 0.8962361142448083,
  '2': 0.8966721070524626,
  '3': 0.8973262934703824,
  '8': 0.8963041608221399},
 'bumps': {'4': 0.8112450626879427,
  '5': 0.8113776963214827,
  '7': 0.8117126719857447,
  '2': 0.8115072931400666,
  '3': 0.8117351

In [27]:
fres = 0
for key in res_dict_1.keys():
    res = 0
    n_c = 0
    for sub_key in res_dict_1[key].keys():
        if res_dict_1[key][sub_key] > res:
            res = res_dict_1[key][sub_key]
            n_c = int(sub_key)
    
    fres += res        
    print(f'{key}, n_cluster:{n_c}, score: {res}')
print(f'\nfinal_score: {fres/7}')

pastry, n_cluster:4, score: 0.87316342526509
z_scratch, n_cluster:5, score: 0.9615976308667188
k_scatch, n_cluster:2, score: 0.9861501596835595
stains, n_cluster:4, score: 0.9929431745506421
dirtiness, n_cluster:3, score: 0.8973262934703824
bumps, n_cluster:3, score: 0.8117351598399493
other_faults, n_cluster:3, score: 0.7103388093664101

final_score: 0.890464950434679


In [31]:
with open('../../artifacts/10_Mar_17_05_44.json','r') as file:
    prms_dict_1 = json.load(file)
    
with open('../../artifacts/10_Mar_21_53_04.json','r') as file:
    prms_dict_2 = json.load(file)
    
with open('../../artifacts/11_Mar_14_11_43.json','r') as file:
    prms_dict_3 = json.load(file)

In [32]:
for key in prms_dict_1.keys():
    for sub_key,value in prms_dict_3[key].items():
        prms_dict_1[key][sub_key] = value
        
for key in prms_dict_1.keys():
    for sub_key,value in prms_dict_2[key].items():
        prms_dict_1[key][sub_key] = value

In [33]:
pastry_params = prms_dict_1['pastry']['4']

pastry_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=4)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**pastry_params))
    ]
)
pastry_m.fit(X,pastry)
pastry_pred = pastry_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [35]:
z_scratch_params = prms_dict_1['z_scratch']['5']

z_scratch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic', **z_scratch_params))
    ]
)
z_scratch_m.fit(X,z_scratch)
z_scratch_pred = z_scratch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [36]:
k_scatch_params = prms_dict_1['k_scatch']['2']

k_scatch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=7)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**k_scatch_params))
    ]
)
k_scatch_m.fit(X,k_scatch)
k_scatch_pred = k_scatch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [38]:
stains_params = prms_dict_1['stains']['4']

stains_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=4)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**stains_params))
    ]
)
stains_m.fit(X,stains)
stains_pred = stains_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [39]:
dirtiness_params = prms_dict_1['dirtiness']['3']

dirtiness_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**dirtiness_params))
    ]
)
dirtiness_m.fit(X,dirtiness)
dirtiness_pred = dirtiness_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [40]:
bumps_params = prms_dict_1['bumps']['3']

bumps_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=7)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**bumps_params))
    ]
)
bumps_m.fit(X,bumps)
bumps_pred = bumps_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [41]:
other_faults_params = prms_dict_1['other_faults']['3']

other_faults_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans',KMeansTransformer(n_clusters=5)),
        ('model', xgb.XGBClassifier(random_state= 0, objective='binary:logistic',**other_faults_params))
    ]
)
other_faults_m.fit(X,other_faults)
other_faults_pred = other_faults_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [42]:
sub = pd.DataFrame({'id':test_org['id'].copy(), 'Pastry':pastry_pred, 'Z_Scratch':z_scratch_pred, 'K_Scatch':k_scatch_pred,
                    'Stains':stains_pred, 'Dirtiness':dirtiness_pred, 'Bumps':bumps_pred, 'Other_Faults':other_faults_pred})

In [43]:
sub.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.505621,0.000767,0.005441,0.000113,0.014624,0.158008,0.322089
1,19220,0.250375,0.022483,0.006838,0.000227,0.136549,0.143548,0.32928
2,19221,0.001104,0.049589,0.041254,0.000598,0.004451,0.28264,0.481709
3,19222,0.159196,0.001625,0.000737,0.001228,0.006225,0.360755,0.431707
4,19223,0.002299,0.001828,0.00117,0.008662,0.006621,0.580453,0.389343


In [26]:
# 0.505621+0.000767+0.005444+0.000113+0.019555+0.167881+0.346796

1.0461770000000001

In [44]:
sub.to_csv('../submissions/m11.csv', index=False)

# n_clusters(6)

In [45]:
y_params_dict = {}
for y_nm in y_names:
    y_params_dict[y_nm] = {}
    for i in [6]:
        if i ==1:
            continue
        y_params_dict[y_nm][i] = None

In [46]:
y_params_dict

{'pastry': {6: None},
 'z_scratch': {6: None},
 'k_scatch': {6: None},
 'stains': {6: None},
 'dirtiness': {6: None},
 'bumps': {6: None},
 'other_faults': {6: None}}

In [47]:
y_scores = {}
for yns in y_names:
    y_scores[yns] = {}
    for n in [6]:
        y_scores[yns][n] = None

In [48]:
cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for y,yn in zip(ys,y_names):
    for i in [6]:
        
        print(f'trial: {yn}(n_cluster={i})')
        
        def objective_xgb(trial):
    
            cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
            params = dict(
                n_estimators = trial.suggest_int('n_estimators',100,500),
                max_depth = trial.suggest_int('max_depth',2,64),
                max_leaves = trial.suggest_int('max_leaves',2,128),
                learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
                gamma = trial.suggest_float('gamma',0.001,10),
                min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
                subsample = trial.suggest_float('subsample', 0.33,0.85),
                colsample_bytree = trial.suggest_float('colsample_bytree',0.33,0.7),
                reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
                reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
            )
            
            xgbc = xgb.XGBClassifier(random_state= 0, objective='binary:logistic', **params)
            
            pipe = Pipeline(
                steps = [
                    ('scaler', StandardScaler()),
                    ('kmeans', KMeansTransformer(n_clusters=i)),
                    ('model', xgbc)
                ]
            )
            
            score = np.mean(cross_val_score(pipe, X,y, scoring='roc_auc', cv= cvo))
            return score
        
        study_lgb = optuna.create_study(direction='maximize')
        
        study_lgb.optimize(objective_xgb, n_trials=1000, n_jobs=-1, show_progress_bar=True)
        
        best_params = study_lgb.best_params
        y_params_dict[yn][i] = best_params
        
        best_score = study_lgb.best_value
        y_scores[yn][i] = best_score
        
        time.sleep(5)
        clear_output()
        
description = 'xgbc experiment with varying n_clusters(6) along with hpt'
main.saver(y_params_dict,y_scores,description)

In [39]:
with open('../../artifacts/10_Mar_17_05_44.json','r') as file:
    par_dict = json.load(file)

In [49]:
with open('../../artifacts/10_Mar_17_05_44.json','r') as file:
    prms_dict_1 = json.load(file)
    
with open('../../artifacts/10_Mar_21_53_04.json','r') as file:
    prms_dict_2 = json.load(file)
    
with open('../../artifacts/11_Mar_14_11_43.json','r') as file:
    prms_dict_3 = json.load(file)
    
with open('../../artifacts/11_Mar_20_57_46.json','r') as file:
    prms_dict_4 = json.load(file)

In [50]:
for key in prms_dict_1.keys():
    for sub_key,value in prms_dict_3[key].items():
        prms_dict_1[key][sub_key] = value
        
for key in prms_dict_1.keys():
    for sub_key,value in prms_dict_2[key].items():
        prms_dict_1[key][sub_key] = value
        
for key in prms_dict_1.keys():
    for sub_key,value in prms_dict_4[key].items():
        prms_dict_1[key][sub_key] = value

In [52]:
with open('../../artifacts/n_clusters_params.json','w') as f:
    json.dump(prms_dict_1,f)

In [53]:
with open('../../artifacts/10_Mar_17_05_48.json','r') as file:
    res_dict_1 = json.load(file)
    
with open('../../artifacts/10_Mar_21_53_08.json','r') as file:
    res_dict_2 = json.load(file)
    
with open('../../artifacts/11_Mar_14_11_47.json','r') as file:
    res_dict_3 = json.load(file)
    
with open('../../artifacts/11_Mar_20_57_50.json','r') as file:
    res_dict_4 = json.load(file)

In [54]:
for key in res_dict_1.keys():
    for sub_key,value in res_dict_3[key].items():
        res_dict_1[key][sub_key] = value
        
for key in res_dict_1.keys():
    for sub_key,value in res_dict_2[key].items():
        res_dict_1[key][sub_key] = value
        
for key in res_dict_1.keys():
    for sub_key,value in res_dict_4[key].items():
        res_dict_1[key][sub_key] = value

In [56]:
with open('../../artifacts/n_clusters_results.json','w') as f:
    json.dump(res_dict_1,f)