In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, log_loss, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA, NMF
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.manifold import TSNE
from umap import UMAP
from scipy.cluster.hierarchy import dendrogram, ward

import optuna
from optuna.integration import CatBoostPruningCallback


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

pd.set_option('display.max_rows', 100)

## How to define the different states of the data

In [3]:
#Variable for the imported training data after cleaning up - "train"
#Variable for the feature data set - "train_X"
#Variable for the target series- "train_y"
#Variable for the  target colum - "target"
#Variable for the imported test data set - "test"
#Variable for the transformed test data set - "test_trans"
#Variable for the training data set in the cross validation loop - "X_train"
#Variable for the validation data set in the cross validation loop - "X_val"
##Variable for the  y_train labels in the cross validation loop



In [4]:
train = pd.read_csv(r'../input/playground-series-s3e13/train.csv')
test_1 = pd.read_csv(r'../input/playground-series-s3e13/test.csv')
orig_train = pd.read_csv(r'../input/vector-borne-disease-prediction/trainn.csv')

train.drop('id', axis = 1, inplace = True)
test = test_1.drop('id', axis = 1)

target = 'prognosis'

In [41]:
 id = test_1["id"].values

In [5]:
train = pd.concat([train, orig_train])
print(f'There are {train.duplicated(subset = list(train)[0:-1]).value_counts()[0]} non-duplicate values out of {train.count()[0]} rows in original train dataset')

There are 959 non-duplicate values out of 959 rows in original train dataset


In [6]:
train.shape

(959, 65)

In [8]:
train_X = train.drop(target, axis = 1).copy()
train_y = train[target]

#Fix the issues that the original data set used "-"to separate the words
train_y = [prognosis.replace(' ', '_') for prognosis in train_y]
train_y = np.array(train_y)

In [10]:
train_X.shape, train_y.shape

((959, 64), (959,))

In [11]:
#Cross validation strategy
seed = 42
splits = 3
#cv = RepeatedStratifiedKFold(n_splits = splits, n_repeats = 5, random_state = seed)
cv = StratifiedKFold(n_splits = splits, random_state = seed, shuffle = True)

np.random.seed(seed)

In [12]:
def apk(actual, predicted, k=10):
    
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [13]:
def cross_val_pipe(model, train_X = train_X, train_y = train_y, target ='prognosis', cv = cv, label = ''):
    
    
    #creating encoder and transforming prognosis
    enc = LabelEncoder()
    train_y = enc.fit_transform(train_y)
    
    #initiate prediction arrays and score lists
    val_predictions = np.zeros((len(train_X), 11)) # Validation predictions are stored in a matrix with length of the number of trainijng samples and # of preds
    train_predictions = np.zeros((len(train_X), 11)) # Same for the train predictions
    train_logloss, val_logloss = [], [] #Store the results from the log_loss calc in a list
    train_map3, val_map3 = [], [] #Store the results from the log_loss calc in a list
    
    #training model, predicting prognosis probability, and evaluating log loss
    for fold, (train_idx, val_idx) in enumerate(cv.split(train_X, train_y)):
                
        model.fit(train_X.iloc[train_idx], train_y[train_idx])
        
        train_preds = model.predict_proba(train_X.iloc[train_idx])
        val_preds = model.predict_proba(train_X.iloc[val_idx])
                  
        train_predictions[train_idx] += train_preds
        val_predictions[val_idx] += val_preds
        
        train_score = log_loss(train_y[train_idx], train_preds)
        val_score = log_loss(train_y[val_idx], val_preds)
        
        train_logloss.append(train_score)
        val_logloss.append(val_score)
        
        #select three most probable prognosis based on train dataset prediction
        train_index = np.argsort(-train_preds)[:,:3] #return index of three most probable prognosis
        
        #select three most probable prognosis based on validation dataset prediction
        val_index = np.argsort(-val_preds)[:,:3]
    
        #calculate map@3
        train_score = mapk(train_y[train_idx].reshape(-1, 1), train_index, 3)
        val_score = mapk(train_y[val_idx].reshape(-1, 1), val_index, 3)
        print(f" The val_score for {fold} is {val_score}")
        
        train_map3.append(train_score)
        val_map3.append(val_score)
    
    print(f'Val log_loss   : {np.mean(val_logloss):.5f} | Train log_loss   : {np.mean(train_logloss):.5f} | {label}')
    print(f'Val MAP@3 Score: {np.mean(val_map3):.5f} | Train MAP@3 Score: {np.mean(train_map3):.5f} | {label}\n')
    
    return val_logloss, val_map3

In [14]:
#Classification models

#List of tuples
models = [
    ('log', LogisticRegression(random_state = seed, max_iter = 10000)),
    #('svc', SVC(random_state = seed, probability = True)),
    #('lda', LinearDiscriminantAnalysis()),
    #('qda', QuadraticDiscriminantAnalysis()),
    #('gauss', GaussianProcessClassifier(random_state = seed)),
    #('et', ExtraTreesClassifier(random_state = seed)),
    #('rf', RandomForestClassifier(random_state = seed)),
    #('xgb', XGBClassifier(random_state = seed, objective = 'multi:softprob', eval_metric = 'map@3')),
    ('lgb', LGBMClassifier(random_state = seed, objective = 'softmax', metric = 'softmax')),
    #('dart', LGBMClassifier(random_state = seed, objective = 'softmax', metric = 'softmax', boosting_type = 'dart')),
    #('cb', CatBoostClassifier(random_state = seed, objective = 'MultiClass', verbose = 0)),
    #('gb', GradientBoostingClassifier(random_state = seed)),
    #('hgb', HistGradientBoostingClassifier(random_state = seed)),
    #('ada', AdaBoostClassifier(random_state = seed)),
    #('knn', KNeighborsClassifier())
]

### Execution

In [15]:
class Decomp:
    def __init__(self, n_components, method="pca", scaler_method='standard'):
        self.n_components = n_components
        self.method = method
        self.scaler_method = scaler_method
        
    def dimension_reduction(self, df):
            
        X_reduced = self.dimension_method(df)
        df_comp = pd.DataFrame(X_reduced, columns=[f'{self.method.upper()}_{_}' for _ in range(self.n_components)], index=df.index)
        
        return df_comp
    
    def dimension_method(self, df):
        X = self.scaler(df)
        if self.method == "pca":
            comp = PCA(n_components=self.n_components, random_state=0)
            X_reduced = comp.fit_transform(X)
        elif self.method == "nmf":
            comp = NMF(n_components=self.n_components, random_state=0)
            X_reduced = comp.fit_transform(X)
        elif self.method == "umap":
            comp = UMAP(n_components=self.n_components, random_state=0)
            X_reduced = comp.fit_transform(X)
        elif self.method == "tsne":
            comp = TSNE(n_components=self.n_components, random_state=0) # Recommend n_components=2
            X_reduced = comp.fit_transform(X)
        else:
            raise ValueError(f"Invalid method name: {method}")
        
        self.comp = comp
        return X_reduced
    
    def scaler(self, df):
        
        _df = df.copy()
            
        if self.scaler_method == "standard":
            return StandardScaler().fit_transform(_df)
        elif self.scaler_method == "minmax":
            return MinMaxScaler().fit_transform(_df)
        elif self.scaler_method == None:
            return _df.values
        else:
            raise ValueError(f"Invalid scaler_method name")
        
    def get_columns(self):
        return [f'{self.method.upper()}_{_}' for _ in range(self.n_components)]
    
    def transform(self, df):
        X = self.scaler(df)
        X_reduced = self.comp.transform(X)
        df_comp = pd.DataFrame(X_reduced, columns=[f'{self.method.upper()}_{_}' for _ in range(self.n_components)], index=df.index)
        
        return df_comp
    @property
        
    def get_explained_variance_ratio(self):
        
        return np.sum(self.comp.explained_variance_ratio_)

In [16]:
def decomp_concat(df):
    global method
    decomp = Decomp(n_components=3, method=method, scaler_method=None)
    df_1 = decomp.dimension_reduction(df).reset_index(drop = True)
    print(f" the shape of df_1 is: {df_1.shape}")
    #df = df.reset_index(inplace = True)
    df = pd.merge(df, df_1, left_index=True, right_index=True)
    print(f" the shape of df_2 2 is: {df.shape}")
    #df = pd.concat([train_X, df], axis=1)
    #df = df.reset_index(inplace = True)
    return df

### Pipeline

In [17]:
def decomp_concat(df):
    global method
    decomp = Decomp(n_components=9, method=method, scaler_method=None)
    df_1 = decomp.dimension_reduction(df).reset_index(drop = True)
    print(f" the shape of df_1 is: {df_1.shape}")
    #df = df.reset_index(inplace = True)
    df = pd.merge(df, df_1, left_index=True, right_index=True)
    print(f" the shape of df_2 2 is: {df.shape}")
    #df = pd.concat([train_X, df], axis=1)
    #df = df.reset_index(inplace = True)
    return df

In [18]:
pca_func_trans = FunctionTransformer(decomp_concat)

In [None]:
pca_func_trans

In [19]:
def reset_index(dataframe):
    dataframe = dataframe.reset_index(inplace = False)
    return dataframe

get_reset_index = FunctionTransformer(reset_index, validate=False)

In [None]:
#Set up a logistic regression pipeline
#No feature engineering part )
#pca_pipe = make_pipeline(PCA(n_components=2, random_state=0), LogisticRegression(random_state = seed, max_iter = 10000))
pca_pipe = make_pipeline( get_reset_index, pca_func_trans, LogisticRegression(random_state = seed, max_iter = 10000))
umap_pipe = make_pipeline(UMAP(n_components=3, random_state=0), LogisticRegression(random_state = seed, max_iter = 10000))

In [None]:
pca_pipe

In [None]:
method = "pca"
cross_val_pipe(pca_pipe)

In [None]:
def objective(trial):
    global train_X
    global train_y
    param = {
        "loss_function": trial.suggest_categorical("loss_function", ["MultiClass"]),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),  
    }
    # Conditional Hyper-Parameters
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    
    #creating encoder and transforming prognosis
    enc = LabelEncoder()
    train_y = enc.fit_transform(train_y)
    
    train_x, valid_x, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.25)
                
    model = CatBoostClassifier(**param)
    #pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
    
    model.fit(
        train_x,
        y_train,
        eval_set=[(valid_x, y_valid)],
        verbose=0,
        early_stopping_rounds=100
    )
        
         # evoke pruning manually.
    #pruning_callback.check_pruned()
    
    val_preds = model.predict_proba(valid_x)
                  
    #select three most probable prognosis based on validation dataset prediction
    val_index = np.argsort(-val_preds)[:,:3]
    
    #calculate map@3
        
    val_score = mapk(y_valid.reshape(-1, 1), val_index, 3)
    print(f" The val_score for is {val_score}")
        
    return val_score
    

In [None]:
study = optuna.create_study( direction="maximize")
study.optimize(objective, n_trials=20, timeout=600 )

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
cb_best_parmas = study.best_params

In [None]:
with open('cb_best_parma.pickle', 'wb') as f:
    pickle.dump(cb_best_parmas, f)

In [26]:
with open("/kaggle/input/cb-best-parmapickle/cb_best_parma.pickle", 'rb') as file:
# Call load method to deserialze
    cb_best_parmas = pickle.load(file)

In [27]:
cb_best_parmas

{'loss_function': 'MultiClass',
 'learning_rate': 0.021945278841586587,
 'l2_leaf_reg': 0.34063903038627596,
 'colsample_bylevel': 0.037509732185800414,
 'depth': 8,
 'boosting_type': 'Ordered',
 'bootstrap_type': 'MVS',
 'min_data_in_leaf': 16,
 'one_hot_max_size': 17}

#Load the best params

  
# Open the file in binary mode
with open('file.pkl', 'rb') as file:
      
    # Call load method to deserialze
    myvar = pickle.load(file)

In [28]:
best_fit_cb = CatBoostClassifier(**cb_best_parmas)

In [29]:
enc = LabelEncoder()
train_y = enc.fit_transform(train_y)
train_y

array([ 3,  7,  3, 10,  6,  3,  8,  7,  4,  0,  5, 10,  1,  4,  0,  4, 10,
        9,  7,  3,  0,  0,  8, 10,  3,  2,  9,  5,  6,  0,  2,  7,  8,  7,
       10,  0,  7,  6,  2,  8, 10, 10,  1,  1, 10,  0,  9,  0,  3,  9,  7,
        0,  2,  8,  3,  2,  5,  1,  4, 10,  9,  9,  8,  8,  9,  3,  4,  8,
        7,  7,  7, 10,  6,  5,  2,  1, 10,  8,  2,  3,  9,  8,  7,  7,  2,
        5, 10,  5,  6,  5,  0,  8,  3,  0,  2,  7,  1,  2,  1,  1,  7, 10,
        5,  3,  4,  8,  9,  9,  2,  5,  0,  2, 10, 10,  7,  4,  2,  3,  0,
        4,  5,  6,  2,  1,  1,  8,  8,  5,  3,  1,  8,  4,  4,  1,  5,  2,
        8,  2,  0,  0,  4,  1, 10,  1,  6,  5,  1,  8,  8,  8,  7,  1,  6,
        4,  0,  8,  2,  4,  5,  0,  6,  8,  2,  1,  3,  8,  6,  0, 10,  8,
        0,  8,  2,  5,  9,  8,  4,  5,  9,  2,  6,  9,  6,  1,  3,  3,  7,
        6,  0,  1,  8,  2,  3,  0,  2,  0,  4,  6,  7,  4,  2,  7,  2,  9,
        8,  2,  6,  1,  2,  2, 10,  3,  5,  9,  6,  7,  7,  2,  3,  2,  2,
        1,  8,  5,  8,  9

In [30]:
best_fit_cb.fit(train_X, train_y)

0:	learn: 2.3680601	total: 430ms	remaining: 7m 10s
1:	learn: 2.3406358	total: 825ms	remaining: 6m 51s
2:	learn: 2.3224554	total: 886ms	remaining: 4m 54s
3:	learn: 2.2942091	total: 1.1s	remaining: 4m 34s
4:	learn: 2.2797152	total: 1.13s	remaining: 3m 45s
5:	learn: 2.2563050	total: 1.22s	remaining: 3m 22s
6:	learn: 2.2421602	total: 1.25s	remaining: 2m 57s
7:	learn: 2.2212003	total: 1.44s	remaining: 2m 58s
8:	learn: 2.1921109	total: 1.71s	remaining: 3m 8s
9:	learn: 2.1742296	total: 1.9s	remaining: 3m 8s
10:	learn: 2.1716663	total: 1.91s	remaining: 2m 51s
11:	learn: 2.1460113	total: 2.15s	remaining: 2m 57s
12:	learn: 2.1419941	total: 2.16s	remaining: 2m 44s
13:	learn: 2.1379787	total: 2.17s	remaining: 2m 32s
14:	learn: 2.1094590	total: 2.36s	remaining: 2m 35s
15:	learn: 2.0955731	total: 2.39s	remaining: 2m 27s
16:	learn: 2.0681498	total: 2.6s	remaining: 2m 30s
17:	learn: 2.0467010	total: 2.71s	remaining: 2m 27s
18:	learn: 2.0212853	total: 2.91s	remaining: 2m 30s
19:	learn: 2.0154577	total:

<catboost.core.CatBoostClassifier at 0x7111b4c5b390>

In [31]:
test_pred = best_fit_cb.predict_proba(test)

In [32]:
# Get the sorted indices of predictions and take the top 3
test_sorted_prediction_ids = np.argsort(-test_pred, axis=1)
test_top_3_prediction_ids = test_sorted_prediction_ids[:,:3]

In [33]:
original_shape = test_top_3_prediction_ids.shape
original_shape

(303, 3)

In [34]:
test_top_3_predictions = enc.inverse_transform(test_top_3_prediction_ids.reshape(-1, 1))
test_top_3_predictions.shape
test_top_3_predictions = test_top_3_predictions.reshape(original_shape)
test_top_3_predictions.shape

(303, 3)

In [35]:
test_top_3_predictions[0:2]

array([['Tungiasis', 'Rift_Valley_fever', 'Japanese_encephalitis'],
       ['Dengue', 'Plague', 'Chikungunya']], dtype='<U21')

In [36]:
test['prognosis'] = np.apply_along_axis(lambda x: np.array(' '.join(x), dtype="object"), 1, test_top_3_predictions)


In [44]:
test['prognosis'].shape


(303,)

In [48]:
id

array([ 707,  708,  709,  710,  711,  712,  713,  714,  715,  716,  717,
        718,  719,  720,  721,  722,  723,  724,  725,  726,  727,  728,
        729,  730,  731,  732,  733,  734,  735,  736,  737,  738,  739,
        740,  741,  742,  743,  744,  745,  746,  747,  748,  749,  750,
        751,  752,  753,  754,  755,  756,  757,  758,  759,  760,  761,
        762,  763,  764,  765,  766,  767,  768,  769,  770,  771,  772,
        773,  774,  775,  776,  777,  778,  779,  780,  781,  782,  783,
        784,  785,  786,  787,  788,  789,  790,  791,  792,  793,  794,
        795,  796,  797,  798,  799,  800,  801,  802,  803,  804,  805,
        806,  807,  808,  809,  810,  811,  812,  813,  814,  815,  816,
        817,  818,  819,  820,  821,  822,  823,  824,  825,  826,  827,
        828,  829,  830,  831,  832,  833,  834,  835,  836,  837,  838,
        839,  840,  841,  842,  843,  844,  845,  846,  847,  848,  849,
        850,  851,  852,  853,  854,  855,  856,  8

In [54]:
gh = np.column_stack((test['prognosis'], id))
sub = pd.DataFrame(gh)

In [56]:
sub.columns = [ 'prognosis', 'id']

In [57]:
sub


Unnamed: 0,prognosis,id
0,Tungiasis Rift_Valley_fever Japanese_encephalitis,707
1,Dengue Plague Chikungunya,708
2,West_Nile_fever Japanese_encephalitis Zika,709
3,Japanese_encephalitis Rift_Valley_fever Tungiasis,710
4,West_Nile_fever Malaria Zika,711
...,...,...
298,Zika Yellow_Fever West_Nile_fever,1005
299,Malaria Lyme_disease Plague,1006
300,Plague Lyme_disease West_Nile_fever,1007
301,Rift_Valley_fever West_Nile_fever Japanese_enc...,1008


In [59]:
sub.to_csv('submission.csv', columns=['id', 'prognosis'], index=False)