# TPS_May_2022_EDA_DTC->RFC->LGB->XGB

In [None]:
import numpy as np 
import pandas as pd 

# PLOT
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import OrderedDict

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score, roc_curve, auc

from xgboost import XGBClassifier
import xgboost as xgb

# Read file
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
train = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/test.csv')

# EDA 📊

In [None]:
train.head().T

In [None]:
def check(df):
    """Summarise every column of ``df`` in a one-row-per-feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``feature`` (name),
        ``dtype``, ``nan`` (number of nulls), ``count`` (number of non-nulls),
        ``nunique`` (number of distinct values) and ``unique`` (array of the
        distinct values). A frame without columns gives an empty table that
        still carries these six headers.
    """
    summary_columns = ['feature', 'dtype', 'nan', 'count', 'nunique', 'unique']
    rows = [
        (col,
         df[col].dtype,
         df[col].isnull().sum(),
         df[col].count(),
         df[col].nunique(),
         df[col].unique())
        for col in df.columns.values
    ]
    # Naming the columns in the constructor (instead of assigning `.columns`
    # afterwards) keeps the schema valid even when `rows` is empty.
    return pd.DataFrame(rows, columns=summary_columns)

In [None]:
check(train)

In [None]:
def color_negative_red(val):
    """Return a CSS ``color`` declaration: red for values below zero, black otherwise."""
    shade = 'black'
    if val < 0:
        shade = 'red'
    return 'color: %s' % shade

In [None]:
# Light-to-green colormap used as the background gradient of the summary tables.
cm = sns.light_palette('green', as_cmap = True)
# Summary statistics of every numeric column except `id`, one feature per row,
# with a green background gradient and negative numbers printed in red.
# NOTE(review): Styler.applymap is deprecated in recent pandas releases in
# favour of Styler.map — confirm against the installed pandas version.
train.drop('id', axis = 1).describe().T.style.background_gradient(cmap = cm).applymap(color_negative_red)

++++++++++++++++++++++++++++++++++

In [None]:
test.head().T

In [None]:
check(test)

In [None]:
test.drop('id', axis = 1).describe().T.style.background_gradient(cmap = cm).applymap(color_negative_red)

* train count is 900000. no null.
* test count is 700000. no null. The test ids continue on from where the train ids end.
* 
* f_00 to f_06 range is about -5 to 14 (float64)
* f_07 to f_18 range is about 0 to 16 (int64)
* f_19 to f_26 range is about -12 to 12.5(float64)
* f_28 range is about -1230 to 1230(float64)
* f_27 is object. 
> *  Train's unique counts is 741354. train's most common is BBBBBBCJBC(12). string length is 10.
> *  Test's unique counts is 598482. test's most common is BBBBBABGCC(9).  string length is 10.
* f_29 range is 0 to 1(int64)
* f_30 range  0 to 2(int64)

+++++++++++++++++++++++++++++++++

In [None]:
target_count = train['target'].value_counts()
target_count

In [None]:
train['target'].describe()

In [None]:
# Seaborn theme: "talk" context, "dark" style, "spring_r" palette.
sns.set(context='talk', style='dark', palette='spring_r')

fig, axs = plt.subplots(ncols=2)

# Left panel: number of rows per target class.
sns.countplot(x='target', data=train, ax=axs[0])

# Right panel: share of each class. The labels come from the index of
# `target_count`, because value_counts() orders classes by frequency rather
# than by class value; hard-coded labels can end up on the wrong wedge.
labels = [str(cls) for cls in target_count.index]
# Draw on the second axes explicitly instead of relying on the current axes.
axs[1].pie(target_count, labels=labels, autopct='%.0f%%')

plt.show()

The two target classes have almost the same counts.

+++++++++++++++++++++++++++++++++++
# Consideration of the 'f_27'👀


In [None]:
train['f_27'].value_counts()

In [None]:
test['f_27'].value_counts()

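* That is a lot of distinct values, isn't it?
* So... I came up with an idea 💡 from a past TPS (Tabular Playground Series, Feb 2022): the DNA's BOS (Block Optical Sequencing). Example: ATATGGCCTT --> A2T4G2C2
* Then I tried this kind of encoding, inspired by RLE (run-length encoding). Note that the `encord` function below counts every occurrence of a letter in the whole string (letters listed in order of first appearance), not only consecutive runs.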

In [None]:
from collections import OrderedDict

def encord(input):
    """Encode a string as ``<char><count>`` pairs, e.g. ``'BBBBBBCJBC' -> 'B7C2J1'``.

    Each distinct character appears once in the result, in order of first
    appearance, followed by the number of times it occurs anywhere in
    ``input``. Counts cover the whole string, not consecutive runs.
    """
    # Plain dicts keep insertion order, so first-appearance order is preserved.
    tally = dict.fromkeys(input, 0)
    for symbol in input:
        tally[symbol] += 1

    return ''.join(symbol + str(times) for symbol, times in tally.items())
        

In [None]:
# Count-encode every f_27 string of the training set with `encord`.
# Series.map applies the function element by element regardless of the index;
# the previous `train['f_27'][i]` loop was a label lookup that only worked
# with a 0..n-1 index, and was a slow Python-level loop.
f_27_en = train['f_27'].map(encord).tolist()

train['f_27_en'] = f_27_en
train['f_27_en'].value_counts()

In [None]:
# Count-encode every f_27 string of the test set with `encord`.
# Series.map applies the function element by element regardless of the index;
# the previous `test['f_27'][i]` loop was a label lookup that only worked
# with a 0..n-1 index, and was a slow Python-level loop.
f_27_ent = test['f_27'].map(encord).tolist()

test['f_27_ent'] = f_27_ent
test['f_27_ent'].value_counts()

In [None]:
# Integer-encode the raw f_27 strings (train only, no count encoding).
label = LabelEncoder()
train['en_27'] = label.fit_transform(train['f_27'])

# Integer-encode the count-encoded strings. A single encoder is fitted on the
# union of the train and test values so that one string maps to the same
# integer in both frames; fitting one encoder per frame gives each frame its
# own, mutually incompatible code book.
label = LabelEncoder()
label.fit(pd.concat([train['f_27_en'], test['f_27_ent']], ignore_index=True))
train['f_27_enc'] = label.transform(train['f_27_en'])
test['f_27_enc'] = label.transform(test['f_27_ent'])

display(train['en_27'].head(10))
display(train['f_27_enc'].head(10))
display(test['f_27_enc'].head(10))

train['en_27'] is without RLE. train['f_27_enc'] and test['f_27_enc'] are with RLE.

In [None]:
train.head().T

In [None]:
test.head().T

* With this encoding(RLE), the value_counts_length was reduced by about 1/3.
* The result of the score with and without encoding(RLE) showed better results with encoding(RLE).


----------------------------------
# Decision Tree 🌿
* I have visualized some of the data.
* gini impurity is an important measure used to construct the decision trees.
* It is a function that determines how well a decision tree was split. 
* For a two-class target, Gini impurity ranges from 0 (pure node) to 0.5 (classes evenly mixed).


In [None]:
def gini(samples):
    """Gini impurity of a node, given its per-class sample counts.

    Each count is divided by the total of ``samples`` to get a class share
    ``p``; the result is the sum of ``p * (1 - p)`` over all classes.
    """
    total = sum(samples)
    shares = [count / total for count in samples]
    return sum(share * (1 - share) for share in shares)
        

In [None]:
# Gini impurity of the whole training set (the root node of a tree).
x = train['target'].value_counts()
gini([x[0], x[1]])

In [None]:
# One side of a split on f_29: rows with f_29 <= 0.5.
x = train.loc[train['f_29'] <= 0.5]

data = x['target'].value_counts()
print(x.shape)
print(data)
# Impurity of this side, from its class counts for target 0 and target 1.
gini([data[0], data[1]])

In [None]:
# Other side of the split on f_29: rows with f_29 > 0.5.
x = train.loc[train['f_29'] > 0.5]

data = x['target'].value_counts()
print(x.shape)
print(data)
# Impurity of this side, from its class counts for target 0 and target 1.
gini([data[0], data[1]])

* train['f_29'] is 0 or 1. Hence it is an easy-to-understand example.
>  [reference: youtube @ Abhishek Thakur: What Are Decision Tree..](https://www.youtube.com/watch?v=1DMWkIJRivo)

+++++++++++++++++++++++++++++++++++++
* Sklearn has a useful method called plot_tree.

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Depth-3 decision tree fitted on the label-encoded f_27 alone.
feat = ['en_27']
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(train[feat], train["target"])
_, ax = plt.subplots(figsize=(20, 10))

plot_tree(
    clf,
    feature_names=feat,
    # plot_tree expects the class names in ascending class order, i.e. the
    # order of clf.classes_; train["target"].unique() is in order of first
    # appearance and can therefore swap the labels.
    class_names=[str(cls) for cls in clf.classes_],
    filled=True,
    ax=ax,
    fontsize=13
)

plt.show()

In [None]:
# Depth-7 decision tree fitted on f_26 and f_28.
feat = ['f_26', 'f_28']
clf = DecisionTreeClassifier(max_depth=7)
clf.fit(train[feat], train["target"])
_, ax = plt.subplots(figsize=(30, 20))

plot_tree(
    clf,
    feature_names=feat,
    # plot_tree expects the class names in ascending class order, i.e. the
    # order of clf.classes_; train["target"].unique() is in order of first
    # appearance and can therefore swap the labels.
    class_names=[str(cls) for cls in clf.classes_],
    filled=True,
    ax=ax,
    fontsize=15,
    rounded=True,
)

plt.show()

In [None]:
# Depth-5 decision tree fitted on f_28 and f_29.
feat = ['f_28', 'f_29']
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(train[feat], train["target"])
_, ax = plt.subplots(figsize=(30, 15))

plot_tree(
    clf,
    feature_names=feat,
    # plot_tree expects the class names in ascending class order, i.e. the
    # order of clf.classes_; train["target"].unique() is in order of first
    # appearance and can therefore swap the labels.
    class_names=[str(cls) for cls in clf.classes_],
    filled=True,
    ax=ax,
    fontsize=15,
    rounded=True,
)

plt.show()

In [None]:
# Depth-3 decision tree fitted on f_29 and f_30.
feat = ['f_29', 'f_30']
# The classifier must be bound to `clf`: the earlier typo (`lf = ...`) left the
# previous cell's deeper tree in `clf`, and that one was refit and plotted.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(train[feat], train["target"])
_, ax = plt.subplots(figsize=(20, 10))

plot_tree(
    clf,
    feature_names=feat,
    # plot_tree expects the class names in ascending class order, i.e. the
    # order of clf.classes_; train["target"].unique() is in order of first
    # appearance and can therefore swap the labels.
    class_names=[str(cls) for cls in clf.classes_],
    filled=True,
    ax=ax,
    fontsize=15,
    rounded=True
)

plt.show()


* 'f_26' is -14.3 to 12.9. mean is 0.36.
* 'f_28' is -1230 to 1230. mean is -0.38. 
* 'f_29' is 0 to 1. mean is 0.346.
* 'f_30' is 0 to 2. mean is 1.00.
* ~~It seems to be well divided except for f_28.~~
* It seems that some of them cannot be split well.


----------------------------------
# Model⚙
* You may already know that the method of dividing 'f_27' into 10 segments seems to score better.😉
* like this -> BBBBBBCJBC --> B,B,B,B,B,B,C,J,B,C ---> 'f_27_0' = 'B','f_27_1' = 'B'...'f_27_9'='C'.
* I tried this method and improved the score.

In [None]:
for df in [train, test]:
    f27 = df['f_27']

    # ch0 .. ch9: alphabet position ('A' -> 0) of the letter at each of the
    # ten positions of the f_27 string.
    for i in range(10):
        df[f'ch{i}'] = f27.str.get(i).map(ord) - ord('A')

    # unique_characters feature is from https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model
    # Number of distinct letters in the string.
    df["unique_ch"] = f27.map(lambda s: len(set(s)))

In [None]:
# Memory reduce: shrink every 64-bit numeric column of both frames to the
# smallest float / integer dtype pandas is willing to downcast it to.
for frame in (train, test):
    for col in frame.columns:
        if frame[col].dtype == "float64":
            frame[col] = pd.to_numeric(frame[col], downcast="float")
        elif frame[col].dtype == "int64":
            frame[col] = pd.to_numeric(frame[col], downcast="integer")

In [None]:
train.head().T,test.head().T

In [None]:
train.info(),test.info()

In [None]:
# Model inputs: drop the id, the label, the raw f_27 string and the
# whole-string encodings of f_27 built earlier; every other column is kept.
train_drop_cols = ['id','target','f_27','en_27','f_27_en','f_27_enc']
test_drop_cols = ['id','f_27','f_27_ent','f_27_enc' ]

X = train.drop(columns=train_drop_cols).copy()
y = train['target'].copy()
X_test = test.drop(columns=test_drop_cols).copy()

# The original frames are no longer needed; release them.
del train
del test

+++++++++++++++++++++++++++++++++++
# DecisionTreeClassifier

In [None]:
# Stratified 10-fold cross-validation of an unconstrained decision tree.
splits = 10
seed = 42
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=seed)

preds_dtc = []   # per-fold probabilities for the test set
scores_dtc = []  # per-fold validation AUC

for fold, (idx_train, idx_valid) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[idx_train], X.iloc[idx_valid]
    y_train, y_valid = y.iloc[idx_train], y.iloc[idx_valid]

    model = DecisionTreeClassifier(random_state=seed)
    model.fit(X_train, y_train)

    # Validation AUC from the predicted probability of class 1.
    pred_valid = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores_dtc.append(score)

    test_preds_dtc = model.predict_proba(X_test)[:, 1]
    preds_dtc.append(test_preds_dtc)

    print("fold : ", fold , "score : ", score)



In [None]:
print(scores_dtc)

In [None]:
print(sum(scores_dtc)/splits)

+++++++++++++++++++++++++++++++++++
# XGBClassifier

In [None]:
# Settings shared by every XGBClassifier trained below; the per-fold tuned
# values (param1 ... param10) are merged with this dict at construction time.
param = { 
            'booster':'gbtree',
            'objective': "binary:logistic",
            'eval_metric': 'auc',
            # NOTE(review): 'gpu_hist', 'gpu_id' and 'predictor' are deprecated
            # in newer XGBoost releases in favour of device='cuda' with
            # tree_method='hist' — confirm against the installed version.
            'tree_method': 'gpu_hist',
            'gpu_id' : 0,
            'predictor':"gpu_predictor",
            # No early stopping is configured where this dict is used, so all
            # 10000 boosting rounds are trained for every fold.
            'n_estimators': 10000,
            # NOTE(review): presumably only meaningful on older XGBoost
            # versions — confirm whether the installed version still accepts it.
            'use_label_encoder': False,
}

In [None]:
param1 = {'learning_rate': 0.14493824363038896,
          'reg_lambda': 4.784303342726584e-07, 
          'reg_alpha': 9.813301815860274e-05, 
          'subsample': 0.401137366202445, 
          'colsample_bytree': 0.9401701404401414, 
          'max_depth': 7}

In [None]:
param2 = {'learning_rate': 0.05882788142426518,
          'reg_lambda': 57.629474368571174, 
          'reg_alpha': 0.03679834894298202, 
          'subsample': 0.7864508832415595, 
          'colsample_bytree': 0.9680582321937043, 
          'max_depth': 6}

In [None]:
param3 = {'learning_rate': 0.03916787068048087,
          'reg_lambda': 0.0016972621937726656, 
          'reg_alpha': 0.0003502105872886548, 
          'subsample': 0.8326888579135895,
          'colsample_bytree': 0.6529342145246444, 
          'max_depth': 6}

In [None]:
param4 = {'learning_rate': 0.019271227602183444, 
          'reg_lambda': 9.103629021190214e-08, 
          'reg_alpha': 0.0007216860838323866, 
          'subsample': 0.4692015709480831,
          'colsample_bytree': 0.36538733031046233, 
          'max_depth': 7}

In [None]:
param5 = {'learning_rate': 0.13215574191420876,
          'reg_lambda': 3.1118740331065817, 
          'reg_alpha': 5.413953508625706e-07, 
          'subsample': 0.7176362598879156, 
          'colsample_bytree': 0.6233076203622315, 
          'max_depth': 4}

In [None]:
param6 = {'learning_rate': 0.01265944131445967, 
          'reg_lambda': 0.00014806463983070133,
          'reg_alpha': 2.8169551175252048, 
          'subsample': 0.9639438526003187, 
          'colsample_bytree': 0.4339827134350426,
          'max_depth': 7}

In [None]:
param7 = {'learning_rate': 0.18386788447399519, 
          'reg_lambda': 1.3262878835680218e-08, 
          'reg_alpha': 1.4805877400183975e-07,
          'subsample': 0.4295815397172502,
          'colsample_bytree': 0.630522752852306, 
          'max_depth': 4}

In [None]:
param8 = {'learning_rate': 0.11186540491879327,
          'reg_lambda': 3.3081502122919044e-08, 
          'reg_alpha': 0.01819904909825177, 
          'subsample': 0.8307778067716517, 
          'colsample_bytree': 0.8151892493370898,
          'max_depth': 7}

In [None]:
param9 ={'learning_rate': 0.03591811951338567,
         'reg_lambda': 0.04125693404708314, 
         'reg_alpha': 0.00030530999525863367,
         'subsample': 0.7544564127811281, 
         'colsample_bytree': 0.425456940862424, 
         'max_depth': 7}

In [None]:
param10 = {'learning_rate': 0.05977357891682924, 
           'reg_lambda': 4.3143104524709134e-07, 
           'reg_alpha': 2.6492913214796612e-05, 
           'subsample': 0.8382348521543115,
           'colsample_bytree': 0.42479459762196636, 
           'max_depth': 6}

In [None]:
# Stratified 10-fold cross-validation of XGBoost, one tuned parameter set per fold.
splits = 10
seed = 42
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=seed)

pred_xgb = []    # per-fold probabilities for the test set
scores_xgb = []  # per-fold validation AUC

# Fold k is trained with the k-th parameter set (param1 for fold 0, ... param10 for fold 9).
fold_params = [param1, param2, param3, param4, param5,
               param6, param7, param8, param9, param10]

for fold, (idx_train, idx_valid) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[idx_train], X.iloc[idx_valid]
    y_train, y_valid = y.iloc[idx_train], y.iloc[idx_valid]

    params_1 = fold_params[fold]

    # Shared settings plus this fold's tuned values.
    model = XGBClassifier(**param, **params_1)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=False,
    )

    # Validation AUC from the predicted probability of class 1.
    pred_valid = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores_xgb.append(score)

    test_preds = model.predict_proba(X_test)[:, 1]
    pred_xgb.append(test_preds)
    print("fold : ", fold , " score : ", score)

In [None]:
print(scores_xgb)

In [None]:
print(sum(scores_xgb )/splits)

----------------------------------
# Submission🎯

In [None]:
sub = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv')

In [None]:
# Average the per-fold test probabilities (one column per fold).
predictions = np.mean(np.column_stack(pred_xgb), axis = 1)

# Write the probabilities at full precision. This notebook evaluates with
# ROC AUC, which depends only on the ranking of the predictions; rounding to
# 3 decimals before saving collapses many distinct scores into ties and can
# only discard ranking information.
sub['target'] = predictions
sub.to_csv('submission.csv', index = False)

# Keep the 3-decimal view in `sub` itself, as before, so the value counts of
# `sub.target` computed afterwards stay meaningful.
sub['target'] = np.round(predictions,3)
sub.head()

In [None]:
# Number of test rows sharing each predicted value, as a two-column frame.
# rename_axis + reset_index(name=...) always yields the columns
# ['target', 'count']; the previous rename mapping depended on how pandas
# names the value_counts() result, which changed in pandas 2.0 and then
# produced two 'count' columns and no 'target' column.
target_counts = (
    sub['target']
    .value_counts(sort=False)
    .rename_axis('target')
    .reset_index(name='count')
)

In [None]:
sns.scatterplot(data = target_counts, x="target", y="count")
plt.title("target")
plt.show()

In [None]:
#sub['target'].plot(kind= "hist",figsize= (8,6))
#plt.show()

---------------------------------
# Summary
* I have tried several of the same encoding(RLE) and the results are shown below.
1. HistGradientBoostingClassifier(with StratifiedKFold) : 0.90072
1. RandomForestClassifier (testsize = 0.25): 0.88849
1. GradientBoostingClassifier(with StratifiedKFold) : 0.82788
1. XGBoost (with StratifiedKFold) : 0.941(5folds) -> 0.9458(10folds)--> 0.9903(10folds, using the best parameters for each individual fold)
1. DecisionTreeClassifier(with StratifiedKFold(10)is better than no Kfold) : 0.81224 -> 0.7724(using f27-individual method )

* Changing the parameters or adding a scaler might give slightly better results.
* I will try other models.
> Thank you for reading!Please don't forget to blend...Good Luck!
> > in progress..
