# **<span style='color:#A80808'>🎯 Goal</span>**

Classify 10 different bacteria species using data from a genomic analysis.

# **<span style='color:#A80808'>🔑 Metric</span>**

Submissions will be evaluated based on their categorization accuracy.

# **<span style='color:#A80808'>💾 Data</span>**

Files
* train.csv - the training set, which contains the spectrum of 10-mer histograms for each sample
* test.csv - the test set; your task is to predict the bacteria species (target) for each row_id
* sample_submission.csv - a sample submission file in the correct format

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import random
from math import factorial
import joblib

from sklearn.ensemble import ExtraTreesClassifier as et
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

import warnings
warnings.simplefilter('ignore')

# Train

In [None]:
# Load the training set, using row_id as the index, and preview it
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col='row_id',
)
train.head()

In [None]:
# Shape and column summary of the training set.
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line: interpolating it into an f-string prints the literal "None".
print(f'Shape of train: {train.shape}')
print('Info of train:')
train.info()

In [None]:
# Count the rows that are exact copies of an earlier row (there are many)
n_duplicated = train.duplicated().sum()
print(f'Number of duplicated rows:\n{n_duplicated}')

In [None]:
# Every column except the id and the label is a model feature;
# store the features as float32 to reduce memory
non_feature_columns = ('row_id', 'target')
features = [name for name in train.columns if name not in non_feature_columns]
train[features] = train[features].astype('float32')

# Target

In [None]:
# Names of the ten classes, in order of first appearance
train['target'].unique()

In [None]:
# Encode the string target as an integer label (kept in a new column)
le = LabelEncoder().fit(train['target'])
train['target_num'] = le.transform(train['target'])

In [None]:
# Distribution of the ten classes.
# Labels and heights are taken from the same Series so that every bar carries
# its own class name. Pairing train.target.unique() (order of first appearance)
# with groupby('target_num').size() (sorted by encoded label) mislabels the bars.
class_counts = train['target'].value_counts().sort_index()

plt.figure(figsize=(10,7))
plt.bar(class_counts.index, class_counts.values, color='orange')
plt.xticks(rotation=90, fontsize=16)
plt.ylabel('Frequence', fontsize=16)
plt.show()

# Features

In [None]:
# Summary statistics of the first ten features
summary_rows = ['mean', 'std', 'min', 'max']
train[features[:10]].describe().loc[summary_rows]

The first feature is interesting because its standard deviation is almost zero: it takes very few unique values, only one of which is negative, as shown below.

In [None]:
# Distinct values taken by the first feature
first_feature = features[0]
train[first_feature].unique()

In [None]:
# Pairwise correlation of the first ten features, annotated with its values
first_ten_corr = train[features[:10]].corr()

plt.figure(figsize=(10,7))
sns.heatmap(first_ten_corr, annot=True)
plt.show()

# Principal Component Analysis (PCA)

In [None]:
# Project the features onto their first three principal components.
# random_state is fixed (same seed as the model) so that the components, and
# therefore the plots and the pc0..pc2 features derived from them, are
# reproducible should scikit-learn select its randomized SVD solver for an
# input of this size.
pca = PCA(n_components=3, random_state=42)
pca_features = pca.fit_transform(train[features])

In [None]:
# All samples on principal components 0 and 1, coloured by encoded class
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(pca_features[:, 0], pca_features[:, 1], c=train['target_num'], cmap='tab10')
ax.set_title('Principal plane 0-1')
plt.show()

In [None]:
# All samples on principal components 0 and 2, coloured by encoded class
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(pca_features[:, 0], pca_features[:, 2], c=train['target_num'], cmap='tab10')
ax.set_title('Principal plane 0-2')
plt.show()

In [None]:
# All samples on principal components 1 and 2, coloured by encoded class
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(pca_features[:, 1], pca_features[:, 2], c=train['target_num'], cmap='tab10')
ax.set_title('Principal plane 1-2')
plt.show()

In [None]:
# NOTE(review): this cell draws the 0-2 plane a second time (same plot as two
# cells earlier) -- consider removing it.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(pca_features[:, 0], pca_features[:, 2], c=train['target_num'], cmap='tab10')
ax.set_title('Principal plane 0-2')
plt.show()

# Distribution of each class on the principal planes

In [None]:
# One panel per class on principal components 0 and 1.
# Titles come from le.classes_, where position i is the species encoded as
# target_num == i. train.target.unique() is in order of first appearance and
# would put the wrong species name on the panels.
plt.figure(figsize=(15,35))
for i, class_name in enumerate(le.classes_):
    in_class = (train.target_num == i).to_numpy()
    plt.subplot(5,2,i+1)  # 5x2 grid: one slot for each of the ten classes
    plt.scatter(pca_features[in_class, 0], pca_features[in_class, 1])
    plt.title(f'Distribution of {class_name} on the principal plane 0-1')
    # Same window in every panel so the classes can be compared directly
    plt.xlim(-0.1,0.1)
    plt.ylim(-0.1,0.1)
plt.show()

In [None]:
# One panel per class on principal components 0 and 2.
# Titles come from le.classes_, where position i is the species encoded as
# target_num == i. train.target.unique() is in order of first appearance and
# would put the wrong species name on the panels.
plt.figure(figsize=(15,35))
for i, class_name in enumerate(le.classes_):
    in_class = (train.target_num == i).to_numpy()
    plt.subplot(5,2,i+1)  # 5x2 grid: one slot for each of the ten classes
    plt.scatter(pca_features[in_class, 0], pca_features[in_class, 2])
    plt.title(f'Distribution of {class_name} on the principal plane 0-2')
    # Same window in every panel so the classes can be compared directly
    plt.xlim(-0.1,0.1)
    plt.ylim(-0.1,0.1)
plt.show()

In [None]:
# One panel per class on principal components 1 and 2.
# Titles come from le.classes_, where position i is the species encoded as
# target_num == i. train.target.unique() is in order of first appearance and
# would put the wrong species name on the panels.
plt.figure(figsize=(15,35))
for i, class_name in enumerate(le.classes_):
    in_class = (train.target_num == i).to_numpy()
    plt.subplot(5,2,i+1)  # 5x2 grid: one slot for each of the ten classes
    plt.scatter(pca_features[in_class, 1], pca_features[in_class, 2])
    plt.title(f'Distribution of {class_name} on the principal plane 1-2')
    # Same window in every panel so the classes can be compared directly
    plt.xlim(-0.1,0.1)
    plt.ylim(-0.1,0.1)
plt.show()

# Dual comparison

Classes 1 and 4 are quite well separated on the principal planes 0-1 and 0-2, as shown below.

In [None]:
# Keep only the samples of encoded classes 1 and 4
selection = train.target_num.isin([1, 4])

# One panel per pair of principal components, all with the same zoom window
plane_pairs = [(0, 1), (0, 2), (1, 2)]

plt.figure(figsize=(20,6))
for panel, (pc_a, pc_b) in enumerate(plane_pairs, start=1):
    plt.subplot(1, 3, panel)
    plt.scatter(
        pca_features[:, pc_a][selection],
        pca_features[:, pc_b][selection],
        c=train.target_num[selection],
        cmap='tab10',
    )
    plt.title(f'Plane {pc_a}-{pc_b}')
    plt.xlim(-0.1, 0.1)
    plt.ylim(-0.1, 0.1)
plt.show()

# Greatest Common Divisor (GCD)

In [None]:
# Credit @Ambrosm
def bias_of(s):
    """Return the multinomial term encoded by a column name.

    `s` is a name of the form 'A<w>T<x>G<y>C<z>' (for example 'A2T3G1C4').
    The four integers w, x, y and z are parsed from it and the function
    returns 10! / (w! * x! * y! * z! * 4**10).

    Raises ValueError if 'T', 'G' or 'C' is missing from `s` or if one of the
    four fields is not an integer.
    """
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    counts = (
        int(s[1:t_pos]),          # digits between the leading letter and 'T'
        int(s[t_pos + 1:g_pos]),  # digits between 'T' and 'G'
        int(s[g_pos + 1:c_pos]),  # digits between 'G' and 'C'
        int(s[c_pos + 1:]),       # digits after 'C'
    )
    denominator = 4**10
    for count in counts:
        denominator *= factorial(count)
    return factorial(10) / denominator

# Add each column's bias back, scale by one million and round to integers
scaled_columns = {}
for col in features:
    scaled_columns[col] = ((train[col] + bias_of(col)) * 1000000).round().astype(int)
train_i = pd.DataFrame(scaled_columns)

# Row-wise greatest common divisor of the integer features
train_i['gcd'] = np.gcd.reduce(train_i[features], axis=1)
train_i.gcd.unique()

In [None]:
# Principal plane 0-1, one panel per distinct GCD value in ascending order.
# The 2x2 grid has room for at most four distinct values.
gcd_values = np.sort(train_i.gcd.unique())

plt.figure(figsize=(20,15))
for panel, gcd_val in enumerate(gcd_values, start=1):
    selection = (train_i.gcd == gcd_val)
    plt.subplot(2, 2, panel)
    plt.scatter(
        pca_features[:, 0][selection],
        pca_features[:, 1][selection],
        c=train.target_num[selection],
        cmap='tab10',
    )
    plt.title(f'Principal plane 0-1, GCD={gcd_val}')
    plt.tight_layout()
plt.show()

In [None]:
# Principal plane 0-2, one panel per distinct GCD value in ascending order.
# The 2x2 grid has room for at most four distinct values.
gcd_values = np.sort(train_i.gcd.unique())

plt.figure(figsize=(20,15))
for panel, gcd_val in enumerate(gcd_values, start=1):
    selection = (train_i.gcd == gcd_val)
    plt.subplot(2, 2, panel)
    plt.scatter(
        pca_features[:, 0][selection],
        pca_features[:, 2][selection],
        c=train.target_num[selection],
        cmap='tab10',
    )
    plt.title(f'Principal plane 0-2, GCD={gcd_val}')
    plt.tight_layout()
plt.show()

In [None]:
# Principal plane 1-2, one panel per distinct GCD value in ascending order.
# The 2x2 grid has room for at most four distinct values.
gcd_values = np.sort(train_i.gcd.unique())

plt.figure(figsize=(20,15))
for panel, gcd_val in enumerate(gcd_values, start=1):
    selection = (train_i.gcd == gcd_val)
    plt.subplot(2, 2, panel)
    plt.scatter(
        pca_features[:, 1][selection],
        pca_features[:, 2][selection],
        c=train.target_num[selection],
        cmap='tab10',
    )
    plt.title(f'Principal plane 1-2, GCD={gcd_val}')
    plt.tight_layout()
plt.show()

# Dual 1 vs 4 with separation w.r.t. GCD

With the help of the GCD, classes 1 and 4 become almost completely separated, as shown below. Only a little confusion remains in the subset with GCD=10000.

In [None]:
# Encoded classes 1 and 4 only, on principal plane 0-2, one panel per GCD value.
# The 2x2 grid has room for at most four distinct values.
gcd_values = np.sort(train_i.gcd.unique())
is_class_1_or_4 = train.target_num.isin([1, 4])

plt.figure(figsize=(20,15))
for panel, gcd_val in enumerate(gcd_values, start=1):
    selection = (train_i.gcd == gcd_val) & is_class_1_or_4
    plt.subplot(2, 2, panel)
    plt.scatter(
        pca_features[:, 0][selection],
        pca_features[:, 2][selection],
        c=train.target_num[selection],
        cmap='tab10',
    )
    plt.title(f'Principal plane 0-2, GCD={gcd_val}')
    plt.tight_layout()
plt.show()

# Feature engineering

It might be a good idea to add GCD and some principal components to the feature lists.

In [None]:
# Attach the engineered columns to the training frame: the row-wise GCD first,
# then the three principal components
train['gcd'] = train_i.gcd
for component, column in enumerate(('pc0', 'pc1', 'pc2')):
    train[column] = pca_features[:, component]

# The intermediates are no longer needed; drop them to free memory
del train_i, pca_features

# Register the new columns as model features (extends the list in place)
features.extend(['gcd', 'pc0', 'pc1', 'pc2'])

# **<span style='color:#A80808'>🚀 Model training</span>**

In [None]:
# Keyword arguments passed to the extra-trees classifier
params = {
    # Ensemble size and split criterion
    'n_estimators': 3000,
    'criterion': 'gini',
    # Tree growth limits
    'max_depth': None,
    'min_samples_split': 10,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 10,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    # Sampling
    'bootstrap': False,
    'oob_score': False,
    # Execution and reproducibility
    'n_jobs': -1,
    'random_state': 42,
    'verbose': 0,
    'warm_start': False,
    # Weighting, pruning and sub-sampling
    'class_weight': None,
    'ccp_alpha': 0.0,
    'max_samples': None,
}

The extra-trees ensemble is confirmed to be the best-performing model for this competition. More trees are better, so we don't need CV here; we simply increase the number of trees until we reach the memory limit. Tree growth can be limited through max_depth and min_samples_split (here only min_samples_split=10 is changed; max_depth stays None). With the default values for these parameters, the trees can develop freely.

In [None]:
# Fit the extra-trees classifier on the full training set
# (original plus engineered features, integer-encoded target)
model = et(**params)
model.fit(X=train[features], y=train['target_num'])

# Optional, left disabled: persist the fitted model and free the training frame
#joblib.dump(model, f'et_all.pkl')
#del train

# **<span style='color:#A80808'>👌 Prediction</span>**

In [None]:
# Load the test set indexed by row_id, stored as float32 like the training features
test = (
    pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv', index_col='row_id')
    .astype('float32')
)
test.head()

In [None]:
# Project the test rows with the PCA fitted on the training features (no refit)
test_pca = pca.transform(X=test)

In [None]:
# Same integer reconstruction as for train: add each column's bias back,
# scale by one million and round
scaled_test_columns = {}
for col in test.columns:
    scaled_test_columns[col] = ((test[col] + bias_of(col)) * 1000000).round().astype(int)
test_i = pd.DataFrame(scaled_test_columns)

In [None]:
# Add the engineered columns in the same order as for train:
# row-wise GCD first, then the three principal components
test['gcd'] = np.gcd.reduce(test_i, axis=1)
for component, column in enumerate(('pc0', 'pc1', 'pc2')):
    test[column] = test_pca[:, component]

# The intermediates are no longer needed; drop them to free memory
del test_pca, test_i

In [None]:
# Class probabilities for every test row (one column per encoded class)
y_test = model.predict_proba(X=test)

In [None]:
# Post-processing to rebalance the classes
#credit: https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants
#y_test += np.array([0, 0, 0.03, 0.036, 0, 0, 0, 0, 0, 0])

In [None]:
#https://www.kaggle.com/sfktrkl/tps-feb-2022
target_distribution = train['target'].value_counts().sort_index() / len(train) * 100
def get_diff(tune):
    y_pred_tuned = np.argmax(y_test + tune, axis=1)
    return target_distribution - pd.Series(y_pred_tuned).value_counts().sort_index() / len(test) * 100

tune = [0, 0, 0.03, 0.036, 0, 0, 0, 0, 0, 0]
diff = get_diff(tune)
while abs(diff).max() > 0.01:
    for i in range(len(diff)):
        if diff[i] > 0.01:
            tune[i] += 0.001
            break
        if diff[i] < -0.01:
            tune[i] -= 0.001
            break
    diff = get_diff(tune)

y_test += tune

# **<span style='color:#A80808'>🏆 Submission</span>**

In [None]:
# Fill the sample submission with the most probable class of each test row,
# decoded back to the species name
submission = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')
predicted_labels = np.argmax(y_test, axis=1)
submission['target'] = le.inverse_transform(predicted_labels)

# Share of each predicted class, in percent of the test set
submission['target'].value_counts().sort_index() / len(test) * 100

In [None]:
# Write the submission file (without the DataFrame index) and preview it
submission.to_csv(path_or_buf='submission.csv', index=False)
submission.head()

## Thank you for reading this far. Don't forget to upvote 👍 if you like this notebook