# Bacteria Classification 🦠🦠🦠

# Challenge Description
For the February 2022 Tabular Playground Series competition, the task is to classify 10 different bacteria species using data from a genomic analysis technique that involves some data compression and data loss. In this technique, decamer (10-base) snippets of DNA are sampled and analyzed to give a histogram of base counts. For example, the snippet ATATGGCCTT is counted under the `A2T4G2C2` column of the dataframe.

# Importing libraries 📚

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
#os.chdir('../input/tabular-playground-series-feb-2022')

In [None]:
# reduce memory usage of dataframe
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns (int16/int32/int64) are cast to the smallest signed integer
    type whose range contains the column's min and max. Float columns are cast
    to float16 or float32 when the value range fits, otherwise to float64.
    Columns of any other dtype are left untouched.

    NOTE(review): the float downcast is chosen from the value *range* only.
    float16 keeps roughly three significant decimal digits, so values are
    rounded when a column is narrowed; confirm this is acceptable for any
    later step that needs the exact original values.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When True, print the memory usage after optimization and the
            percentage saved. When False, nothing is printed.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # No fallback: an integer column that fits nowhere keeps its dtype.
            candidates, limits, target = int_candidates, np.iinfo, None
        else:
            # Floats whose range check fails (e.g. NaN/inf extremes) go to float64.
            candidates, limits, target = float_candidates, np.finfo, np.float64

        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            if c_min >= bounds.min and c_max <= bounds.max:
                target = candidate
                break

        if target is not None:
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Skip the percentage when there was nothing to measure against.
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## EDA 📈

In [None]:
# Load the training set (indexed by row_id) and shrink its dtypes right away.
raw_train = pd.read_csv('train.csv', index_col='row_id')
train = reduce_mem_usage(raw_train)
# 287 features
train.head()

In [None]:
# Summary statistics of the training columns, shown as the cell output.
train.describe()

In [None]:
# Heatmap of the pairwise correlations between feature columns (target excluded).
feature_frame = train[train.columns[train.columns != 'target']]
plt.figure(figsize=(10, 10))
sns.heatmap(feature_frame.corr())

In [None]:
# Identify feature pairs whose absolute correlation is >= 0.8.
# The string-valued 'target' column is dropped explicitly: recent pandas raises
# on DataFrame.corr() when a non-numeric column is present instead of skipping it.
cor_matrix = train.drop(columns='target').corr().abs()
# Keep only the strict upper triangle so every pair appears once and the diagonal
# is excluded. The builtin `bool` replaces the `np.bool` alias, which NumPy removed.
upper_mask = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
upper_tri = cor_matrix.where(upper_mask)
upper_tri[upper_tri >= 0.8][upper_tri != 1].unstack().dropna().to_dict()
# 'A3T0G3C4', 'A4T0G1C5', 'A1T2G7C0'

In [None]:
# Load the test set (indexed by row_id) and shrink its dtypes right away.
raw_test = pd.read_csv('test.csv', index_col='row_id')
test = reduce_mem_usage(raw_test)
test.head()

In [None]:
# Look for categorical-like features: columns with fewer than 25 distinct values.
# The count is taken over train and test combined (`temp`); previously `temp` was
# built but the distinct values were counted on `train` alone.
temp = pd.concat([train, test], axis=0)
cat_features = [
    x for x in temp.columns[temp.columns != 'target']
    if len(pd.unique(temp[x])) < 25
]
cat_features

In [None]:
# Every column that is neither categorical-like nor the target is numerical.
excluded_columns = cat_features + ['target']
num_features = train.columns[~train.columns.isin(excluded_columns)]

# Share of each feature kind among the non-target columns.
n_feature_columns = train.shape[1] - 1
print(f'% of numeric features: {len(num_features) / n_feature_columns * 100}')
print(f'% of categorical features: {len(cat_features) / n_feature_columns * 100}')

In [None]:
# Drop the highly correlated features from both frames.
# The list is defined once so train and test cannot drift apart.
correlated_features = ['A3T0G3C4', 'A4T0G1C5', 'A1T2G7C0']
train.drop(columns=correlated_features, inplace=True)
test.drop(columns=correlated_features, inplace=True)
#train.to_csv('train_cleaned.datacsv')
# A cell only displays its last expression, so both feature counts are returned
# together as (train, test) instead of silently discarding the first one.
n_train_features = len(train.columns[train.columns != 'target'])
n_test_features = len(test.columns[test.columns != 'target'])
(n_train_features, n_test_features)

In [None]:
# Remove duplicated rows and keep, as sample weight, how many times each distinct
# row occurred. value_counts() over every column is expensive, so it is computed
# once and reused for both the rows and their counts.
row_counts = train.value_counts()
train_not_duplicates = pd.DataFrame(
    [list(tup) for tup in row_counts.index.values],
    columns=train.columns,
)
train_not_duplicates['sample_weight'] = row_counts.values
print(train_not_duplicates.shape)
print(train.shape)

In [None]:
# Stack the de-duplicated train rows on top of the test rows.
# test row_id >= 200000
data = pd.concat([train_not_duplicates, test], axis=0)
print(data.shape)
# Printed as "feature: number of distinct values" for the first ten columns.
for col in data.columns[:10]:
    distinct_values = data[col].value_counts().index
    print(f'{col}: {len(distinct_values)}')
# 223993 rows, but few unique values per column: they could be categorical
# features or just a transformation of numbers.
# Let's look at the values of a random feature.

In [None]:
# First 30 distinct values of one feature, in value_counts() order.
list(data['A0T0G4C6'].value_counts().index)[:30]
# Many values end in the same decimals, which is the sign of a transformation
# applied to the data, e.g.:
# -0.0001902716064453
# -0.0001302716064453
# -0.0001402716064453,
# -0.0001202716064453,
# -0.0001802716064453
# -0.0001702716064453
#     As it turned out, I should have read the paper from the competition:
#     a transformation was applied, so now the data has to be de-transformed.

Using [this](https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense/notebook) approach, we'll de-transform the data; this makes the relationships among columns easier to see and helps when creating new features.

In [None]:
# de transform data
# A=>w, T=>x, G=>y, C=>z
from math import factorial

def bias(col):
    """Return the bias term for a column named like ``A<w>T<x>G<y>C<z>``.

    The four integers following the letters A, T, G and C are read from the
    column name; with k = w + x + y + z the result is
    (1 / 4**k) * k! / (w! * x! * y! * z!).

    Raises:
        ValueError: if a letter is missing or a count is not an integer.
    """
    # Position of each base letter inside the column name.
    a_pos, t_pos, g_pos, c_pos = (col.index(base) for base in 'ATGC')
    counts = (
        int(col[a_pos + 1:t_pos]),
        int(col[t_pos + 1:g_pos]),
        int(col[g_pos + 1:c_pos]),
        int(col[c_pos + 1:]),
    )
    total = sum(counts)
    # Product of the factorials of the individual base counts.
    denominator = 1
    for count in counts:
        denominator *= factorial(count)
    return (1 / 4**total) * factorial(total) / denominator

In [None]:
# The stored values are (initial data - bias), so the bias is added back and the
# result is multiplied by 1000000 and rounded, for every feature column.
non_feature_columns = ['target', 'sample_weight']
for col in data.columns[~data.columns.isin(non_feature_columns)]:
    restored = (data[col] + bias(col)) * 1000000
    data[col] = round(restored)
data.head()


In [None]:
# Row-wise greatest common divisor over all feature columns, stored as a new
# 'gcd' column. Per the author, this indicates how many decamers the reading of
# that bacterial DNA sample contained.
feature_mask = ~data.columns.isin(['target', 'sample_weight'])
feature_values = data[data.columns[feature_mask]].astype(int).values
data['gcd'] = np.gcd.reduce(feature_values, axis=1)

In [None]:
# save the data
#data.to_csv('cleaned_data.csv')

In [None]:
# Class balance of the target in the train set, drawn as a pie chart.
target_counts = train.target.value_counts()
plt.pie(
    target_counts.tolist(),
    labels=target_counts.index.tolist(),
    autopct='%1.1f%%',
)

# Models

## Extra trees

In [None]:
# upload the dataframe with transformations
#data = reduce_mem_usage(pd.read_csv('cleaned_data.csv', index_col='Unnamed: 0'))
#data.head()

In [None]:
# Replace each target species name with an integer id (label encoding by hand).
target_id = {
    'Bacteroides_fragilis': 0,
    'Streptococcus_pyogenes': 1,
    'Streptococcus_pneumoniae': 2,
    'Campylobacter_jejuni': 3,
    'Salmonella_enterica': 4,
    'Escherichia_coli': 5,
    'Enterococcus_hirae': 6,
    'Escherichia_fergusonii': 7,
    'Staphylococcus_aureus': 8,
    'Klebsiella_pneumoniae': 9,
}
# Assign the result back to the column: `data.target.replace(..., inplace=True)`
# is chained assignment and does not modify `data` under pandas Copy-on-Write.
# The float cast keeps missing targets as NaN and fails loudly if a species name
# is not covered by the mapping. Re-running the cell is harmless because ids
# already in place are not keys of `target_id`.
data['target'] = data['target'].replace(target_id).astype('float64')

Using ExtraTrees with cross-validation, inspired by [this](https://www.kaggle.com/maxencefzr/tps-feb22-eda-extratrees#notebook-container) notebook.

In [None]:
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

SEED = 42
N_SPLITS = 10
# Index label separating train rows from test rows in `data`.
# NOTE(review): assumes the de-duplicated train rows carry labels 0..123992 and the
# test rows keep their larger row_id labels -- confirm if the input data changes.
LAST_TRAIN_INDEX = 123992

folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

train = data.loc[data.index <= LAST_TRAIN_INDEX]
test = data.loc[data.index > LAST_TRAIN_INDEX]

sample_weight = train['sample_weight']

# Per-fold test predictions, test probabilities and validation scores.
y_pred, y_prob, scores = [], [], []

features = train.columns[~train.columns.isin(['sample_weight', 'target'])]

X = train[features]
y = train['target']

# Loop-invariant inputs: the test feature matrix and the model parameters are
# built once instead of being rebuilt on every fold.
X_test = test[features]
params = {
    'n_estimators': 300,
}

for fold, (train_id, valid_id) in enumerate(tqdm(folds.split(X, y), total=N_SPLITS)):

    # Splitting (w/ sample weights)
    X_train, y_train, sample_weight_train = X.iloc[train_id], y.iloc[train_id], sample_weight.iloc[train_id]
    X_valid, y_valid, sample_weight_valid = X.iloc[valid_id], y.iloc[valid_id], sample_weight.iloc[valid_id]

    exTree = ExtraTreesClassifier(
        **params,
        n_jobs=-1,
        random_state=SEED
    )

    # Training (w/ sample weights)
    exTree.fit(X_train, y_train, sample_weight_train)

    # Evaluation: weighted accuracy on the held-out fold
    valid_pred = exTree.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred, sample_weight=sample_weight_valid)

    print(f'### \033[1;31;43m Fold: {fold} \033[0;0m')
    print(f'Accuracy score: {valid_score:6f} \n')

    scores.append(valid_score)

    # Prediction for submission
    y_pred.append(exTree.predict(X_test))
    y_prob.append(exTree.predict_proba(X_test))


In [None]:
# Mean of the per-fold accuracy scores collected in `scores`.
print(np.mean(scores))

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Feature importances of `exTree` (the most recently fitted ExtraTrees model),
# paired with the column names of X.
df_feature_imp = pd.DataFrame(
    {'feature': X.columns, 'importance': exTree.feature_importances_}
)

# Top 25 features, most important first.
ranked_features = df_feature_imp.sort_values(by='importance', ascending=False)
feature_imp_25 = ranked_features.iloc[:25].reset_index(drop=True)

# Horizontal bars coloured by their importance value.
importance_bars = go.Bar(
    x=feature_imp_25.importance,
    y=feature_imp_25.feature,
    orientation='h',
    marker=dict(color=feature_imp_25.importance),
)
fig = go.Figure(importance_bars)

fig.update_layout(
    title_text='Feature importance',
    xaxis_title_text='Importance',
    yaxis_title_text='Features',
    height=1000,
    # Reversed axis puts the most important feature at the top.
    yaxis=dict(autorange='reversed'),
)
fig.show()

In [None]:
# predictions for extra trees alone
# from scipy.stats import mode
# y_pred_f = mode(y_pred).mode[0]
# y_pred_f = pd.DataFrame(y_pred_f, columns = ['target'])

# y_pred_f.target.replace({y:x for (x,y) in zip(list(target_id.keys()),list(target_id.values()))}, inplace = True)

# sub = pd.read_csv('sample_submission.csv', index_col = 'row_id')
# sub['target'] = y_pred_f.values
# sub.head()

#subbmission of extra tree model
#sub.to_csv('submission_extratrees.csv')

## KNN 

In [None]:
# PCA: KNN needs a small number of features to perform properly, so the
# dimensionality is reduced first.
# The data is scaled before PCA.
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training features X and rescale them.
min_max_scaler = MinMaxScaler()
data_rescaled = min_max_scaler.fit_transform(X)

from sklearn.decomposition import PCA
# A fractional n_components asks scikit-learn to keep as many components as are
# needed to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)
pca_train = pca.fit_transform(data_rescaled)
# Shows the PCA constructor parameters as the cell output.
pca.get_params()

In [None]:
#train knn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 33% of the PCA-transformed training data to check the KNN model.
x_pca_train, x_pca_test, y_train, y_test = train_test_split(
    pca_train, y, test_size=0.33, random_state=42
)

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(x_pca_train, y_train)

# Accuracy on the fitted part, then on the held-out part.
training_accuracy = knn.score(x_pca_train, y_train)
print(f'Training accuracy {training_accuracy:.4f}')
testing_accuracy = knn.score(x_pca_test, y_test)
print(f'Testing accuracy {testing_accuracy:.4f}')

In [None]:
# Refit KNN on the whole PCA-transformed training set before making predictions.
# NOTE(review): after this refit the model has seen every training row, so any
# later "validation" prediction on rows taken from X is not out-of-sample.
knn.fit(pca_train,y)

## Neural net 

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, Activation, BatchNormalization
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import RobustScaler

# Scale before feeding the neural net; the scaler is fitted on train only.
RobustScale = RobustScaler().fit(train[features].values)

X_rob_train = RobustScale.transform(train[features].values)
X_rob_test = RobustScale.transform(test[features].values)

# Rebuild a frame of scaled features that carries train's own index, so the
# column-wise concat with the sample weights aligns row by row. Without the
# explicit index the join only lines up when train happens to be labelled
# 0..n-1; any other index would silently produce NaN-padded rows.
scaled_train = pd.DataFrame(X_rob_train, columns=features, index=train.index)

# Separate train and validation data (sample_weight travels with the features).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat([scaled_train, train['sample_weight']], axis=1),
    to_categorical(train['target'], dtype="int64"),
    test_size=0.33,
    random_state=42,
)

In [None]:
# nNet architecture: three 256-unit Dense -> BatchNormalization -> ReLU stages
# (the second and third followed by 30% dropout) and a 10-way softmax output.
nNet = Sequential()

# Input stage: no dropout here.
nNet.add(Dense(256, input_shape=(len(features),), kernel_initializer='he_normal'))
nNet.add(BatchNormalization())
nNet.add(Activation(activation="relu"))

# Hidden stages with dropout. Two further identical stages were tried by the
# author and left disabled; raise the range to re-enable them.
for _ in range(2):
    nNet.add(Dense(256, kernel_initializer='he_normal'))
    nNet.add(BatchNormalization())
    nNet.add(Activation(activation="relu"))
    nNet.add(Dropout(0.3))

# One output unit per class.
nNet.add(Dense(10, activation="softmax"))
nNet.summary()

In [None]:
# compile Nnet
from tensorflow.keras.callbacks import EarlyStopping
# Categorical cross-entropy with the Adam optimizer, tracking accuracy.
nNet.compile(
    optimizer='Adam',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)
# Stop once the monitored metric has not improved by at least 0.001 for 10
# epochs, and roll back to the best weights seen.
early_stopping = EarlyStopping(
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

In [None]:
# train Nnet
# Duplicate counts of the training rows, used as per-sample weights.
sample_weights = X_train['sample_weight'].astype('int')
train_inputs = X_train[features]
validation_pair = (X_valid[features], y_valid)
history = nNet.fit(
    train_inputs,
    y_train,
    batch_size=128,
    epochs=200,
    sample_weight=sample_weights,
    validation_data=validation_pair,
    callbacks=[early_stopping],
)

In [None]:
# Training loss first, validation loss second (matches the legend order).
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# Not very pretty; maybe too few epochs.

In [None]:
# nNet scores: evaluate() results on the validation and training splits.
score_val = nNet.evaluate(X_valid[features], y_valid)
score_train = nNet.evaluate(X_train[features], y_train)
print(f'val score: {score_val}')
print(f'train score: {score_train}')

In [None]:
# Train the network on all of the training data.
# No validation data is passed to this fit, so an EarlyStopping callback that
# monitors the default `val_loss` has nothing to watch and never stops training.
# A dedicated callback watching the training loss is used here instead.
# (The unused recomputation of `sample_weights` was dropped: the weights passed
# to fit() come straight from train['sample_weight'].)
full_fit_stopping = EarlyStopping(
    monitor='loss',
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)
history = nNet.fit(
    # .values matches how the scaler was fitted (on a plain array).
    RobustScale.transform(train[features].values),
    to_categorical(train['target'], dtype="int64"),
    epochs=200,
    batch_size=128,
    sample_weight=train['sample_weight'],
    callbacks=[full_fit_stopping],
)

In [None]:
# nNet submission alone

# y_pred = np.argmax(nNet.predict(test[features]), axis = 1)
# y_pred = pd.DataFrame(y_pred, columns = ['target'])
# y_pred.target.replace({y:x for (x,y) in zip(list(target_id.keys()),list(target_id.values()))}, inplace = True)

# sub = pd.read_csv('sample_submission.csv', index_col = 'row_id')
# sub['target'] = y_pred.values
# sub.head()

#sub.to_csv('submission_neuralNet.csv')

## light GBM

In [None]:
# i'm ussing cv to train the lgbm

from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Imported as a module because `lgb` below names the classifier instance and
# `early_stopping` already names the Keras callback object in this notebook.
import lightgbm

SEED = 42
N_SPLITS = 10

folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Sample weights are not used for this model.
# sample_weight = data.loc[~data['sample_weight'].isnull(),'sample_weight']

# Per-fold test predictions, test probabilities and validation scores.
y_pred, y_prob, scores = [], [], []

# Loop-invariant inputs, built once instead of on every fold.
# ('device' used to be listed twice in this dict; one entry is enough.)
params = {
    'objective': 'multiclass',
    'random_state': 48,
    'n_estimators': 8000,
    'device': 'gpu',
    'learning_rate': 0.1,
    'max_depth': 120,
    'num_leaves': 40,
    'max_bin': 180,
    'extra_trees': True,
}
X_test = data.loc[data.index > 123992, data.columns[~data.columns.isin(['target', 'sample_weight'])]]

for fold, (train_id, valid_id) in enumerate(tqdm(folds.split(X, y), total=N_SPLITS)):

    # Splitting (no sample weights)
    X_train, y_train = X.iloc[train_id], y.iloc[train_id]
    X_valid, y_valid = X.iloc[valid_id], y.iloc[valid_id]

    lgb = LGBMClassifier(
        **params,
    )

    # Training with early stopping on the validation fold. The fit() keywords
    # `early_stopping_rounds` and `verbose` were removed in LightGBM 4.0; the
    # callbacks below are the supported replacement (LightGBM >= 3.3).
    lgb.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[
            lightgbm.early_stopping(stopping_rounds=15, verbose=False),
            lightgbm.log_evaluation(period=0),
        ],
    )

    # Evaluation
    valid_pred = lgb.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)

    print(f'### \033[1;31;43m Fold: {fold} \033[0;0m')
    print(f'Accuracy score: {valid_score:6f} \n')

    scores.append(valid_score)

    # Prediction for submission
    y_pred.append(lgb.predict(X_test))
    y_prob.append(lgb.predict_proba(X_test))


# Ensemble

To build the ensemble, a dataset (`stacked_pred`) made from the models' predictions on the validation dataset is used to train the meta-model. After this, the meta-model predicts on the stacked predictions of the four models for the test dataset (note that the neural-net predictions are currently left out of the stack in the code below).

In [None]:
# Validation predictions.
# X_valid / y_valid are whatever the most recent cell assigned; when the notebook
# runs top to bottom that is the last LightGBM fold, which holds raw (unscaled)
# feature values.
# NOTE(review): knn and nNet were refit on the full training set earlier, so
# their probabilities on these rows are not out-of-sample.
val_pred_Trees = exTree.predict_proba(X_valid[features])
val_pred_Knn = knn.predict_proba(pca.transform(min_max_scaler.transform(X_valid[features])))
# The network was trained on RobustScaler output, so its inputs must go through
# the same scaler; feeding raw features gives meaningless probabilities.
val_pred_NN = nNet.predict(RobustScale.transform(X_valid[features].values))
val_pred_lgb = lgb.predict_proba(X_valid[features])

# Test predictions (same preprocessing per model as above).
test_pred_Trees = exTree.predict_proba(test[features])
test_pred_Knn = knn.predict_proba(pca.transform(min_max_scaler.transform(test[features])))
test_pred_NN = nNet.predict(RobustScale.transform(test[features].values))
test_pred_lgb = lgb.predict_proba(test[features])

In [None]:
# Stack the class probabilities of the base models side by side, one block of
# columns per model. The neural-net predictions (val_pred_NN / test_pred_NN) are
# deliberately left out of the stack.
validation_blocks = [val_pred_Trees, val_pred_Knn, val_pred_lgb]
test_blocks = [test_pred_Trees, test_pred_Knn, test_pred_lgb]
stacked_pred = np.hstack(validation_blocks)
stacked_pred_test = np.hstack(test_blocks)

In [None]:
# meta model training
from sklearn.ensemble import ExtraTreesClassifier

# Small ExtraTrees meta-model trained on the stacked validation probabilities.
# Previously tried settings:
# (max_depth=8,n_estimators=200,min_samples_split = 10,max_features = 30)
meta_model = ExtraTreesClassifier(random_state=0, n_estimators=10)
meta_model = meta_model.fit(stacked_pred, y_valid)
# Accuracy on the same rows the meta-model was fitted on.
meta_model.score(stacked_pred, y_valid)

In [None]:
# Submission file.
# Invert the label encoding: class id -> species name.
id_to_target = {class_id: name for name, class_id in target_id.items()}

y_preds = meta_model.predict(stacked_pred_test)
y_pred_f = pd.DataFrame(y_preds, columns=['target'])
# Assign the decoded labels back to the column: replace(inplace=True) through
# attribute access is chained assignment and does not update the frame under
# pandas Copy-on-Write.
y_pred_f['target'] = y_pred_f['target'].replace(id_to_target)

sub = pd.read_csv('sample_submission.csv', index_col='row_id')
# Positional assignment of a 1-D array: the prediction frame has a fresh 0..n-1
# index while `sub` is indexed by row_id, so the values must not be aligned by label.
sub['target'] = y_pred_f['target'].values
sub.head()

In [None]:
# Export the submission file.
# The working directory is switched to /kaggle/working first, so the CSV is
# written there and the process stays in that directory afterwards.
os.chdir('/kaggle/working')
sub.to_csv('submission_ensembleTree_nn.csv')