# Introduction

Hello Everyone :) Thanks for viewing this notebook. Kindly give your feedback if you like my work. Thank You 

**The task of the February 2022 Tabular Playground Series (TPS) competition is to classify 10 different bacteria species using data from a genomic analysis technique that involves some data compression and data loss.**

# Importing Libraries

In [None]:
# Intel® Extension for Scikit-learn installation: (speeds up Kernel)
#!pip install scikit-learn-intelex

import os
import warnings
from pathlib import Path

import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

%matplotlib inline

from tqdm import tqdm

import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

#from sklearnex import patch_sklearn
#patch_sklearn()


# Mute warnings to keep the cell outputs compact.
# NOTE(review): the blanket "ignore" filter suppresses every warning category,
# including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings("ignore")

# Data Loading

In [None]:
# Directory that holds the competition CSV files. Keeping it in one constant means the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = Path('../input/tabular-playground-series-feb-2022')

train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')

In [None]:
# Preview the first rows of the training frame.
display( train.head() )

In [None]:
# Preview the first rows of the test frame.
display( test.head() )

# EDA

In [None]:
# (rows, columns) of the train frame, then of the test frame.
display(train.shape , test.shape)

In [None]:
# Drop `row_id` so it is not picked up as a model feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
train = train.drop(columns='row_id' , errors='ignore')
test = test.drop(columns='row_id' , errors='ignore')

In [None]:
# Number of columns with a numeric dtype in train, then in test.
for frame in (train , test):
    display(frame.select_dtypes(include=np.number).shape[1])

All features are numeric (the cell above counts the columns that have a numeric dtype).

In [None]:
# True when the frame contains at least one missing value, False otherwise.
display(train.isna().to_numpy().any())
display(test.isna().to_numpy().any())

There are no missing values in either the train or the test dataset.

**Target values and their count**

In [None]:
# Number of training rows per class label.
train['target'].value_counts()

**Observations**
* There are ten classes of bacteria
* The classes are balanced

**Dropping Duplicates if present**

In [None]:
# Number of rows that exactly repeat an earlier row (compared across all columns).
display(train.duplicated().sum())

In [None]:
# Keep only the first occurrence of each repeated row, then show the resulting shape.
train = train.drop_duplicates(keep='first')
train.shape

**Basic Description (Statistics) of Training Data**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train.describe()

In [None]:
# Same statistics with one row per column, ordered by decreasing standard deviation
# and shaded with a colour gradient.
(
    train.describe()
         .T
         .sort_values(by='std' , ascending=False)
         .style
         .background_gradient(cmap='rocket')
)
                     #.bar(subset=["max"], color='#F8766D')\
                     #.bar(subset=["mean",], color='#00BFC4')

In [None]:
# Every column except the label is used as a model input.
features = [name for name in train.columns if name != 'target']
len(features)

# Feature Analysis / Visualizations

In [None]:
# One log-scaled histogram per feature, arranged in a 4-column grid.
# The number of rows is derived from the feature count instead of being hard-coded, so
# the cell keeps working if the feature list changes (pandas rejects a layout that has
# fewer slots than there are columns to plot).
n_cols = 4
n_rows = -(-len(features) // n_cols)    # ceiling division
row_height = 280 / 72                    # inches of figure height per subplot row

train[features].hist(bins=5 , alpha=0.5 , layout=(n_rows , n_cols) , log=True ,
                     figsize=(25 , n_rows * row_height))
plt.show()

**Analysis of Target**

In [None]:
# Bar chart of the number of rows per class; tick labels are rotated by 30 degrees
# and right-aligned.
fig , ax = plt.subplots(figsize=(15 , 7))
sns.countplot(x=train['target'] , data=train , ax=ax)
plt.setp(ax.get_xticklabels() , rotation=30 , horizontalalignment='right')
plt.show()

# Feature Engineering and Preprocessing

In [None]:
# train["mean"] = train[features].mean(axis=1)
# train["std"] = train[features].std(axis=1)
# train["min"] = train[features].min(axis=1)
# train["max"] = train[features].max(axis=1)

# test["mean"] = test[features].mean(axis=1)
# test["std"] = test[features].std(axis=1)
# test["min"] = test[features].min(axis=1)
# test["max"] = test[features].max(axis=1)

In [None]:
# Encode the class labels as integers. `le` is kept so that predictions can later be
# mapped back to the original labels with le.inverse_transform.
# The dtype guard makes the cell safe to re-run: fitting a second time on the already
# encoded column would replace `le` with an encoder whose classes are the integers
# themselves, and the original label names would be lost.
# NOTE(review): assumes the raw `target` column is non-numeric (string labels) — confirm.
if not pd.api.types.is_numeric_dtype(train['target']):
    le = LabelEncoder()
    train['target'] = le.fit_transform(train['target'])
train.head()

**HELPER FUNCTION - REDUCE MEMORY USAGE** Adapted from: https://www.kaggle.com/remekkinas/super-learner-ensemble-extree-tuned-lda-umap/notebook#WORK-IN-PROGRESS-....

In [None]:
def _smallest_fitting_dtype(lo, hi, candidates, info):
    """Return the first dtype in `candidates` whose value range contains [lo, hi], else None.

    `info` is `np.iinfo` or `np.finfo`, matching the kind of the candidate dtypes.
    """
    for candidate in candidates:
        limits = info(candidate)
        # Inclusive bounds: a value equal to the type limit is still representable.
        # Comparisons against NaN are False, so an empty or all-NaN column matches nothing.
        if limits.min <= lo and hi <= limits.max:
            return np.dtype(candidate)
    return None


def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of `df` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned. Per column:

    * object / pandas string columns are converted to ``category``;
    * signed and unsigned integer columns are cast to the narrowest integer type of the
      same signedness that holds the column's min and max;
    * float columns are cast to the narrowest float type whose range holds the column's
      min and max (NaN values are ignored when computing min/max);
    * every other dtype (bool, datetime, category, nullable extension types, ...) is
      left untouched.

    A column is never widened, and a column for which no candidate fits (for example an
    all-NaN column or one containing +/-inf) keeps its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Mutated in place.
    allow_float16 : bool, default True
        When True, float columns may be reduced to ``float16``. The choice is made on
        value *range* only; float16 keeps roughly three significant decimal digits, so
        pass False to use ``float32`` as the smallest float type when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = ((np.float16,) if allow_float16 else ()) + (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Bool, datetime,
        # categorical and nullable extension dtypes would be corrupted or raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, signed_ints, np.iinfo)
        elif col_type.kind == 'u':
            target = _smallest_fitting_dtype(c_min, c_max, unsigned_ints, np.iinfo)
        else:
            target = _smallest_fitting_dtype(c_min, c_max, floats, np.finfo)

        # Cast only when it actually saves memory; never widen an existing column.
        if target is not None and target.itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a frame that reports zero bytes.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Downcast both frames; each call prints its own before/after memory report.
train = reduce_mem_usage(df=train)
test = reduce_mem_usage(df=test)

**Train/Test Split**

In [None]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed makes the split repeatable.
# (`train_test_split` is already imported in the imports cell at the top.)
X_train , X_test , y_train , y_test = train_test_split(
    train.drop(columns='target') ,
    train['target'] ,
    test_size=0.3 ,
    random_state=42 ,
)

# Modelling

In [None]:
# Compare three tree ensembles with default hyper-parameters on the hold-out split.
# Each estimator gets a fixed seed (the same value used for the split) so the reported
# accuracies can be reproduced from one run to the next.
dic_models = {'LightGBM' : lgb.LGBMClassifier(random_state=42) ,
              'RandomForest' : RandomForestClassifier(random_state=42) ,
              'ExtraTrees' : ExtraTreesClassifier(random_state=42)}


for name , estimator in dic_models.items():
    print('Training with ' + name + ' model. \n')

    # fit() returns the estimator itself, so the dict entry is the fitted model as well.
    model = estimator.fit(X_train , y_train)

    # Predicting
    print('Predicting with ' + name + ' model. \n')
    pred = model.predict(X_test)

    # Score the hold-out predictions with plain classification accuracy.
    print("Accuracy of " + name + " Model is ", accuracy_score(y_test,pred))
    print("------------------------------------------------------------------")
    print()

**Feature Importance (LGBM)**

In [None]:
# Plot the 40 highest-ranked features of the LightGBM model stored in `dic_models`.
lgbm_model = dic_models['LightGBM']
lgb.plot_importance(lgbm_model , max_num_features=40 , figsize=(15 , 15))
plt.show()

**Training on Complete Data**

In [None]:
# Refit on every labelled row (no hold-out) before predicting the competition test set.
X = train.drop(columns='target')
y = train['target']

# Fixed seed: ExtraTrees is a randomised ensemble, so without it every run would
# produce a different model and therefore a different submission.
model = ExtraTreesClassifier(random_state=42)

model.fit(X , y)

****

# Submission

**Predicting on Test Data**

In [None]:
# Predict a class for every test row. The model was fitted on the label-encoded target,
# so these predictions are integer codes rather than the original label names.
final_preds = model.predict(test)

In [None]:
# Load the sample submission file; its `target` column is filled in below.
sub = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

In [None]:
# Map the integer class codes back to the original label names.
# Guarded so that re-running the cell is a no-op: once decoded, the array no longer has
# an integer dtype, and passing the label names to inverse_transform again would fail.
if np.issubdtype(np.asarray(final_preds).dtype , np.integer):
    final_preds = le.inverse_transform(final_preds)

In [None]:
# Replace the `target` column of the sample submission with the predictions.
# NOTE(review): relies on `sub` and `test` sharing the same row order — confirm.
sub['target'] = final_preds
sub.head()


**Submitting**

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub.to_csv('submission.csv', index=False)