**Author**: _Pradip Kumar Das_

**License:** https://github.com/PradipKumarDas/Competitions/blob/main/LICENSE

**Profile & Contact:** [LinkedIn](https://www.linkedin.com/in/daspradipkumar/) | [GitHub](https://github.com/PradipKumarDas) | [Kaggle](https://www.kaggle.com/pradipkumardas) | pradipkumardas@hotmail.com (Email)

# Tabular Playground Series - Feb. 2022

**Feb 01, 2022 to Feb 28, 2022**

https://www.kaggle.com/c/tabular-playground-series-feb-2022/

_**Predicting bacteria species.**_

**Sections:**
- Dependencies
- Exploratory Data Analysis (EDA) & Preprocessing
- Modeling & Evaluation
- Submission

# Dependencies

In [1]:
# Loads required packages

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import lightgbm as lgb

# Exploratory Data Analysis (EDA) & Preprocessing

In [3]:
# Loads the train data set from the local ./data directory
# (the path is relative to the notebook's working directory)
train = pd.read_csv("./data/train.csv")

In [4]:
# Previews the first rows of the train data set to check its layout
display(train.head())

Unnamed: 0,row_id,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,...,A8T0G1C1,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0,target
0,0,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Streptococcus_pyogenes
1,1,-9.536743e-07,-1e-05,-4.3e-05,0.000886,-0.0002,0.00076,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,0.000914,0.000914,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Salmonella_enterica
2,2,-9.536743e-07,-2e-06,7e-06,0.000129,0.000268,0.00027,0.000243,0.000125,1e-06,...,8.4e-05,4.8e-05,8.1e-05,0.000106,7.2e-05,1e-05,8e-06,1.9e-05,1.046326e-06,Salmonella_enterica
3,3,4.632568e-08,-6e-06,1.2e-05,0.000245,0.000492,0.000522,0.000396,0.000197,-3e-06,...,0.000151,0.0001,0.00018,0.000202,0.000153,2.1e-05,1.5e-05,4.6e-05,-9.536743e-07,Salmonella_enterica
4,4,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Enterococcus_hirae


In [5]:
# Uses the 'row_id' column as the index of the train data set
train = train.set_index("row_id")

In [6]:
# Shows the summary (index, column count, dtypes, memory usage) of the train data set
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Columns: 287 entries, A0T0G0C10 to target
dtypes: float64(286), object(1)
memory usage: 439.5+ MB


In [7]:
# Counts rows that are exact repeats of an earlier row
duplicate_count = train.duplicated().sum()
print(f"Duplicates in train data: {duplicate_count}")

Duplicates in train data: 76007


In [8]:
# Removes duplicate rows, keeping the first occurrence of each
train = train.drop_duplicates(keep="first")

In [9]:
# Replaces the index with a fresh 0..n-1 range and discards the old index values
train = train.reset_index(drop=True)

In [10]:
# Checks whether any cell of the train data set is missing
train.isna().any(axis=None)

False

In [11]:
# Not every classifier is scale invariant, so let's count the columns
# whose values fall outside the range [-1, 1]

train_min_max = (
    train.describe()
    .T
    .loc[:, ["min", "max"]]
    .rename(columns={"min": "minimum", "max": "maximum"})
)

len(train_min_max.query('`minimum` < -1.0 or `maximum` > 1.0'))

0

A count of 0 confirms that the values of all columns lie within the range -1 to +1.

In [12]:
# Drops the temporary min/max summary now that the range check is done
del train_min_max

In [13]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns
    to float16/float32/float64, based on each column's minimum and maximum.
    Columns of any other dtype are left untouched.

    Only the value range is checked, not precision, so float downcasts can
    round values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. Its columns are reassigned on this object, so pass
        a copy if the original must stay unchanged.
    verbose : bool, default True
        When true, prints the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, ordered from narrowest to widest
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when the range fits no narrower type
            # (this also covers NaN or infinite extremes, which fail every comparison)
            target_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target_type = candidate
                    break
            df[col] = df[col].astype(target_type)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guards the percentage against a zero-sized starting frame
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [14]:
# Compresses the train data set by downcasting its numeric columns;
# a copy is passed because reduce_mem_usage reassigns columns on the frame it receives
train = reduce_mem_usage(train.copy())

Mem. usage decreased to 68.58 Mb (74.7% reduction)


In [15]:
# Shows the summary of the train data set post compression
# (dtypes and memory usage are the figures of interest here)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123993 entries, 0 to 123992
Columns: 287 entries, A0T0G0C10 to target
dtypes: float16(286), object(1)
memory usage: 68.6+ MB


In [16]:
# Loads the test data set from the local ./data directory
# (the path is relative to the notebook's working directory)
test = pd.read_csv("./data/test.csv")

In [17]:
# Uses the "row_id" column as the index of the test data set
test = test.set_index("row_id")

In [18]:
# Shows the summary (index, column count, dtypes, memory usage) of the test data set
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 200000 to 299999
Columns: 286 entries, A0T0G0C10 to A10T0G0C0
dtypes: float64(286)
memory usage: 219.0 MB


In [19]:
# Compresses the test data set by downcasting its numeric columns;
# a copy is passed because reduce_mem_usage reassigns columns on the frame it receives
test = reduce_mem_usage(test.copy())

Mem. usage decreased to 55.31 Mb (74.7% reduction)


In [20]:
# Shows the summary of the test data set post compression
# (dtypes and memory usage are the figures of interest here)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 200000 to 299999
Columns: 286 entries, A0T0G0C10 to A10T0G0C0
dtypes: float16(286)
memory usage: 55.3 MB


In [21]:
# Checks for overlap between the data sets by counting the (test row, train row)
# pairs whose float16 feature values are all equal
feature_columns = list(train.select_dtypes(["float16"]).columns)
print(
    "Number of duplicate rows between train and test dataset:",
    len(test.merge(train, on=feature_columns, how="inner")))

Number of duplicate rows between test and test dataset: 486


In [22]:
# Shows the label distribution: number of samples per class and each class's share (in %) of the train data set
class_counts = train.target.value_counts()
display(pd.DataFrame({
    "No. of Samples": class_counts,
    "Class Percentile": class_counts / len(train) * 100,
}))

Unnamed: 0,No. of Samples,Class Percentile
Bacteroides_fragilis,12522,10.098957
Campylobacter_jejuni,12469,10.056213
Klebsiella_pneumoniae,12420,10.016694
Streptococcus_pneumoniae,12416,10.013469
Staphylococcus_aureus,12415,10.012662
Streptococcus_pyogenes,12406,10.005404
Salmonella_enterica,12390,9.9925
Enterococcus_hirae,12373,9.978789
Escherichia_coli,12297,9.917495
Escherichia_fergusonii,12285,9.907817


The above distribution for label indicates that the dataset is mostly balanced.

# Modeling & Evaluation

In [23]:
# Builds a DataFrame of the test rows whose features also appear in train,
# indexed by the test row id and carrying the matching train label

feature_columns = list(train.select_dtypes(["float16"]).columns)
intersection = (
    test.assign(row_index=test.index)  # keeps the test row id through the merge
    .merge(train, on=feature_columns, how="inner")
    .set_index(["row_index"])
)

In [25]:
# Encodes the string labels as integer class ids

label_encoder = LabelEncoder()
train["target"] = label_encoder.fit_transform(train["target"])

In [26]:
# Separates the label from the features (the features are the float16 columns)

y = train["target"]
X = train.select_dtypes(include=["float16"])

In [29]:
# Applies the already fitted label encoder to the intersection labels, too
intersection["target"] = label_encoder.transform(intersection["target"])

In [34]:
# Keeps only the label against the index (a Series of labels keyed by test row id)
intersection = intersection["target"]

In [32]:
# Creates a stratified 10-fold splitter; shuffling is seeded so folds are reproducible
k_folds = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

## Decision Tree

In [65]:
# Holds the per-fold accuracy scores; each cross-validation cell clears and refills it
cv_accuracy = []

In [68]:
# Performs modeling with Decision Tree with default parameters

cv_accuracy.clear()
# StratifiedKFold.split yields positional row numbers, so rows are selected
# with .iloc (position based) rather than .loc (label based)
for i, (train_idx, test_idx) in enumerate(k_folds.split(X, y)):
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X=X.iloc[train_idx], y=y.iloc[train_idx])
    predictions = clf.predict(X=X.iloc[test_idx])
    accuracy = accuracy_score(y.iloc[test_idx], predictions)
    cv_accuracy.append(accuracy)
    print(f"FOLD: {i}, Accuracy: {accuracy}")
print(f"Mean Accuracy: {np.mean(cv_accuracy)}")

FOLD: 0, Accuracy: 0.8981451612903226
FOLD: 1, Accuracy: 0.9019354838709678
FOLD: 2, Accuracy: 0.9025
FOLD: 3, Accuracy: 0.8948302282442132
FOLD: 4, Accuracy: 0.8975723848697476
FOLD: 5, Accuracy: 0.8989434631825147
FOLD: 6, Accuracy: 0.8991047665134285
FOLD: 7, Accuracy: 0.9033793047826437
FOLD: 8, Accuracy: 0.8977336882006614
FOLD: 9, Accuracy: 0.9066053714009195
Mean Accuracy: 0.9000749852355419


## Random Forests

In [76]:
# Performs modeling with Random Forests with default parameters

cv_accuracy.clear()
# StratifiedKFold.split yields positional row numbers, so rows are selected
# with .iloc (position based) rather than .loc (label based)
for i, (train_idx, test_idx) in enumerate(k_folds.split(X, y)):
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    clf.fit(X=X.iloc[train_idx], y=y.iloc[train_idx])
    predictions = clf.predict(X=X.iloc[test_idx])
    accuracy = accuracy_score(y.iloc[test_idx], predictions)
    cv_accuracy.append(accuracy)
    print(f"FOLD: {i}, Accuracy: {accuracy}")
print(f"Mean Accuracy: {np.mean(cv_accuracy)}")

FOLD: 0, Accuracy: 0.9706451612903226
FOLD: 1, Accuracy: 0.967741935483871
FOLD: 2, Accuracy: 0.9672580645161291
FOLD: 3, Accuracy: 0.9674973788208726
FOLD: 4, Accuracy: 0.9696749737882088
FOLD: 5, Accuracy: 0.9708847487700621
FOLD: 6, Accuracy: 0.9706427937736914
FOLD: 7, Accuracy: 0.9720945237519155
FOLD: 8, Accuracy: 0.9699975804500363
FOLD: 9, Accuracy: 0.9712073554318896
Mean Accuracy: 0.9697644516076999


## Extra Trees

In [79]:
# Performs modeling with Extra Trees with default parameters

cv_accuracy.clear()
# StratifiedKFold.split yields positional row numbers, so rows are selected
# with .iloc (position based) rather than .loc (label based)
for i, (train_idx, test_idx) in enumerate(k_folds.split(X, y)):
    clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    clf.fit(X=X.iloc[train_idx], y=y.iloc[train_idx])
    predictions = clf.predict(X=X.iloc[test_idx])
    accuracy = accuracy_score(y.iloc[test_idx], predictions)
    cv_accuracy.append(accuracy)
    print(f"FOLD: {i}, Accuracy: {accuracy}")

FOLD: 0, Accuracy: 0.9716129032258064
FOLD: 1, Accuracy: 0.9689516129032258
FOLD: 2, Accuracy: 0.9682258064516129
FOLD: 3, Accuracy: 0.970239535446407
FOLD: 4, Accuracy: 0.9700782321154932
FOLD: 5, Accuracy: 0.9742721187192516
FOLD: 6, Accuracy: 0.9712880070973465
FOLD: 7, Accuracy: 0.9725784337446568
FOLD: 8, Accuracy: 0.969513670457295
FOLD: 9, Accuracy: 0.9718525687555448


In [80]:
# Averages the fold accuracies currently held in cv_accuracy
print(f"Mean Accuracy: {np.mean(cv_accuracy)}")

Mean Accuracy: 0.9708612888916639


## LightGBM

In [82]:
# Performs modeling with LightGBM with default parameters

cv_accuracy.clear()
# StratifiedKFold.split yields positional row numbers, so rows are selected
# with .iloc (position based) rather than .loc (label based)
for i, (train_idx, test_idx) in enumerate(k_folds.split(X, y)):
    clf = lgb.LGBMClassifier(n_estimators=100, objective="multiclass", n_jobs=-1, random_state=42)
    clf.fit(X.iloc[train_idx], y=y.iloc[train_idx], eval_metric="multi_logloss")
    predictions = clf.predict(X=X.iloc[test_idx])
    accuracy = accuracy_score(y.iloc[test_idx], predictions)
    cv_accuracy.append(accuracy)
    print(f"FOLD: {i}, Accuracy: {accuracy}")
print(f"Mean Accuracy: {np.mean(cv_accuracy)}")

FOLD: 0, Accuracy: 0.967983870967742
FOLD: 1, Accuracy: 0.967741935483871
FOLD: 2, Accuracy: 0.9687096774193549
FOLD: 3, Accuracy: 0.967255423824502
FOLD: 4, Accuracy: 0.9682232438099847
FOLD: 5, Accuracy: 0.9710460521009758
FOLD: 6, Accuracy: 0.9708847487700621
FOLD: 7, Accuracy: 0.9699975804500363
FOLD: 8, Accuracy: 0.967981288813614
FOLD: 9, Accuracy: 0.9717719170900879
Mean Accuracy: 0.9691595738730232


## Stacking

Let's apply stacked generalization with Random Forests, Extra Trees and LightGBM as the first-level models, since all three performed well in cross-validation.

In [193]:
# Trains the first level models and stores their predictions (probabilities) on the out of fold data set.
# One probability array and one label Series are appended per (fold, model) pair, and
# every fitted model also predicts probabilities for the whole test data set.

cv_predictions_proba = []
cv_labels = []
cv_avg_accuracy = []
test_predictions_proba = []

for i, (train_idx, test_idx) in enumerate(k_folds.split(X, y)):
    # StratifiedKFold.split yields positional row numbers, so rows are selected
    # with .iloc (position based) rather than .loc (label based)
    X_fit, y_fit = X.iloc[train_idx], y.iloc[train_idx]
    X_oof, y_oof = X.iloc[test_idx], y.iloc[test_idx]

    # Fresh, unfitted estimators for every fold
    clfs = [
        ("Random Forests", RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)),
        ("Extra Trees", ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)),
        ("LightGBM", lgb.LGBMClassifier(n_estimators=300, objective="multiclass", n_jobs=-1, random_state=42))
    ]
    print(f"FOLD: {i}")
    for name, model in clfs:
        print(f"\t{name}: Training...", end="")
        model.fit(X=X_fit, y=y_fit)
        print("done. OOF accuracy...", end="")
        predictions = model.predict(X=X_oof)
        cv_predictions_proba.append(model.predict_proba(X=X_oof))
        cv_labels.append(y_oof)
        accuracy = accuracy_score(y_oof, predictions)
        cv_avg_accuracy.append(accuracy)
        print(f"{accuracy}.", end=" ")
        print("Predicting on test...", end="")
        test_predictions_proba.append(model.predict_proba(X=test))
        print("done.\n")

print(f"Average cross validation accuracy: {np.mean(cv_avg_accuracy)}")

FOLD: 0
	Random Forests: Training...done. OOF accuracy...0.9749193548387097. Predicting on test...done.

	Extra Trees: Training...done. OOF accuracy...0.9759677419354839. Predicting on test...done.

	LightGBM: Training...done. OOF accuracy...0.9766935483870968. Predicting on test...done.

FOLD: 1
	Random Forests: Training...done. OOF accuracy...0.9718548387096774. Predicting on test...done.

	Extra Trees: Training...done. OOF accuracy...0.9735483870967742. Predicting on test...done.

	LightGBM: Training...done. OOF accuracy...0.9733870967741935. Predicting on test...done.

FOLD: 2
	Random Forests: Training...done. OOF accuracy...0.9721774193548387. Predicting on test...done.

	Extra Trees: Training...done. OOF accuracy...0.974516129032258. Predicting on test...done.

	LightGBM: Training...done. OOF accuracy...0.9743548387096774. Predicting on test...done.

FOLD: 3
	Random Forests: Training...done. OOF accuracy...0.9734656020646827. Predicting on test...done.

	Extra Trees: Training...d

In [194]:
# Stacks all out of fold probability arrays vertically into one DataFrame and all
# matching actual labels into one Series, so that row k of the DataFrame
# corresponds to entry k of the Series

oof_proba_matrix = np.concatenate(cv_predictions_proba, axis=0)
oof_label_vector = np.concatenate(cv_labels, axis=0)

cv_predictions_proba = pd.DataFrame(oof_proba_matrix)
cv_labels = pd.Series(oof_label_vector, name="target")

In [195]:
# Fits the second level classifier on the out of fold probabilities and their actual labels

second_level_clf = LogisticRegression(
    max_iter=200,
    n_jobs=-1,
    random_state=42,
)
second_level_clf.fit(X=cv_predictions_proba, y=cv_labels)

LogisticRegression(max_iter=200, n_jobs=-1, random_state=42)

In [214]:
# List to store test predictions (filled by the second level classifier)
test_predictions = []

In [215]:
# The second level classifier predicts labels from each stored array of test
# probabilities; every array contributes one prediction vector to the list
test_predictions.extend(
    second_level_clf.predict(predictions_proba)
    for predictions_proba in test_predictions_proba
)

In [216]:
# Transposes so that rows are test samples and columns are the individual predictions
test_predictions = pd.DataFrame(test_predictions).T

In [217]:
# Sets the index of the prediction DataFrame to match that of the test DataFrame.
# The labels are taken from `test` itself rather than a hard-coded id range, so
# this keeps working if the test set's row ids or size change.
test_predictions.set_index(test.index.values, inplace=True)

In [218]:
# Takes, for every test sample, the prediction that appears most often;
# column 0 of the row-wise mode is kept as the single voted label
row_modes = test_predictions.mode(axis=1, numeric_only=True)
test_predictions = row_modes[0].astype("int")

In [219]:
# Overrides the predictions with the train labels for the test rows
# that are listed in the train/test intersection
overlap_row_ids = intersection.index
test_predictions.loc[overlap_row_ids] = intersection

# Submission

In [226]:
# Writes the predictions into the sample submission layout and saves the file for submission.
# The class ids are decoded back to the original label names first.

submission = pd.read_csv("./data/sample_submission.csv")
predicted_species = label_encoder.inverse_transform(test_predictions)
submission["target"] = predicted_species
submission.to_csv("./submission.csv", index=False)

In [228]:
# Previews the first rows of the submission to check its layout
display(submission.head())

Unnamed: 0,row_id,target
0,200000,Escherichia_fergusonii
1,200001,Salmonella_enterica
2,200002,Enterococcus_hirae
3,200003,Salmonella_enterica
4,200004,Staphylococcus_aureus


_**The leaderboard score for this submission was 0.97274; the highest score as of Feb 12, 2022 was 0.99006.**_