# Introduction

Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it will be really appreciated and will motivate me to offer more content to the Kaggle community! 😊

In [None]:
import pandas as pd
import numpy as np
import warnings

# Suppress library warnings so they do not clutter the rendered notebook.
warnings.simplefilter("ignore")

# Read the train, test and sample-submission CSVs (in that order).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
        '../input/tabular-playground-series-feb-2022/sample_submission.csv',
    )
)

# Preview the first rows of the training data and of the submission file.
display(train.head(), sub.head())

In [None]:
# (rows, columns) of the training frame, then of the test frame.
display(train.shape, test.shape)

In [None]:
# Drop the identifier column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once `row_id` is already gone.
train = train.drop(columns='row_id', errors='ignore')

# Split the remaining columns by dtype. Both lists follow the frame's own column
# order, so they are identical between runs (a set difference has arbitrary order).
numeric_cols = train.select_dtypes(include=np.number).columns.tolist()
_numeric_lookup = set(numeric_cols)
object_cols = [col for col in train.columns if col not in _numeric_lookup]
print('numeric cols len: ', len(numeric_cols))
print('object col: ', object_cols)

In [None]:
# Print the distinct values of every numeric column that has fewer than 10 of them.
for col in numeric_cols:
    distinct_values = train[col].unique()
    if len(distinct_values) < 10:
        print(distinct_values)

In [None]:
# Total number of missing cells in the training frame (per-column counts, summed).
train.isnull().sum().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
duplicates_train = train.duplicated(keep='first').sum()
duplicates_train

In [None]:
# Remove exact duplicate rows, keeping the first occurrence.
# reset_index(drop=True) restores a contiguous 0..n-1 index: later cells build
# frames from NumPy arrays (which get a fresh RangeIndex) and join them to `train`
# by index, which pairs up the wrong rows if `train` keeps the gaps left behind by
# the dropped duplicates. Reassigning instead of inplace=True also keeps the cell
# safe to re-run.
train = train.drop_duplicates(keep='first').reset_index(drop=True)
train.shape

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to smaller dtypes where the values fit.

    Integer columns (int8/16/32/64) are converted to the narrowest signed integer
    type whose range contains the column's min and max (bounds are inclusive, so a
    column peaking at exactly 127 still fits int8). float64 columns are converted to
    float32 when their finite range fits; this can lose precision. Columns are never
    widened: float16/float32 columns and columns of any other dtype are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; stop at the first one that fits.
            # An empty column yields NaN for min/max, so no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            # Only float64 can shrink here; strict bounds keep +/-inf columns (and
            # all-NaN columns, whose min/max are NaN) as float64.
            f32 = np.finfo(np.float32)
            if f32.min < c_min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# Downcast the numeric columns of both frames (train first, then test).
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

# Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class balance of the target column.
# The Series is passed as `x=`: recent seaborn releases treat the first positional
# argument as `data`, so a positional Series is no longer read as the x variable.
fig, ax = plt.subplots(figsize=(24, 8))
sns.countplot(x=train['target'], ax=ax);

In [None]:
from scipy.stats import skew, boxcox

# Numeric columns whose absolute skewness exceeds the threshold.
skew_threshold = 50
skew_cols = [col for col in numeric_cols if abs(skew(train[col])) > skew_threshold]

ncols = 2
# Ceiling division: round() uses banker's rounding and would under-count rows
# (1 column -> 0 rows, 5 columns -> 2 rows), dropping plots or breaking subplots().
nrows = -(-len(skew_cols) // ncols)

if skew_cols:
    # squeeze=False always yields a 2-D axes array, even when there is a single row.
    fig, axes = plt.subplots(nrows, ncols, figsize=(24, 12), squeeze=False)
    plt.subplots_adjust(hspace=1)

    # axes.flat walks the grid row by row, matching the order of skew_cols.
    for index, ax in enumerate(axes.flat):
        if index >= len(skew_cols):
            # Hide the unused trailing cell when the column count is odd.
            ax.set_visible(False)
            continue

        col_name = skew_cols[index]
        # 201 edges -> 200 equal-width bins spanning the column's full range.
        sns.histplot(train[col_name], ax=ax, bins=np.linspace(train[col_name].min(), train[col_name].max(), 201))
        ax.set_title(col_name);
else:
    print('No numeric column has |skew| >', skew_threshold)

In [None]:
import umap

# 2-D UMAP projection of every feature column (the label is excluded).
# random_state pins the otherwise stochastic layout so the plot is reproducible,
# using the same seed as the t-SNE cell.
# NOTE(review): umap-learn may run single-threaded when a seed is set - confirm
# the runtime is still acceptable.
embedding = umap.UMAP(n_neighbors=10,
                      min_dist=0.3,
                      metric='correlation',
                      random_state=42).fit_transform(train.drop(['target'], axis=1))

In [None]:
# pd.factorize returns (codes, uniques): codes[i] is the integer class of row i and
# uniques[code] is the matching target label, in order of first appearance.
colors = pd.factorize(train.loc[:, 'target'])
# Build the code -> label mapping from the data itself. A hard-coded mapping is only
# right if the classes happen to appear in exactly that order in `train`.
colors_dict = dict(enumerate(colors[1]))
color_list = sns.color_palette(None, len(colors_dict))

fig, ax = plt.subplots(figsize=(12,12))
for color_key, label in colors_dict.items():
    indexs = colors[0] == color_key
    temp_embedding = embedding[indexs, :]
    # `color=` rather than `c=`: a single RGB tuple passed as `c` is ambiguous with
    # an array of per-point values to colormap.
    ax.scatter(temp_embedding[:, 0], temp_embedding[:, 1],
                color=color_list[color_key],
                edgecolor='none',
                alpha=0.80,
                label=label,
                s=10)
plt.legend(bbox_to_anchor=(1, 1), fontsize="x-large", markerscale=2.)
plt.title('UMAP', fontsize=18);

In [None]:
from sklearn.manifold import TSNE

# Newer scikit-learn releases renamed TSNE's `n_iter` argument to `max_iter`.
# Try the new name first and fall back to the old one, so the cell runs on both.
tsne_params = dict(n_components=2, verbose=0, perplexity=40, random_state=42)
try:
    tsne = TSNE(max_iter=2000, **tsne_params)
except TypeError:
    # Older scikit-learn: `max_iter` is not a known keyword.
    tsne = TSNE(n_iter=2000, **tsne_params)

In [None]:
# Project every column except the label into 2-D with the configured t-SNE model.
tsne_data = tsne.fit_transform(train.drop(columns=['target']))

In [None]:
# Scatter the t-SNE projection, one colour per target class.
# `colors` is the (codes, uniques) pair from pd.factorize: labels are read from
# uniques so each legend entry always matches the code it was factorized to.
fig, ax = plt.subplots(figsize=(12,12))
for color_key, class_label in enumerate(colors[1]):
    indexs = colors[0] == color_key
    temp_tsne = tsne_data[indexs, :]
    # `color=` rather than `c=`: a single RGB tuple passed as `c` is ambiguous with
    # an array of per-point values to colormap. The modulo keeps the palette lookup
    # valid if there are more classes than palette entries.
    ax.scatter(temp_tsne[:, 0], temp_tsne[:, 1],
                color=color_list[color_key % len(color_list)],
                edgecolor='none',
                alpha=0.80,
                label=class_label,
                s=10)
plt.legend(bbox_to_anchor=(1, 1), fontsize="x-large", markerscale=2.)
plt.title('T-SNE', fontsize=18);

# P-Values

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Newer scikit-learn releases renamed OneHotEncoder's `sparse` argument to
# `sparse_output`; try the new name first and fall back for older versions.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

# One indicator column per target class. The frame reuses train's index so that
# later index-based joins pair every indicator row with its own feature row
# (a default RangeIndex would not line up once rows have been dropped from train).
targets = pd.DataFrame(
    enc.fit_transform(train[['target']]),
    columns=list(enc.categories_[0]),
    index=train.index,
)
targets.head()

In [None]:
from scipy.stats import pearsonr

# Pearson p-value (rounded to 4 decimals) between every one-hot target column and
# every numeric feature. The nested loop produces one row per target.
p_values_target_list = []
for target in targets.columns:
    p_values_list = []
    for c in numeric_cols:
        p = round(pearsonr(targets.loc[:,target], train.loc[:,c])[1], 4)
        p_values_list.append(p)
    p_values_target_list.append(p_values_list)

# Transpose (targets x features) -> (features x targets). A reshape would keep the
# flat element order and therefore attach the values to the wrong feature/target
# cells; it would also hard-code the table size.
p_values_target_list = np.array(p_values_target_list).T
p_values_df = pd.DataFrame(p_values_target_list, columns=list(targets.columns), index=numeric_cols)

def p_value_warning_background(cell_value):
    """Return the CSS style string for one p-value cell.

    Values strictly greater than 0.05 get a light-coral background; every other
    value gets an empty string (no extra styling).
    """
    return 'background-color: lightcoral;' if cell_value > 0.05 else ''

# Highlight every cell whose p-value is above 0.05.
# Newer pandas renamed Styler.applymap to Styler.map; use whichever this version has.
p_values_styler = p_values_df.style
highlight_cells = p_values_styler.map if hasattr(p_values_styler, 'map') else p_values_styler.applymap
highlight_cells(p_value_warning_background)

In [None]:
# For each target class, list the features whose rounded p-value exceeds 0.05.
important_dict = {
    col: p_values_df.index[p_values_df[col].values > 0.05].tolist()
    for col in p_values_df.columns
}

# Correlations

In [None]:
import gc
gc.collect()

# Pairwise Pearson correlation between all numeric features.
all_feature_corr = train[numeric_cols].corr()
# Keep only the strict upper triangle so every feature pair is inspected once.
# The builtin `bool` replaces `np.bool`, an alias that was removed from NumPy.
upper_mask = np.triu(np.ones(all_feature_corr.shape, dtype=bool), k=1)
upper = all_feature_corr.where(upper_mask)
# A column is a drop candidate if it correlates above 0.9 with any earlier column.
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
to_drop

In [None]:
# Put the one-hot target columns side by side with the features.
# `targets` was built from an array in train's row order, so the two frames are
# aligned by position. Both indexes are reset before combining: joining on index
# labels would pair the wrong rows whenever train's index is not a clean 0..n-1
# range (for example after duplicate rows were dropped).
_target_part = targets.reset_index(drop=True)
_feature_part = train.drop('target', axis=1).reset_index(drop=True)
assert len(_target_part) == len(_feature_part), 'targets and train must have the same number of rows'
targets_train = pd.concat([_target_part, _feature_part], axis=1)
all_target_corr = targets_train.corr()

In [None]:
# For each of the first `nrows` rows of the correlation matrix, keep the entries
# whose absolute correlation is below the threshold `th`.
nrows = 10
th = 0.0001
low_corr = []
for i in range(nrows):
    abs_corr_row = all_target_corr.iloc[i, :].abs()
    low_corr.append(abs_corr_row[abs_corr_row < th])

In [None]:
# One single-row heatmap per target showing its near-zero correlations, and a
# dict mapping each target to the names of those low-correlation columns.
ignore_dict = {}
# squeeze=False keeps `axes` 2-D even if nrows is 1.
fig, axes = plt.subplots(nrows, 1, figsize=(24, 12), squeeze=False)
plt.subplots_adjust(hspace=1)
for i in range(nrows):
    ax = axes[i, 0]
    ignore_dict[targets.columns[i]] = list(low_corr[i].index)

    if low_corr[i].empty:
        # Nothing below the threshold for this target: no heatmap to draw.
        ax.set_visible(False)
        continue

    # A one-row DataFrame lets seaborn place one tick per column and label it with
    # the column name, instead of overwriting tick labels whose count may not match
    # the ticks that were actually drawn.
    sns.heatmap(low_corr[i].to_frame().T, annot=True, vmin=0, vmax=th, ax=ax,
                xticklabels=True, yticklabels=False)
    ax.set_ylabel(targets.columns[i], rotation=0);

# Ignore Cols

In [None]:
# Columns with near-zero correlation, per target class.
print(f'{ignore_dict}')

# Important Cols

In [None]:
# Columns whose p-value exceeded 0.05, per target class.
print(f'{important_dict}')

# Modeling

In [None]:
from termcolor import colored
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# One-vs-rest check per target class: does dropping that class's low-correlation
# columns change the 3-fold balanced accuracy of a Gaussian Naive Bayes model?
for col_key in targets.columns:
    y = targets[col_key]
    # Keep numeric_cols' own order so the feature matrix is identical between runs
    # (a set difference would give an arbitrary column order).
    ignored_cols = set(ignore_dict[col_key])
    X_col = [c for c in numeric_cols if c not in ignored_cols]

    clf = GaussianNB()
    base_score = cross_val_score(clf, train[numeric_cols], y, cv=3, scoring='balanced_accuracy').mean()
    new_score = cross_val_score(clf, train[X_col], y, cv=3, scoring='balanced_accuracy').mean()

    # green: reduced feature set scored higher, red: it scored lower, grey: no change.
    if new_score > base_score:
        verdict_color = 'green'
    elif base_score > new_score:
        verdict_color = 'red'
    else:
        verdict_color = 'grey'
    print(colored(col_key, verdict_color), '- base accuracy score:', base_score, '| new accuracy score:', new_score)

# Important Note

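Keep in mind that the `corr` function only measures linear relationships, so dropping columns based on their correlation score alone is not strictly correct: a feature with near-zero linear correlation can still carry non-linear signal. In this notebook, we dropped columns by correlation score only because of the size of the data.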