# Exploratory Data Analysis for Tabular Playground Series (Oct 2021)

In [None]:
# =======================================================
# TPS October 2021 - EDA
# =======================================================
# Name: Bárbara Sulpis
# Date: 11-oct-2021
# Description: I will analyze TPS data to have an idea of following steps...

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as st # statistical functions
import os

from sklearn.model_selection import train_test_split

#Lgbm
import lightgbm as lgb

# roc
import sklearn.metrics as metrics   # Para la curva ROC
import matplotlib.pyplot as plt     # Para la curva ROC

# for histograms
import seaborn as sns


# ---------------------------
# Input data:
# Go to file -> add or upload data -> "Competition" data tab and select the competition whose csv data you want to add
# files are available in the read-only "../input/" directory
# ---------------------------

# Print the current working directory (should be the "kaggle" directory).
# The path is stored in `cwd` instead of `list` so that the built-in `list`
# type is not shadowed for the rest of the notebook.
cwd = os.getcwd()
print(cwd)

# I left this commented if you want to check that the files are there
# i = 0
# for subdir, dirs, files in os.walk('./'):
#     for file in files:
#         print(file)
#         i+= 1
#         if i>20: 
#             break


# Load the competition's train.csv into `data`, the frame analysed in the following cells
data = pd.read_csv("../input/tabular-playground-series-oct-2021/train.csv")        
# Load the competition's test.csv into `subm`
subm = pd.read_csv("../input/tabular-playground-series-oct-2021/test.csv")  

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Size of the dataset
data.shape

In [None]:
# With this setting we can see all columns of the dataset
pd.set_option("display.max_columns", 300)
# We have a look at the data
data

In [None]:
# Before working with the data, we reduce the use of memory, so we can improve performance
# REFERENCE: https://www.kaggle.com/smiles28/tps10-optuna-xgb-catb-lgbm-stacking

# What the function does is to deduce the data types and cast each column to its most performant type

def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer type whose range contains [mn, mx].

    Unsigned types are tried when ``mn`` is non-negative, signed types
    otherwise. ``mn`` and ``mx`` must be Python ints so that the bound
    comparisons are exact. Returns ``None`` when no candidate type fits.
    """
    if mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a column whose max is exactly 255 still fits uint8.
        if limits.min <= mn and mx <= limits.max:
            return candidate
    return None


def _holds_only_integers(series, is_float):
    """Return True when every value of ``series`` is a finite whole number."""
    if series.isna().any():
        # Missing values cannot be stored in a NumPy integer column.
        return False
    if not is_float:
        return True
    values = series.to_numpy(dtype=np.float64)
    # Check each value on its own: summing the fractional parts would let
    # positive and negative (or many tiny) remainders hide each other.
    return bool(np.isfinite(values).all() and (values == np.trunc(values)).all())


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    Boolean, integer and float columns that contain only finite whole numbers
    are cast to the narrowest integer type that holds their min and max
    (unsigned when the minimum is non-negative). Every other float column is
    cast to ``float32``, which can lose precision. Non-numeric columns and
    empty columns are left untouched.

    The memory usage before and after the conversion is printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``props`` object, after the conversion.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in props.columns:
        series = props[col]
        dtype = series.dtype
        is_float = pd.api.types.is_float_dtype(dtype)
        is_int_like = (pd.api.types.is_integer_dtype(dtype)
                       or pd.api.types.is_bool_dtype(dtype))
        # Skip strings, datetimes, categories... and columns without values.
        if series.empty or not (is_float or is_int_like):
            continue

        target = None
        if _holds_only_integers(series, is_float):
            # int() turns the NumPy scalars into exact Python ints.
            target = _smallest_int_dtype(int(series.min()), int(series.max()))
        if target is None and is_float:
            # Fractional, non-finite or out-of-integer-range floats: 32 bit.
            target = np.float32
        if target is not None:
            props[col] = series.astype(target)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props


In [None]:
data = reduce_mem_usage(data)

In [None]:
subm = reduce_mem_usage(subm)

In [None]:
# ------------------------------------------------------------
#   Search for MISSING values
# ------------------------------------------------------------
# First we make a dataframe with the number of non-null values of each column
count = pd.DataFrame(data.count(), columns=['count'])
# Then we keep the columns whose non-null count is smaller than the number of rows,
# i.e. the columns with at least one missing value.
# The row count is read from the data instead of being hard-coded.
count[count['count'] < len(data)]

# As we can see, there are no null values in the dataset.
# That's good because we don't have to spend time working with missing values

In [None]:
# We can make the same check for the submission dataset ("test.csv" dataset):
# list the columns whose non-null count is smaller than the number of rows.
# The row count is read from the data instead of being hard-coded.
count = pd.DataFrame(subm.count(), columns=['count'])
count[count['count'] < len(subm)]

# As expected, there are no null values in the test dataset either.

In [None]:
# ------------------------------------------------------------
#   Variable CORRELATION
# ------------------------------------------------------------
# Correlation matrix
# --------------------
# We make a correlation matrix to check if there are relations between the different fields.
corrmat = data.corr()

In [None]:
# Let's draw the corrmat
f, ax = plt.subplots(figsize =(40, 40))
sns.heatmap(corrmat, ax = ax, cmap ="YlGnBu", linewidths = 0.1)
# Explanation of the graph: the blue diagonal "line" represents the 100% correlation between each feature and itself.
#   The other points, as the colour scale on the right indicates, show that the features seem to have no correlation with anything but themselves.
# f22 seems to have no correlation at all with target, which could lead to removing that feature

In [None]:
# Distribution of the target: number of rows for each target value.
# size() counts the rows of each group once, instead of counting the
# non-null values of every other column per group.
data.groupby('target').size()
# 499515
# 500485
# The data is quite perfectly balanced

In [None]:
# ------------------------------------------------------------
#   Variable DISTRIBUTIONS
# ------------------------------------------------------------
# I will draw the histograms for all variables
data.hist(grid = False, figsize=(25,80), layout=(29, 10), bins=50)

In [None]:
# ------------------------------------------------------------------------------
#  CARDINALITY OF VARIABLES
# ------------------------------------------------------------------------------
# After looking at the output we can see that plenty of features seem to be binary
# So, let's see their cardinality
pd.set_option("display.max_rows", 300)

data.nunique()

In [None]:
# What we can see above is that there are 45 binary features.
# This is useful, for example, if we use a LightGBM algorithm: we can specify the categorical features in order to improve results

# Example usage (LightGBM): 
# categorical = ['f22', 'f43', 'f242', 'f243', 'f244', 'f245', 'f246', 'f247', 'f248',
#                'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f256', 'f257',
#                'f258', 'f259', 'f260', 'f261', 'f262', 'f263', 'f264', 'f265', 'f266',
#                'f267', 'f268', 'f269', 'f270', 'f271', 'f272', 'f273', 'f274', 'f275',
#                'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283', 'f284']

# fit_params={... 
#             'categorical_feature': categorical
#             ...
#            }  

In [None]:
# We can find handy this other histogram plot, that makes two plots overlapped
# Superposition of the two graphs: target==1 and target==0
# We will only plot the first 10 features

# REFERENCE: https://stackoverflow.com/questions/37911731/seaborn-histogram-with-4-panels-2-x-2-in-python

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reshape f0..f9 to long format: one row per (target, feature name, value),
# with the feature name stored in the "target distributions" column
data_hist = pd.melt(
    data[[f"f{i}" for i in range(10)] + ['target']],
    id_vars="target",
    var_name="target distributions",
)
# One panel per feature (5 per row, independent axes), one colour per target value
g = sns.FacetGrid(
    data_hist,
    hue="target",
    col="target distributions",
    col_wrap=5,
    sharex=False,
    sharey=False,
)
# Draw the semi-transparent histograms of both target values on top of each other
g.map(plt.hist, "value", bins=20, alpha=.4)


In [None]:
# ------------------------------------------------------------
#  Checking SKEWNESS for continuous data
# ------------------------------------------------------------
# Last of all, the following code calculates the skewness of every column and keeps the strongly right-skewed ones (positive skewness).
# This could be used to correct skewness with log or exponential transformations 

# Pair every column name with the skewness that scipy.stats.skew computes for it
data_skewed = pd.DataFrame({'names': data.columns, 'skewness': st.skew(data)})
# Keep only the names of the columns whose skewness is bigger than 3
skewed = data_skewed.loc[data_skewed['skewness'] > 3, 'names']

In [None]:
skewed

In [None]:
# ------------------------------------------------------------
#  Best performing algorithms
# ------------------------------------------------------------
# As part of the EDA I can add the output of the LazyPredict run from another of my notebooks.
# REFERENCE: https://www.kaggle.com/brbarasulpis/tps-2021-oct-automl-lazypredict-lazyclassifier

# In the output we can see that the best option found is the LightGBM Classifier.
# If we build an ensemble, we could use for example LightGBM+BernoulliNB+AdaBoostClassifier.

![](http://storage.googleapis.com/kagglesdsdata/datasets/1642245/2696798/LazyClassifier_Output.JPG?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=databundle-worker-v2%40kaggle-161607.iam.gserviceaccount.com%2F20211011%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20211011T220846Z&X-Goog-Expires=345599&X-Goog-SignedHeaders=host&X-Goog-Signature=36028cc47ab8ddc360a12680533209103048444e19d16eb41c8996032e865b1f9c28b5cac8432bcc9ca25c2333a710e25c651844ea159e0790b4f7bd780bc9d28eea421e2a9c3850c83ba20c921048dd25845f904ae90603e0bc226eceab6b9b141d66ea9d588ec88d64ceb5f3e25064e147bdb930fea38ba3841c3839d76a0a7f539b01fb3ab49ccfa49e1b007e64139500c00586fd494d98a4d0cb451869b5193e36262075617bfbbcf3a922a3d593542c911a259ef46c51ae45eaa66b71c1620131fbbb6ac544ade9204ea65e2fabc352c244a42a4336b721e2ecf94d6b70993ebd1e25c99603f64aeba92384c1ce88dac421a2a1a185e618e0aa8666b484)

# CONCLUSIONS
After this exploratory data analysis we now know that:
* There are no missing values in the dataset
* The target is balanced (nearly half of the values are 1 and half are 0)
* There are no noticeable correlations between the variables
* There are 45 binary features, which can be treated as categorical
* Many continuous features are right-skewed (skewness bigger than 3)
* The best algorithm suggested for the problem is LightGBM