In [None]:
import pandas as pd
from sklearn import model_selection

In [None]:
# Locations of the train and test CSV files of the
# tabular-playground-series-mar-2021 dataset, relative to the working directory.
train_path = "../input/tabular-playground-series-mar-2021/train.csv"
test_path = "../input/tabular-playground-series-mar-2021/test.csv"

In [None]:
# Load the training set into a DataFrame.
train = pd.read_csv(train_path)

In [None]:
# Preview the first rows to see the column layout.
train.head()

In [None]:
# Reference for this function: https://www.kaggle.com/kirillklyukvin/playground-series-february-21/notebook
def eda(df):
    """
    Print a quick exploratory report for a DataFrame.

    The report covers, in this order: the output of ``df.info()``, missing
    values per column, summary statistics of the object-dtype (categorical)
    columns, summary statistics of the numeric columns, and fully duplicated
    rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written with ``print`` or shown with ``display``
        (the notebook's rich-display function).
    """

    # applying info() method
    print('---')
    print('Common information')
    print('---')
    print()
    # info() prints the report itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()

    # missing values (computed once, reused for the check and the display)
    print()
    print('---')
    missing_per_column = df.isna().sum()
    if missing_per_column.sum() == 0:
        print('There are no missing values')
        print('---')
    else:
        print('Detected')
        display(missing_per_column)

    # applying describe() method for categorical features
    categorical = df.select_dtypes(include='object')
    print()
    print('---')
    print('Categorical columns')
    print('Total {}'.format(len(categorical.columns)))
    print('---')
    # describe(include='object') raises ValueError when no object column
    # exists, so the table is only shown when there is something to describe.
    if len(categorical.columns) > 0:
        display(categorical.describe(include='object'))

    # same describe() but for continuous features; the count and the table
    # are built from the same selection so they always agree
    numeric = df.select_dtypes(include='number')
    print('---')
    print('Continuous columns')
    print('Total {}'.format(len(numeric.columns)))
    print('---')
    if len(numeric.columns) > 0:
        display(numeric.describe())

    # checking for duplicated rows (mask computed once)
    duplicated_mask = df.duplicated()
    print('---')
    if duplicated_mask.sum() == 0:
        print('There are no duplicates')
        print('---')
    else:
        print('Duplicates found')
        print('---')
        display(df[duplicated_mask])

    print()
    print('---')
    print('End of the report')

In [None]:
# Print the exploratory report (info, missing values, describe tables, duplicates) for the training set.
eda(train)

### Summary:
- We have 19 categorical columns
- We have 13 continuous (numeric) columns
- There are no missing values
- There are no duplicates

### Target variable

In [None]:
# Number of distinct values in the target column.
train.target.nunique()

So, we are dealing with a binary classification here

In [None]:
# How many rows fall under each target value.
train.target.value_counts()

#### The target is highly skewed (the classes are imbalanced). So, we need a solid cross-validation scheme in order to decide which models and experiments perform better. You can never fully trust the public leaderboard :P

## Stratified KFold Cross Validation

Since the data is highly skewed, we will perform a Stratified KFold cross validation, which preserves the target distribution in each fold (this makes the validation scores more reliable and helps us avoid overfitting to a lucky split)

In [None]:
# Assign every training row to one of 5 stratified folds and save the result.

# Fixed seed: without it, sample() shuffles differently on every run, so the
# fold assignment and the saved train_folds.csv would not be reproducible.
SEED = 42

train["kfold"] = -1  # -1 marks "no fold assigned yet"
# Shuffle once, then reset the index so the positional indices returned by
# split() can be used directly as .loc labels below.
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)
y = train.target.values

skf = model_selection.StratifiedKFold(n_splits=5)

# Each row appears in exactly one validation split; store that split's number.
for f, (t_, v_) in enumerate(skf.split(X=train, y=y)):
    train.loc[v_, 'kfold'] = f

# Every row must have received a fold before the file is written.
assert (train["kfold"] >= 0).all(), "some rows were not assigned to a fold"

train.to_csv("train_folds.csv", index=False)

### Sanity Check

You want to make sure this split makes sense. We can do that by checking the stratification: every fold should have the same size and the same target distribution.


In [None]:
# The file written by the fold-assignment cell via train.to_csv("train_folds.csv", ...).
train_folds_path = "./train_folds.csv"

In [None]:
# Read the saved file back so the checks run against what is actually on disk.
train_folds = pd.read_csv(train_folds_path)

In [None]:
# Preview the first rows; the saved frame should include the kfold column.
train_folds.head()

In [None]:
# Checking that all the folds are evenly sized: number of rows per kfold value.
train_folds.kfold.value_counts()

#### Cool, the data seems to be evenly split

In [None]:
# Preview the first rows of the fold-annotated frame.
train_folds.head()

In [None]:
# Print the target value counts of every fold, one fold at a time,
# so the class balance can be compared across folds.
for fold in range(5):
    fold_rows = train_folds.loc[train_folds["kfold"] == fold]
    print(f"====FOLD-{fold}====")
    print(fold_rows["target"].value_counts())
    print()

### Hurrayyy. We got the same distribution in each fold 🙌🙌🙌🙌

#### Now, you can train and test your different models on `train_folds.csv` and validate your different experiments and model's performance

### You should trust your CV more than the public leaderboard.
### You will be less likely to overfit the data and less likely to experience major shake-ups on the private leaderboard

### Best luck for the competition
### Please show your love with an upvote 👍
### Peace ✌