# Data and Library Imports 📚

In [None]:
! pip install -q rich dabl

In [None]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from rich import print as _pprint
from rich.progress import track
from tqdm import tqdm
from colorama import Fore, Style
import random
import dabl
from wordcloud import WordCloud

import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from IPython.display import HTML

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset

# Show up to 500 columns and 500 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Read the stylesheet through a context manager so the file handle is closed
# as soon as its contents have been loaded.
with open("../input/notebookassets/orange.css", encoding="utf-8") as css_file:
    f = css_file.read()
# Last expression of the cell: injects the stylesheet into the rendered output.
HTML(f"<style>{f}</style>")

In [None]:
def cprint(string):
    """Print ``string`` through rich's ``print``, wrapped in ``[black]`` markup."""
    markup = "[black]{}[/black]".format(string)
    _pprint(markup)
    
def cout(string: str, color=Fore.RED, end='\n'):
    """
    Print ``string`` prefixed with the colorama code ``color``.

    The style reset code is appended so later output is not coloured;
    ``end`` is forwarded unchanged to the built-in ``print``.
    """
    reset_code = Style.RESET_ALL
    print(color + string + reset_code, end=end)

def stats(scol, col):
    """
    Print summary statistics of ``col``, one coloured line per statistic.

    ``scol`` is the column name shown in each message; ``col`` holds the values
    passed to the NumPy reductions. Every value is formatted to 4 decimals.
    """
    # NOTE(review): this function name shadows `from scipy import stats`
    # imported at the top of the notebook.
    # Each row: message prefix (kept verbatim), reduction to apply, output colour.
    report_rows = (
        ("Average Value in the Column", np.mean, Fore.RED),
        ("Median Value in the Column", np.median, Fore.BLUE),
        ("Maxmimum Value in the Column", np.max, Fore.GREEN),
        ("Minimum Value in the Column", np.min, Fore.YELLOW),
        ("50th Quantile of the Column", lambda values: np.quantile(values, 0.5), Fore.CYAN),
        ("75th Quantile of the Column", lambda values: np.quantile(values, 0.75), Fore.MAGENTA),
    )
    # Reductions are evaluated lazily, one per printed line, in table order.
    for title, measure, shade in report_rows:
        cout(f"{title}: {scol} is: {measure(col):.4f}", shade)

In [None]:
# Show the upvote reminder via cprint, using rich's bold red markup.
cprint('[red bold]If you like my notebook, please leave an Upvote![/red bold]')

In [None]:
# Load the train and test tables (in that order) from the ../input directory.
train_file, test_file = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-apr-2021/train.csv",
        "../input/tabular-playground-series-apr-2021/test.csv",
    )
)

# Preview the first rows of the training table.
train_file.head()

# Exploratory Data Analysis 📊

Let's now start with exploratory data analysis!

In [None]:
# Summary statistics of the training table's columns (pandas `describe`).
train_file.describe()

<div class="alert alert-block alert-info">
    This month's competition's data can be thought of as an extension to the Titanic competition data.
    The data points in this dataset are generated by a CTGAN.
</div>

<div class="alert alert-block alert-danger">
    The statistical properties of this dataset are very similar to the original Titanic dataset, but there's no way to "cheat" by using public labels for predictions.
</div>

### Null Value Percentage in different columns

In [None]:
cprint("[cyan]Percentage of Null values in every column:[/cyan]")
# Count the missing entries of every column in one pass, then report each
# count as a percentage of the number of rows.
row_count = train_file.shape[0]
null_counts = train_file.isna().sum()
for col, null_count in null_counts.items():
    percent_null = (null_count / row_count) * 100
    # One output line per column: plain text, green column name, red percentage.
    print("Percentage of Null values in", end=' ')
    cout(f"{col}", Fore.GREEN, end=' ')
    print("column:", end=' ')
    cout(f"{percent_null} %", Fore.RED)

As we can see from above, <strong style="color:green;">Cabin</strong> has more than **67%** of its values null.

I'm thus dropping the column for now (may include later, idk)

### Target Class - `Survived` Distribution

In [None]:
sns.set_style('whitegrid')
# value_counts() orders its result by frequency, not by class value, so each
# label and colour is looked up from the class it belongs to instead of
# relying on a fixed position in the list.
class_names = {0: "Didn't Survive", 1: "Survived"}
class_colors = {0: "blue", 1: "green"}
survived_counts = train_file['Survived'].value_counts()
names = [class_names.get(cls, str(cls)) for cls in survived_counts.index]
values = survived_counts.tolist()
slice_colors = [class_colors.get(cls, "gray") for cls in survived_counts.index]

plt.figure(figsize=(9, 9))
plt.pie(x=values, labels=names, autopct="%1.2f%%", colors=slice_colors, shadow=True)
plt.title("Survived Passengers Pie-Chart", fontdict={'fontsize': 14})
plt.show()

### Passenger Class (`Pclass`) column

This column represents the Ticket class of a passenger.

In [None]:
sns.set(style="whitegrid")
# value_counts() orders its result by frequency, so the slice labels are built
# from the class values in its index rather than from a hard-coded order.
class_names = {1: "1st Class", 2: "2nd Class", 3: "3rd Class"}
pclass_counts = train_file['Pclass'].value_counts()
names = [class_names.get(cls, f"Class {cls}") for cls in pclass_counts.index]
values = pclass_counts.tolist()

plt.figure(figsize=(9, 9))
plt.pie(x=values, labels=names, autopct="%1.2f%%", colors=["#1f6193", "#0b8bed", "#307bb5"], shadow=True)
plt.title("Passenger Classes Pie-Chart", fontdict={'fontsize': 14})
plt.show()

### Age (`Age`) Column

Before we plot the age column, we must remove the null values by either dropping them or by imputing them.
I choose the latter option.

In [None]:
# Impute missing ages with the column mean (this modifies train_file in place).
raw_ages = train_file['Age']
train_file['Age'] = raw_ages.fillna(raw_ages.mean())

# The mean of the imputed column drives both the marker line and its label.
mean_age = train_file['Age'].mean()

plt.style.use("classic")
plt.figure(figsize=(9, 8))
sns.histplot(train_file['Age'], color='blue', kde=True, bins=35)
plt.axvline(mean_age, color='pink', linestyle='-', linewidth=0.9)
# Place the label slightly right of the mean line, near the top of the y-range.
y_top = plt.ylim()[1]
plt.text(mean_age * 1.05, y_top * 0.95, f'Mean (μ): {mean_age:.2f} years')
plt.xlabel("Age (in years)")
plt.title("Distribution of Ages")
plt.show()

### Gender (`Sex`) Column 

In [None]:
sns.set(style="whitegrid")
# value_counts() orders its result by frequency, so the labels and colours are
# derived from the category each count belongs to, not from a fixed order.
sex_colors = {'male': 'blue', 'female': 'magenta'}
sex_counts = train_file['Sex'].value_counts()
labels = [str(sex).capitalize() for sex in sex_counts.index]
values = sex_counts.tolist()
slice_colors = [sex_colors.get(sex, 'gray') for sex in sex_counts.index]
# Keep the first slice in place and nudge every other slice out slightly;
# `explode` must have exactly one entry per slice.
explode = [0 if position == 0 else 0.005 for position in range(len(values))]

plt.figure(figsize=(9, 9))
plt.pie(x=values, labels=labels, autopct="%1.2f%%", colors=slice_colors, explode=explode, shadow=True)
plt.title("Gender Distribution Pie Chart", fontdict={'fontsize': 14})
plt.show()

### Siblings / Spouses Aboard the Titanic (`SibSp`)

In [None]:
sns.set(style="whitegrid")
# Compute the counts once; the index supplies the legend labels and the values
# supply the slice sizes, so both stay in the same order.
sibsp_counts = train_file['SibSp'].value_counts()
labels = sibsp_counts.index.tolist()
values = sibsp_counts.tolist()

colors=['#03045E', '#023E8A', '#0077B6', '#0096C7', '#00B4D8', '#48CAE4', '#90E0EF']

# Pull the (up to) three last, i.e. least frequent, slices outwards. `explode`
# needs exactly one entry per slice, so it is sized from the data.
tail_offsets = [0.1, 0.2, 0.3][max(3 - len(values), 0):]
explode = [0] * (len(values) - len(tail_offsets)) + tail_offsets

plt.figure(figsize=(9, 9))
# Without `autopct`, plt.pie returns the wedge patches first and the label
# texts second.
patches, texts = plt.pie(x=values, labels=None, colors=colors, explode=explode)
# Pass the wedges explicitly so every legend entry is tied to its own slice.
plt.legend(handles=patches, labels=labels)

plt.title("Number of Siblings/Spouses Aboard", fontdict={'fontsize': 14})
plt.show()

### Parents / Children aboard the Titanic (`Parch`)

In [None]:
sns.set(style="whitegrid")
# Compute the counts once; the index supplies the legend labels and the values
# supply the slice sizes, so both stay in the same order.
parch_counts = train_file['Parch'].value_counts()
labels = parch_counts.index.tolist()
values = parch_counts.tolist()

# Palette is listed pink-to-blue and reversed before use.
colors = ['#b5179e', '#7209b7', '#560bad', '#480ca8', '#3a0ca3', '#3f37c9', '#4361ee', '#4895ef'][::-1]

plt.figure(figsize=(9, 9))

# Without `autopct`, plt.pie returns the wedge patches first and the label
# texts second.
patches, texts = plt.pie(x=values, labels=None, colors=colors)
# Pass the wedges explicitly so every legend entry is tied to its own slice.
plt.legend(handles=patches, labels=labels)

plt.title("Number of Parents/Children Aboard", fontdict={'fontsize': 14})
plt.show()

### Top-10 Most Popular Tickets (`Ticket`)

In [None]:
# The ten most frequent ticket values, and every row that carries one of them.
top_ticket_ids = train_file['Ticket'].value_counts().index[:10]
top_ticket_rows = train_file.loc[train_file['Ticket'].isin(top_ticket_ids), 'Ticket']

sns.set_style('whitegrid')
plt.figure(figsize=(8, 7))
ax = sns.histplot(top_ticket_rows, color='orange')
plt.xlabel("Category")
plt.ylabel("Count")
plt.title("Top-10 most popular Tickets")

# Annotate each bar with its share of ALL rows of the table (not just the top ten).
total = len(train_file)
for bar in ax.patches:
    bar_height = bar.get_height()
    share_text = f"{100 * bar_height / total:.2f}%"
    text_x = bar.get_x() + bar.get_width() / 5.2
    text_y = bar.get_y() + bar_height + 0.05
    ax.annotate(share_text, (text_x, text_y))

plt.show()

### Embarked at (`Embarked`)

In [None]:
# Counts per embarkation station; value_counts() leaves out missing values.
embarked_counts = train_file['Embarked'].value_counts()
# Histogram input: the rows whose station is one of the counted values.
embarked_known = train_file['Embarked'].dropna()

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 7))
# `fontsize` is the documented suptitle keyword for the title size.
fig.suptitle("Embarked Station", fontsize=20)

sns.histplot(embarked_known, color='magenta', ax=ax[0])
# Label the histogram axes explicitly: plt.xlabel / plt.ylabel act on the
# current axes, which after plt.subplots is the last subplot (the pie).
ax[0].set_xlabel("Station")
ax[0].set_ylabel("Count")

labels = embarked_counts.index.tolist()
values = embarked_counts.tolist()

ax[1].pie(x=values, labels=labels, autopct="%1.1f%%", colors=['#0742f2', '#093ac1', '#072c91'])

plt.show()

### First and Last Names

Let's take a look at what first and last names are popular.

<div class="alert alert-block alert-info">
    Keep in mind that in the <code>Name</code> column, the first string is actually the Last Name (or surname)
    and the second string is the First name.
</div>

In [None]:
# Split every name on commas: the piece before the first comma is the last
# name; the piece after it, minus its leading character, is the first name.
name_pieces = [full_name.split(',') for full_name in train_file['Name']]
first_names = [pieces[1][1:] for pieces in name_pieces]
last_names = [pieces[0] for pieces in name_pieces]

# Print one randomly chosen passenger as a sanity check.
idx = random.randrange(len(first_names))
print(f"Example name: {first_names[idx]} {last_names[idx]}")

In [None]:
# Wordcloud
# Join each list of names into one space-separated text and build a cloud from it.
fnames, lnames = (" ".join(name_list) for name_list in (first_names, last_names))
fwc, lwc = [
    WordCloud(width=1024, height=1024, collocations=False).generate(corpus)
    for corpus in (fnames, lnames)
]

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))

# Left panel: first names, right panel: last names; axis decorations are hidden.
for panel, cloud, panel_title in zip(ax, (fwc, lwc), ('First Names', 'Last Names')):
    panel.imshow(cloud)
    panel.axis('off')
    panel.set_title(panel_title)

plt.show()

## Data Preprocessing

Let's do some really quick data processing so we can focus more on the modelling part!

For this stage, I will be doing the following:
1. Dropping features that aren't much use or have a lot of missing values
2. Dropping any remaining NaN values
3. Encoding any Categorical Features

In [None]:
def process_data(data: pd.DataFrame, is_test=False):
    """
    Clean a raw train/test table and encode its categorical columns.

    Steps, in order:
      1. Drop ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``.
      2. Fill missing ``Fare`` values with the column mean.
      3. Drop every row that still has a missing value.
      4. Encode ``Sex`` (male=0, female=1) and ``Embarked`` (Q=0, C=1, S=2).

    Args:
        data: Raw table; it is not modified.
        is_test: When True, the passenger ids of the rows that were kept are
            returned next to the cleaned table.

    Returns:
        A ``(cleaned, ids)`` tuple. ``ids`` is a NumPy array with one id per
        row of ``cleaned`` when ``is_test`` is True, otherwise ``None``.
    """
    features = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Impute Fare BEFORE dropping rows: after dropna() nothing is left to fill.
    features['Fare'] = features['Fare'].fillna(features['Fare'].mean())

    # Keep the row mask so the ids can be filtered with exactly the same rows;
    # taking the ids before the drop would leave them misaligned with the data.
    complete_rows = features.notna().all(axis=1)
    features = features[complete_rows].copy()

    features['Sex'] = features['Sex'].map({'male': 0, 'female': 1})
    features['Embarked'] = features['Embarked'].map({'Q': 0, 'C': 1, 'S': 2})

    if is_test:
        ids = data['PassengerId'].values[complete_rows.values]
        return (features, ids)
    return (features, None)

# Clean both tables. process_data returns None as its second element unless
# is_test=True, so only the test call keeps the passenger ids.
train_data, _ = process_data(train_file)
test_data, test_ids = process_data(test_file, is_test=True)

## Modelling using PyTorch Lightning

Let's build a simple deep feed-forward neural network using PyTorch Lightning and train it on GPUs or TPUs!

<div class="alert alert-block alert-info">
    Keep in mind, the model in this notebook isn't very great, but it's here to show you how you can build a PyTorch Lightning model for this competition!
</div>

<div class="alert alert-block alert-danger">
    I highly encourage you all to fork this notebook (if you do, please leave an upvote!) and change the model architecture and maybe even fine-tune them to get better results.
</div>

In [None]:
# Custom dataset
class TPSData(Dataset):
    """
    Torch ``Dataset`` over a preprocessed passenger table.

    Args:
        data: Table whose columns (other than ``Survived``) are the features.
        is_test: When True, items are feature tensors only and the table may
            omit the ``Survived`` column. When False, ``Survived`` is required
            and items are ``(features, target)`` pairs.

    Raises:
        KeyError: If ``is_test`` is False and ``data`` has no ``Survived`` column.
    """

    def __init__(self, data: pd.DataFrame, is_test: bool=False):
        self.is_test = is_test
        has_target = 'Survived' in data.columns
        if not (has_target or is_test):
            raise KeyError("'Survived' column is required when is_test=False")
        # Unlabelled (test) tables have no target column; reading it
        # unconditionally would make is_test=True unusable on them.
        self.target = data['Survived'].values if has_target else None
        self.features = data.drop(['Survived'], axis=1, errors='ignore').values

    def __getitem__(self, idx):
        """Return row ``idx`` as a float32 tensor, plus an int64 target unless ``is_test``."""
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        if self.is_test:
            return features
        return features, torch.tensor(self.target[idx], dtype=torch.long)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.features)

In [None]:
def get_datasets(data: pd.DataFrame, split: float = 0.2, random_state=None):
    """
    Shuffle ``data`` and split it into training and validation ``TPSData`` sets.

    Args:
        data: Preprocessed table, including the ``Survived`` column.
        split: Fraction of the rows (between 0 and 1) used for validation.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle, and therefore the split, can be reproduced.

    Returns:
        ``{'train': TPSData, 'val': TPSData}``; the validation set holds the
        first ``int(split * len(data))`` shuffled rows, the training set the rest.

    Raises:
        ValueError: If ``split`` is not within ``[0, 1]``.
    """
    if not 0.0 <= split <= 1.0:
        raise ValueError(f"split must be a fraction between 0 and 1, got {split!r}")

    # Shuffle the rows and renumber them
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Positional split: the first `split_nb` rows validate, the rest train
    split_nb = int(split * len(shuffled))
    val_split = shuffled.iloc[:split_nb]
    train_split = shuffled.iloc[split_nb:]

    # Wrap both parts as labelled Torch datasets
    return {
        'train': TPSData(train_split, is_test=False),
        'val': TPSData(val_split, is_test=False),
    }

In [None]:
# Settings read by TPSModel: the table to split, the validation fraction, the
# function that performs the split, and the DataLoader worker/batch settings.
data_config = dict(
    data=train_data,
    split_pcent=0.2,
    data_ret=get_datasets,
    num_workers=4,
    train_bs=64,
    val_bs=128,
)

In [None]:
# Model
class TPSModel(pl.LightningModule):
    """
    Four-layer fully connected classifier with ReLU activations.

    The module also owns its data pipeline: it reads the table, the validation
    fraction and the split function from ``data_config`` and builds its own
    training and validation dataloaders.

    Args:
        input_size: Number of input features per row.
        classes: Number of output logits.
        learning_rate: Learning rate given to the Adam optimizer.
        data_config: Dict with the keys ``data``, ``split_pcent``, ``data_ret``,
            ``num_workers``, ``train_bs`` and ``val_bs``. The default is the
            module-level ``data_config`` as it exists when the class is defined.

    Raises:
        ValueError: If ``data_config`` is empty or None.

    NOTE(review): ``validation_epoch_end`` is a hook of the PyTorch Lightning
    1.x API — confirm against the installed version.
    """

    def __init__(self,
                 input_size: int = 7, 
                 classes: int = 2,
                 learning_rate: float = 1e-3,
                 data_config: dict = data_config
        ):
        super().__init__()

        if not data_config:
            raise ValueError("Data Config Cannot be empty")

        self.data_config = data_config
        self.input_size = input_size
        self.learning_rate = learning_rate

        # Model architecture: input_size -> 1024 -> 768 -> 128 -> classes
        self.fc1 = nn.Linear(self.input_size, 1024)
        self.fc2 = nn.Linear(1024, 768)
        self.fc3 = nn.Linear(768, 128)
        self.fc4 = nn.Linear(128, classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw logits for ``x``; no softmax is applied."""
        out = x
        # Every hidden layer is followed by ReLU; the output layer is not.
        for hidden in (self.fc1, self.fc2, self.fc3):
            out = self.relu(hidden(out))
        return self.fc4(out)

    def prepare_data(self):
        """
        Split the configured table into training and validation datasets.

        Calls ``data_config['data_ret']`` with the table and the validation
        fraction, then stores the returned datasets on ``self.train_set`` and
        ``self.val_set``.
        """
        # Get the table, the fraction and the split function from the config
        data = self.data_config['data']
        split_pcent = self.data_config['split_pcent']
        data_ret_fn = self.data_config['data_ret']

        # The split function returns a dict with 'train' and 'val' datasets
        dataset_cache = data_ret_fn(data, split_pcent)
        self.train_set = dataset_cache['train']
        self.val_set = dataset_cache['val']

    def train_dataloader(self):
        """Return the shuffled training dataloader (batch size ``train_bs``)."""
        return DataLoader(
            dataset=self.train_set,
            shuffle=True,
            batch_size=self.data_config['train_bs'],
            num_workers=self.data_config['num_workers'],
        )

    def val_dataloader(self):
        """Return the unshuffled validation dataloader (batch size ``val_bs``)."""
        return DataLoader(
            dataset=self.val_set,
            shuffle=False,
            batch_size=self.data_config['val_bs'],
            num_workers=self.data_config['num_workers'],
        )

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss of one training batch."""
        data, targets = batch
        outputs = self(data)
        loss = F.cross_entropy(outputs, targets)
        # Log the loss so training progress is visible in the logger.
        self.log('train_loss', loss)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute the cross-entropy loss of one validation batch."""
        data, targets = batch
        outputs = self(data)
        val_loss = F.cross_entropy(outputs, targets)
        return {'val_loss': val_loss}

    def validation_epoch_end(self, outputs):
        """
        Average the per-batch validation losses and log the result.

        ``outputs`` is the list of dicts returned by ``validation_step``.
        """
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        # Previously the average was computed and then discarded; log it so the
        # validation loss is actually reported once per epoch.
        self.log('val_loss', avg_loss, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [None]:
# Run the training loop
model = TPSModel()
# Use one GPU when CUDA is available and fall back to the CPU otherwise, so the
# cell does not fail on machines without a GPU.
trainer = pl.Trainer(max_epochs=10, gpus=1 if torch.cuda.is_available() else 0)
trainer.fit(model)

In [None]:
# Show the upvote reminder via cprint, using rich's bold red markup.
cprint('[red bold]If you like my notebook, please leave an Upvote![/red bold]')