# Introduction

Perform a binary classification to estimate the chance that Team A and Team B score in Rocket League.

In [None]:
# Import Standard Libraries
import pandas as pd
import os

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib.ticker as ticker
import seaborn as sns

# Let wide DataFrames display up to 500 columns before being truncated
pd.options.display.max_columns = 500

In [None]:
# Background colour shared by the figure and the axes
background_colour = '#E5E8E8'

# Matplotlib rc overrides layered on top of the Seaborn style
theme_parameters = {
    # Axes borders: drop the right and top spines
    'axes.spines.right': False,
    'axes.spines.top': False,
    # Grid transparency
    'grid.alpha': 0.3,
    # Default figure size, font and title size
    'figure.figsize': (16, 6),
    'font.family': 'Andale Mono',
    'axes.titlesize': 24,
    # Background colours
    'figure.facecolor': background_colour,
    'axes.facecolor': background_colour,
}

# Apply the theme to every plot created from here on
deep_palette = sns.color_palette('deep')
sns.set_theme(style='whitegrid', palette=deep_palette, rc=theme_parameters)

In [None]:
# Notebook-wide settings

# Number of rows read from each train file
sample_size = 5_000

# Common path prefix of the train files (file index and '.csv' are appended when reading)
input_path_prefix = '../input/tabular-playground-series-oct-2022/train_'

# Read Data

Since the data are too big to be read in full, only a sample from each train file (the first `sample_size` rows) is read, and the samples are concatenated into a single DataFrame.

## Train Data

In [None]:
# Load the table that lists the dtype of every train column
train_dtypes_df = pd.read_csv('../input/tabular-playground-series-oct-2022/train_dtypes.csv')

# Build the {column name: dtype} mapping passed to pd.read_csv
train_dtypes_dict = train_dtypes_df.set_index('column')['dtype'].to_dict()

In [None]:
# Read the first `sample_size` rows of each of the 10 train files.
# The chunks are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration, and
# seeding it with an empty DataFrame relies on empty frames being ignored for
# dtype inference, which recent pandas versions deprecate.
train_chunks = []

for index in range(10):

    # Path of the index-th train file
    input_train_data_path = input_path_prefix + str(index) + '.csv'

    # Read only the head of the file, with the dtypes declared for the train data
    train_chunks.append(pd.read_csv(input_train_data_path,
                                    dtype=train_dtypes_dict,
                                    nrows=sample_size))

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating every file's own 0..sample_size-1 row labels
train_data = pd.concat(train_chunks, ignore_index=True)

In [None]:
# Print a summary of the concatenated train sample (columns, dtypes, non-null counts)
train_data.info()

In [None]:
# Display the first rows of the concatenated train sample
train_data.head()

## Test Data

In [None]:
# Load the table that lists the dtype of every test column
test_dtypes_df = pd.read_csv('../input/tabular-playground-series-oct-2022/test_dtypes.csv')

# Build the {column name: dtype} mapping passed to pd.read_csv
test_dtypes_dict = test_dtypes_df.set_index('column')['dtype'].to_dict()

In [None]:
# Read the test file in full (no row limit), casting columns to the dtypes in test_dtypes_dict
test_data = pd.read_csv('../input/tabular-playground-series-oct-2022/test.csv',
                        dtype=test_dtypes_dict)

# Exploratory Data Analysis

## Null Values Percentage

In [None]:
# Datasets to inspect, keyed by the title shown above each subplot
data_to_display = {'Train Data': train_data, 'Test Data': test_data}

# One tall figure holding a single column of subplots, one row per dataset
figure = plt.figure(tight_layout=True, figsize=(16, 20))
grid_spec = GridSpec(len(data_to_display), 1)

for row, (name, dataframe) in enumerate(data_to_display.items()):

    # Long format: one (Feature, Missing) record per cell of the dataset
    missing_flags = dataframe.isna().melt(var_name='Feature', value_name='Missing')

    # Subplot reserved for this dataset
    ax = figure.add_subplot(grid_spec[row, 0])

    # multiple='fill' stretches every bar to the full width, so the split by
    # hue shows the share of missing vs. present values of each feature
    sns.histplot(data=missing_flags,
                 y='Feature',
                 hue='Missing',
                 multiple='fill',
                 ax=ax)

    # 21 evenly spaced x ticks, rendered as whole percentages
    ax.xaxis.set_major_locator(ticker.LinearLocator(21))
    ax.xaxis.set_major_formatter('{:.0%}'.format)

    # Dataset name as title, plus the x-axis label
    ax.set_title(name, fontsize=15)
    ax.set_xlabel('Null Percentage')

## Ball Position Distribution

In [None]:
# Columns holding the ball's position along the x, y and z axes
ball_position = [
    'ball_pos_x',
    'ball_pos_y',
    'ball_pos_z',
]

In [None]:
# One stacked subplot per ball position feature. squeeze=False always returns
# a 2-D array of axes, so flatten() works for any number of features.
figure, ax = plt.subplots(len(ball_position), 1, figsize=(16, 9), squeeze=False)
ax = ax.flatten()

# Plot the distribution of each ball position feature on its own subplot
for index, column in enumerate(ball_position):

    # Histogram of the feature values in the train sample
    sns.histplot(data=train_data[column],
                 ax=ax[index])

    # Title the subplot with the plotted feature. The previous code joined an
    # undefined name (`combination`) here and raised a NameError.
    ax[index].set_title(column, fontsize=14)

plt.tight_layout()