# <p style="background-color:#F8C1EE; font-family:newtimeroman; font-size:250%; text-align:center; border-radius: 15px 50px;"><b>Tabular Playground Series April</b> <br><br> EDA 🔍, Outliers, Correlations and Baseline 📈</p>

# <p style="background-color:#F8C1EE; font-family:newtimeroman; font-size:100%; text-align:center; border-radius: 15px 50px;">Please <u>upvote</u> if you find this notebook useful or interesting, I really appreciate the encouragement. Thanks!</p>

In [None]:
!pip install ethnicolr
!pip install pandas

In [None]:
#%%capture

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import LocalOutlierFactor
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import norm
import scipy.stats as st

try:
    from collections import OrderedDict
except ImportError:
    from ordereddict import OrderedDict
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from ethnicolr import census_ln, pred_census_ln

import os
# List every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# Load the training split and preview it: first rows, then summary statistics.
df_train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
display(df_train.head())
# Last expression of the cell, so the notebook renders the describe() table.
df_train.describe()

In [None]:
# Show the column names available in the training data.
print(df_train.columns)

In [None]:
# Continuous features used in the outlier, violin-plot and ECDF sections.
cont_FEATURES = ['Age', 'Fare']

# Categorical features whose value counts are plotted as histograms.
cat_FEATURES = ['Pclass', 'Sex']

# Cleaning the Dataset



### Invalid Values

In [None]:
# Print per-column dtypes and non-null counts to spot missing values.
df_train.info()

In [None]:
# NOTE: fillna returns a new DataFrame; the result is only displayed here and
# df_train itself is left unchanged (its missing values are still present).
df_train.fillna(0)

From the above we can see that there are null values in the data. #TODO: Tidy these up

# Outliers

In [None]:
def plot_outliers(df, feature, threshold=5):
    """Flag z-score outliers in a single numeric feature and plot them.

    Missing values are ignored when computing the mean and standard deviation
    and are not counted (or drawn) as rejected points.

    Args:
        df: One-dimensional numeric data (e.g. a pandas Series); may contain NaN.
        feature: Feature name, used as the plot title.
        threshold: Absolute z-score at or above which a value is rejected.

    Returns:
        Boolean mask with the same shape as ``df``: True where the value is
        present and its absolute z-score is below ``threshold``. Missing
        values are False.
    """
    # NaN-aware statistics (population std, ddof=0).
    mean, std = np.nanmean(df), np.nanstd(df)
    deviation = np.abs(df - mean)
    # A constant (or entirely missing) feature has no usable spread; give every
    # present value a z-score of 0 instead of dividing by zero.
    z_score = deviation / std if std > 0 else deviation * 0
    good = z_score < threshold
    missing = np.isnan(z_score)
    # Only values that are present AND too far from the mean are real rejections.
    rejected = ~good & ~missing

    print(f"Rejection {rejected.sum()} points")
    # Random vertical jitter purely so overlapping points can be told apart.
    visual_scatter = np.random.normal(size=np.size(df))
    good_mask, rejected_mask = np.asarray(good), np.asarray(rejected)
    plt.scatter(df[good], visual_scatter[good_mask], s=2, label="Good", color="#4CAF50")
    plt.scatter(df[rejected], visual_scatter[rejected_mask], s=8, label="Bad", color="#F44336")
    plt.legend(loc='upper right')
    plt.title(feature)
    plt.show()

    return good

def plot_lof_outliers(df, feature, n_neighbors=20, contamination=0.001):
    """Flag outliers with Local Outlier Factor and plot them.

    Args:
        df: Array-like of samples without missing values, either 1-D or 2-D
            with shape (n_samples, n_features). Only the first column is
            drawn on the x-axis.
        feature: Feature name, used as the plot title.
        n_neighbors: Number of neighbours used by LocalOutlierFactor.
        contamination: Expected proportion of outliers; this is the knob that
            controls how many points get rejected.

    Returns:
        Boolean numpy array of length n_samples, True for inliers.
    """
    points = np.asarray(df)
    if points.ndim == 1:
        # LocalOutlierFactor expects a 2-D (n_samples, n_features) array.
        points = points.reshape(-1, 1)

    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination, p=1)
    # fit_predict labels inliers as 1 and outliers as -1.
    good = lof.fit_predict(points) == 1
    print(f"Rejection {(~good).sum()} points")

    x_values = points[:, 0]
    # Random vertical jitter purely so overlapping points can be told apart.
    visual_scatter = np.random.normal(size=len(points))
    plt.scatter(x_values[good], visual_scatter[good], s=2, label="Good", color="#4CAF50")
    plt.scatter(x_values[~good], visual_scatter[~good], s=8, label="Bad", color="#F44336")
    plt.legend(loc='upper right')
    plt.title(feature)
    plt.show()

    return good

### Target Outliers

### Feature Outliers

In [None]:
# Run the z-score outlier check on each continuous feature in turn.
for feature, column in df_train[cont_FEATURES].items():
    print(feature)
    plot_outliers(column, feature)

We can see in the above that there aren't any reasonable outliers picked out. It has marked the high fares as outliers, but we can see from the graph that this isn't a reasonable thing to do.

In [None]:
for feature in cont_FEATURES:
    # Drop missing values, then reshape the remaining ones into a single
    # column, i.e. a 2-D (n_samples, 1) array, before running the LOF check.
    data = df_train[feature].dropna()
    plot_lof_outliers(data.values.reshape(-1, 1), feature)

There are a few outliers here but because they seem to be mixed in with the group I am going to leave them in the dataset. I imagine the `Age` outliers are just due to some ages being `X.5`.

# Analysing Distributions

### Continuous Variables

In [None]:
# One violin plot per continuous feature, split by the Survived target.
for feature in cont_FEATURES:
    violin_ax = sns.violinplot(data=df_train, x='Survived', y=feature, inner='quartile')
    violin_ax.set_title(feature)
    plt.show()

Here we can see that there are subtle differences in the `Fare` paid and whether someone survived. This could therefore be a very useful feature.

### Categorical Variables

In [None]:
# Plot how many rows fall into each category of every categorical feature.
for feature in cat_FEATURES:
    print(feature)
    sns.histplot(df_train[feature].values)
    plt.show()

This shows us that the classes are balanced enough that they won't cause any issues for our models.

# Empirical CDFs

In [None]:
def plot_cdf(df, feature, n_percentiles=10):
    """Plot the empirical CDF of a feature with normally-spaced percentile markers.

    Missing values are dropped first; otherwise every percentile would be NaN
    and the ECDF would be stretched by the missing entries.

    Args:
        df: One-dimensional numeric data (e.g. a pandas Series); may contain NaN.
        feature: Feature name, used as the plot title.
        n_percentiles: Number of percentile markers to draw.

    Returns:
        None. The figure is shown as a side effect.
    """
    values = np.asarray(df, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        print(f"No non-missing values to plot for {feature}")
        return

    # Percentile levels at evenly spaced z-scores in [-4, 4], so the markers
    # are denser in the tails than evenly spaced percentiles would be.
    ps = 100 * st.norm.cdf(np.linspace(-4, 4, n_percentiles))
    x_p = np.percentile(values, ps)

    xs = np.sort(values)
    ys = np.linspace(0, 1, values.size)

    plt.plot(xs, ys * 100, label="ECDF")
    plt.plot(x_p, ps, label="Percentiles", marker=".", ms=10)
    plt.legend()
    plt.ylabel("Percentile")
    plt.title(feature)
    plt.show()

# Draw the empirical CDF of every continuous feature.
for feature in cont_FEATURES:
    plot_cdf(df_train[feature], feature)

Here we can see that there is a significant skew in the `Fare` variable where 80% of the fares are below 100.

# Correlation

In [None]:
# This plots a 16x16 matrix of correlations between all the features and the target
# Note: I sometimes comment this out because it takes a few minutes to run and doesn't show any useful information.

#pd.plotting.scatter_matrix(df_train, figsize=(10, 10));

In [None]:
# Correlation is only defined for numeric columns. Select them explicitly:
# pandas >= 2.0 raises on string columns instead of silently dropping them.
fig, ax = plt.subplots(figsize=(10, 10))
numeric_train = df_train.drop(columns=['PassengerId']).select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(), annot=True, cmap='viridis', fmt='0.2f', ax=ax)

This heatmap shows us that there are some weak correlations between `Survived` and other variables. This could be useful to us, but also shows that there is no *silver bullet* feature that will completely solve our problems.

# Feature Engineering

Features to create:
- Passenger nationality (from their name)
- How good a deal did they get on their cabin?

### Categorical Features

Here we are extracting the letter and number from their cabin code since this likely gives us a good idea of where their cabin was on the boat.

In [None]:
def get_class_letter(text):
    """Return the first character of a cabin code, or None when there is none.

    Args:
        text: Cabin code string (e.g. "C24258"), or a missing value such as
            NaN or None.

    Returns:
        The leading character of ``text``, or None if ``text`` is not a
        non-empty string.
    """
    # Missing cabins arrive as float NaN (or None); an empty string has no letter.
    if not isinstance(text, str) or not text:
        return None
    return text[0]

def get_class_number(text):
    """Return the numeric part of a cabin code, or None when it cannot be parsed.

    Args:
        text: Cabin code string (e.g. "C24258"), or a missing value such as
            NaN or None.

    Returns:
        Everything after the first character converted to int, or None if
        ``text`` is not a string or the remainder is not a valid integer
        (e.g. "T" or "C23 C25").
    """
    if not isinstance(text, str):
        return None
    try:
        return int(text[1:])
    except ValueError:
        # Letter-only or multi-cabin codes have no single number to extract.
        return None

# Derive the two cabin features from the raw Cabin column.
for column, extractor in (('class_letter', get_class_letter),
                          ('class_number', get_class_number)):
    df_train[column] = df_train['Cabin'].apply(extractor)


In [None]:
# Frequency of each extracted cabin letter.
df_train['class_letter'].value_counts()

In [None]:
# Frequency of each extracted cabin number.
df_train['class_number'].value_counts()

In [None]:
# One-hot encode the embarkation port and the cabin letter, adding every
# indicator to df_train as '<prefix>_<category>'.
for source, prefix in (('Embarked', 'embarked'), ('class_letter', 'class_letter')):
    dummies = pd.get_dummies(df_train[source], prefix=prefix)
    for col in dummies.columns:
        df_train[col] = dummies[col]

In [None]:
# One-hot encode Sex into 'sex_<category>' indicator columns.
dummies = pd.get_dummies(df_train['Sex'], prefix='sex')
for col in dummies.columns:
    df_train[col] = dummies[col]

### Passenger Ethnicity and Nationality

In [None]:
# Columns read from the frame returned by census_ln in the following cells.
ethnicity_features = [
    'pctwhite',
    'pctblack',
    'pctapi',
    'pctaian',
    'pct2prace',
    'pcthispanic',
]

# Keep the text before the first comma of each name as the surname
# (assumes names are formatted "Surname, Given names").
df_train['last_name'] = df_train['Name'].apply(lambda full_name: full_name.partition(',')[0])
df_train = census_ln(df_train, 'last_name')

In [None]:
# Replace the (S) values in the ethnicity columns with a 0
# Replace the '(S)' placeholder (presumably a suppressed-value marker; verify
# against the census data) with 0 so the ethnicity columns can be cast to float.
df_train[ethnicity_features] = df_train[ethnicity_features].replace('(S)', 0).astype(float)

In [None]:
# One violin plot per ethnicity percentage, split by the Survived target.
for feature in ethnicity_features:
    violin_ax = sns.violinplot(data=df_train, x='Survived', y=feature, inner='quartile')
    violin_ax.set_title(feature)
    plt.show()

The above shows us that there was an equal distribution for Survived and Died but the comparison between these features will be interesting to observe in the model training.

### Continuous Features

Now we want to normalise our continuous variables, since the values of `Fare` (e.g. 400) are obviously much larger than `Age` (e.g. 21).

In [None]:
# Rescale each continuous feature to the [0, 1] range independently.
for feature in ['Fare', 'Age', 'pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace', 'pcthispanic']:
    x = df_train[feature].values.reshape(-1, 1)  # MinMaxScaler expects a 2-D array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    # Assign the scaled values positionally. Wrapping them in a DataFrame would
    # align on the index and produce NaN for any row whose label is not 0..n-1.
    df_train[feature] = x_scaled.ravel()

# Baseline

In [None]:
# Inspect the engineered frame: all column names, then the first rows.
print(df_train.columns)
df_train.head()


### Create our Train and Test Sets

In [None]:
# Columns fed to the models, assembled group by group (order is preserved).
model_FEATURES = (
    ['Pclass', 'sex_female', 'sex_male', 'Age', 'SibSp', 'Parch', 'Fare']
    + ['embarked_' + port for port in 'CQS']
    + ['class_number']
    + ['class_letter_' + letter for letter in 'ABCDEFGT']
    + ['pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace', 'pcthispanic']
)

# Remaining missing values are replaced with 0 before modelling.
X = df_train[model_FEATURES].fillna(0)
target = df_train['Survived']

# Hold out 33% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Logistic Regression
# Fit a baseline classifier on the training split with a fixed seed.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [None]:
# Report the classifier's score on the held-out split.
print(f'Logistic Regression Score: {clf.score(X_test, y_test)}')

# Random Forest

Random forests are known for being some of the best models for performing classification, so it is always good to experiment with them.

In [None]:
# Random forest limited to depth 4, with a fixed seed, fitted on the training split.
rf = RandomForestClassifier(max_depth=4, random_state=123)
rf.fit(X_train, y_train)

In [None]:
# Report the random forest's score on the held-out split.
print(f'Random Forest Score: {rf.score(X_test, y_test)}')

# Submission

### Preprocess Test Set

In [None]:
# Load the competition test split and repeat the feature engineering applied to df_train.
df_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

# Cabin letter and number parsed from the Cabin code.
df_test['class_letter'] = df_test['Cabin'].apply(get_class_letter)
df_test['class_number'] = df_test['Cabin'].apply(get_class_number)

# One-hot encode the categorical columns.
dummies = pd.get_dummies(df_test['Embarked'])
for col in dummies.columns:
    df_test['embarked_' + col] = dummies[col]

dummies = pd.get_dummies(df_test['class_letter'])
for col in dummies.columns:
    df_test['class_letter_' + col] = dummies[col]

dummies = pd.get_dummies(df_test['Sex'])
for col in dummies.columns:
    df_test['sex_' + col] = dummies[col]

# Surname-based census features; the '(S)' placeholder becomes 0 before the float cast.
df_test['last_name'] = df_test['Name'].apply(lambda x: x.split(',')[0])
df_test = census_ln(df_test, 'last_name')
df_test[ethnicity_features] = df_test[ethnicity_features].replace('(S)', 0).astype(float)

# NOTE(review): the scaler is re-fit on the test data, so test features are
# scaled with the test min/max rather than the training min/max. Ideally the
# scalers fitted on df_train would be kept and reused here with transform().
for feature in ['Fare', 'Age', 'pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace', 'pcthispanic']:
    x = df_test[feature].values.reshape(-1, 1)  # MinMaxScaler expects a 2-D array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    # Assign positionally; a DataFrame on the right-hand side would align on the index.
    df_test[feature] = x_scaled.ravel()

# reindex (instead of plain column selection) creates any indicator column that
# is absent from the test data, e.g. a cabin letter that never occurs in it.
# Those columns are then filled with 0 instead of raising a KeyError.
X = df_test.reindex(columns=model_FEATURES).fillna(0)

In [None]:
# Predict with the random forest on the X feature matrix from the cell above.
predictions = rf.predict(X)

In [None]:
# Preview the preprocessed test frame.
df_test.head()

In [None]:
# Pair every test PassengerId with its predicted Survived label.
df_predictions = pd.DataFrame(data={'PassengerId': df_test['PassengerId'], 'Survived': predictions})

In [None]:
# Preview the submission frame.
df_predictions.head()

In [None]:
# Write the submission file without the DataFrame index column.
df_predictions.to_csv('predictions.csv', index=False)