# Titanic - Machine Learning from Disaster

In [1]:
# Data processing
import numpy as np
import pandas as pd
from scipy.stats import skew

## Raw Data

In [2]:
# Read the training split from disk and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer='raw_data/train.csv')
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Read the test split from disk and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='raw_data/test.csv')
df_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Target Selection

The target variable is the `Survived` column. It is present in `df_train` but not in `df_test`.

## Exploratory Data Analysis (EDA)

1. Check the data types of the training set

In [None]:
# Tally how many columns of the training set share each dtype.
col_counts = df_train.dtypes.groupby(by=df_train.dtypes).count()

# Present the tally as a two-column table, with dtype names rendered as strings.
col_counts_df = pd.DataFrame(
    {
        'Data Type': col_counts.index.astype(str),
        'Count': col_counts.values,
    }
)
col_counts_df

2. Inspect the numerical variables

In [None]:
# Per-column summary of the numeric features in the training set.
#
# For every int64/float64 column of `df_train` this records: dtype, non-NaN /
# NaN / zero counts, mean, std, min, quartiles, max, the number of values
# outside the 1.5 * IQR fences, and the skewness (`skew` is scipy.stats.skew,
# imported in the notebook's import cell).
numeric_df = df_train.select_dtypes(include=['int64', 'float64']).columns

# One list of statistics per numeric column; turned into a DataFrame below.
results_list = []

for col in numeric_df:
    col_dtype = str(df_train[col].dtype)

    # Basic counts for the column
    non_nan_count = df_train[col].count()
    nan_count = df_train[col].isna().sum()
    zero_count = (df_train[col] == 0).sum()

    if non_nan_count > 0:
        # Statistics are computed on the non-NaN values only
        non_nan_values = df_train[col].dropna()
        mean = non_nan_values.mean()
        std = non_nan_values.std()
        minimum = non_nan_values.min()
        percentile25, median, percentile75 = np.percentile(non_nan_values, [25, 50, 75])
        maximum = non_nan_values.max()

        # Outliers are the values lying more than 1.5 * IQR beyond the quartiles
        iqr = percentile75 - percentile25
        num_upper_outliers = (non_nan_values > percentile75 + 1.5 * iqr).sum()
        num_lower_outliers = (non_nan_values < percentile25 - 1.5 * iqr).sum()
        skewness = skew(non_nan_values)
    else:
        # A column with no observed values has no statistics. Record NaN / 0
        # explicitly so the row never reuses the previous column's numbers
        # (or raises NameError when it is the first column).
        mean = std = minimum = maximum = np.nan
        percentile25 = median = percentile75 = np.nan
        num_lower_outliers = num_upper_outliers = 0
        skewness = np.nan

    # Add the results to the results list for this column
    results_list.append([col, col_dtype, non_nan_count, nan_count, zero_count, mean, std, minimum, percentile25, median, percentile75, maximum, num_lower_outliers, num_upper_outliers, skewness])

# Create the result DataFrame from the list of results
numeric_summary_df = pd.DataFrame(results_list, columns=["Column", "Column Type", "Non-NaN Count", "NaN Count", "Zero Count", "Mean", "Std", "Min", "25%", "Median", "75%", "Max", "Num Lower Outliers", "Num Upper Outliers", "Skew"])

numeric_summary_df


3. Inspect the categorical variables