**Table of Contents**
1. Data Collection
2. Data Exploration (EDA)
3. Data Cleaning/Feature Engineering
4. Model Building


1. **Data Collection**

In [None]:
#Commonly Used Libraries
import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import re #Regular expressions (Used for strings)
from collections import Counter #Counts amount of occurrences
import seaborn as sns  # Machine learning; Statistical graphics & visualizations
import xgboost as xgb  # Machine learning; Gradient boosted decision trees 
from scipy import stats # Stats

#Matplotlib
from matplotlib import pyplot as plt #Visualizations
import plotly.offline as py #Composing, editing, and sharing interactive data visualization 
from matplotlib import pyplot
py.init_notebook_mode(connected=True) #Enable plotly interactive plotting
import plotly.graph_objs as go #Graph Objetcs
import plotly.tools as tls #Tools
from collections import Counter #Counts amount of occurrences
import xgboost as xgb  # Machine learning; Gradient boosted decision trees 
import seaborn as sns  # Machine learning; Statistical graphics & visualizations

#Machine Learning Algorithms
import sklearn #Machine Learning
from sklearn.linear_model import LogisticRegression #Logistic Regression
from sklearn.svm import SVC, LinearSVC #Support Vector Classifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier) #Several Classifiers
from sklearn.neighbors import KNeighborsClassifier #KNeighbors
from sklearn.naive_bayes import GaussianNB #Assumes Gaussian Distribution
from sklearn.linear_model import Perceptron #Prediction Error
from sklearn.linear_model import SGDClassifier #Stochastic Gradient Descent
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier #Decision Tree Classifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold #Statistical Approaches
from sklearn.preprocessing import StandardScaler #Scales To Unit Variance
from sklearn.model_selection import train_test_split #Train-Test Split
from sklearn.metrics import accuracy_score,classification_report, precision_recall_curve, confusion_matrix #Statistical Approaches
from scipy.stats import zscore
from scipy.stats.mstats import winsorize

import warnings

from sklearn.exceptions import ConvergenceWarning

# Calling ConvergenceWarning('ignore') only builds an exception object and discards it;
# a warnings filter is what actually silences these messages.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [None]:
# Load the train and test CSVs from the Kaggle input directory
train_csv = "/kaggle/input/titanic/train.csv"
test_csv = "/kaggle/input/titanic/test.csv"
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Preview the first rows of the training frame
train.head()

2. **Data Exploration (EDA)**

In [None]:
# Histogram (20 bins, KDE overlay) of each numeric feature, laid out on a 2x2 grid
numerical_columns = ['Age', 'Fare', 'Parch', 'SibSp']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
fig.suptitle("Distribution of Numerical Variables", fontsize=16)
for panel, feature in zip(axes.flat, numerical_columns):
    # axes.flat walks the grid row by row, i.e. the same order as axes[i//2, i%2]
    sns.histplot(train[feature], ax=panel, kde=True, bins=20)
    sns.despine()

plt.show()

In [None]:
# Encode and derive the features on the training frame that the EDA plots below rely on.
# NOTE(review): the .map() calls are not idempotent - running this cell a second time maps
# the already-encoded values to NaN, so reload `train` before re-running.

# Sex and Embarked -> integer codes
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S':0,'C':1,'Q':2}
train['Sex'] = train['Sex'].map(sex_codes)
train['Embarked'] = train['Embarked'].map(embarked_codes)

# Family = SibSp + Parch + 1 (the +1 presumably counts the passenger - confirm)
family_size = train['SibSp'] + train['Parch'] + 1
train['Family'] = family_size

# Age -> labelled bands
# NOTE(review): with right=False these edges give [0,9), [9,19), ..., [69,89), so the labels
# are off by one at every boundary (e.g. Age 9 is labelled '10-19'). The test-set cell uses
# the same edges, so any correction has to be made for both frames together.
age_bins = [0,9,19,29,39,49,59,69,89]
age_labels = ['0-9','10-19','20-29','30-39','40-49','50-59','60-69','70+']
train['AgeCategory'] = pd.cut(train['Age'], age_bins, right = False, labels = age_labels)

# Fare -> labelled bands
# NOTE(review): the last interval is [99,150); fares of 150 or more fall outside every bin
# and come out of pd.cut as NaN.
fare_bins = [0,49,99,150]
fare_labels = ['0-49','50-100','100+']
train['FareCategory'] = pd.cut(train['Fare'], fare_bins, right = False, labels = fare_labels)

# Columns plotted against Survived below
categorical_vars = ['Pclass', 'Sex', 'AgeCategory', 'FareCategory', 'Family', 'Embarked']

# 2x3 grid: one count plot per categorical feature, bars split by Survived
fig, axes = plt.subplots(nrows = 2, ncols = 3, figsize=(15, 10))
axes = axes.flatten()

for panel, var in zip(axes, categorical_vars):
    sns.countplot(data=train, x=var, hue='Survived', ax=panel)
    panel.set(title=f'Survival Counts by {var}', xlabel=var, ylabel='Count')

plt.tight_layout()
plt.show()

In [None]:
# 2x3 grid: bar plot of Survived (y) against each feature (x)
columns_to_plot = ['FareCategory', 'AgeCategory', 'Family', 'Pclass', 'Sex', 'Embarked']
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for panel, column in zip(axes, columns_to_plot):
    sns.barplot(data=train, x=column, y='Survived', ax=panel)
    panel.set(title=f'Survival Rate by {column}', xlabel=column, ylabel='Survival Rate')

plt.tight_layout()
plt.show()

3. **Data Cleaning/Feature Engineering**

In [None]:
# Deal with Outliers
# Numeric columns that are passed to remove_outliers_iqr below
numerical_columns = ['Age', 'SibSp', 'Parch', 'Fare']

def remove_outliers_iqr(data, columns):
    """Clip outliers in the given columns to the 1.5 * IQR fences.

    Despite the name, no rows are removed: every value below
    ``Q1 - 1.5 * IQR`` is replaced by that lower fence and every value above
    ``Q3 + 1.5 * IQR`` by that upper fence. Missing values are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; the function works on a copy.
    columns : iterable of str
        Names of the numeric columns to process.

    Returns
    -------
    tuple
        ``(clipped, replaced_outliers_count)`` where ``clipped`` is a copy of
        ``data`` with the listed columns clipped and ``replaced_outliers_count``
        maps each column name to the number of values that were replaced.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect
    clipped = data.copy()
    replaced_outliers_count = {}

    for column in columns:
        values = clipped[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Comparisons against NaN are False, so missing values are never counted
        outlier_mask = (values < lower_bound) | (values > upper_bound)

        # Vectorised replacement of the out-of-fence values; NaN entries stay NaN
        clipped[column] = values.clip(lower=lower_bound, upper=upper_bound)

        # After clipping nothing lies outside the fences, so the number of
        # replaced values is simply the number of outliers found beforehand
        replaced_outliers_count[column] = int(outlier_mask.sum())

    return clipped, replaced_outliers_count

# Run the IQR clipping on copies of both frames and report what changed.
# NOTE(review): the cells that follow keep working on `train` / `test`; the clipped frames
# produced here are only inspected in this cell.
replace_outliers_train, replaced_outliers_count_train = remove_outliers_iqr(train.copy(), numerical_columns)
replace_outliers_test, replaced_outliers_count_test = remove_outliers_iqr(test.copy(), numerical_columns)

# Per-column counts of replaced values
for heading, counts in (
    ("Replaced Outliers in Train Data:", replaced_outliers_count_train),
    ("\nReplaced Outliers in Test Data:", replaced_outliers_count_test),
):
    print(heading)
    print(counts)

# Shapes before and after, to confirm that no rows or columns were lost
for label, shape in (
    ("\nOriginal Train Shape:", train.shape),
    ("Train Shape After Outlier Replacement:", replace_outliers_train.shape),
    ("Original Test Shape:", test.shape),
    ("Test Shape After Outlier Replacement:", replace_outliers_test.shape),
):
    print(label, shape)


In [None]:
#Replace null values for both sets
# The fill values are learned from the training set only and reused for the test set,
# so both frames are imputed with the same constants the model is trained on.
train_age_median = train['Age'].median()
train_fare_median = train['Fare'].median()

#Train set
train['Age'] = train['Age'].fillna(train_age_median)
train['Fare'] = train['Fare'].fillna(train_fare_median)

#Test set (filled with the training medians, not with its own statistics)
test['Age'] = test['Age'].fillna(train_age_median)
test['Fare'] = test['Fare'].fillna(train_fare_median)

# Remaining null counts per test column
test.isnull().sum()

In [None]:
# Mirror the training-set feature engineering on the test frame.
# NOTE(review): Embarked is encoded for train but not here; both frames drop the column in
# the column-drop cell further down.

# Sex -> integer code (same mapping as for train)
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

# Family = SibSp + Parch + 1
test['Family'] = test['SibSp'] + test['Parch'] + 1

# Age / Fare bands: reuse age_bins, age_labels, fare_bins and fare_labels exactly as declared
# in the training feature-engineering cell instead of re-declaring identical copies, so the
# two frames cannot drift apart.
test['AgeCategory'] = pd.cut(test['Age'], age_bins, right = False, labels = age_labels)
test['FareCategory'] = pd.cut(test['Fare'], fare_bins, right = False, labels = fare_labels)

test.head()

In [None]:
#Replace null values for both new Columns

def _label_fares_past_last_bin(frame):
    """Give fares at or above the last bin edge the highest FareCategory label.

    pd.cut leaves values outside the bin edges as NaN. Without this step those
    rows would be filled with the most frequent band below, which would file
    the most expensive tickets under whatever band happens to be most common.
    The frame is updated in place.
    """
    if 'Fare' not in frame.columns:
        # Raw fares already dropped (cell re-run out of order): nothing to recover from
        return
    past_last_edge = frame['FareCategory'].isna() & (frame['Fare'] >= fare_bins[-1])
    frame.loc[past_last_edge, 'FareCategory'] = frame['FareCategory'].cat.categories[-1]

for frame in (train, test):
    _label_fares_past_last_bin(frame)

# Whatever is still missing is filled with the most frequent band of the training set;
# the same fill values are reused for the test set so both frames are treated alike.
train_agecategory_mode = train['AgeCategory'].mode().iloc[0]
train_farecategory_mode = train['FareCategory'].mode().iloc[0]

#Train Set
train['AgeCategory'] = train['AgeCategory'].fillna(train_agecategory_mode)
train['FareCategory'] = train['FareCategory'].fillna(train_farecategory_mode)

#Test Set
test['AgeCategory'] = test['AgeCategory'].fillna(train_agecategory_mode)
test['FareCategory'] = test['FareCategory'].fillna(train_farecategory_mode)

#Display nulls
train.isnull().sum()

In [None]:
# Drop the columns that are not fed to the models.
# PassengerId stays on the test frame because the submission cell reads it from there.
shared_columns_to_drop = ['Name','Age','Fare','SibSp','Parch','Ticket','Cabin','Embarked']

#Train Set
train_columns_to_drop = ['PassengerId'] + shared_columns_to_drop
train = train.drop(train_columns_to_drop, axis = 1)
#Test Set
test_columns_to_drop = list(shared_columns_to_drop)
test = test.drop(test_columns_to_drop, axis = 1)

In [None]:
# Swap the two categorical band columns for their integer category codes (nullable Int64)
# in both the train and the test frame
for frame in (train, test):
    for band_column in ('AgeCategory', 'FareCategory'):
        frame[band_column] = frame[band_column].cat.codes.astype('Int64')

# Show the resulting column dtypes of each frame
print(train.dtypes)
print(test.dtypes)

In [None]:
#View Train Set
# Preview the first rows of the training frame after the column drop and dtype conversion
train.head()

In [None]:
#View Test Set
# Preview the first rows of the test frame after the column drop and dtype conversion
test.head()

4. **Model Building**

In [None]:
# Annotated heatmap of the pairwise correlations between all remaining training columns
colormap = plt.cm.RdBu
plt.figure(figsize = (14,12))
plt.title('Pearson Correlation of Features', y = 1.05, size = 15)
# Cast everything to float first so the nullable Int64 columns take part in corr()
correlations = train.astype(float).corr()
sns.heatmap(
    correlations,
    annot = True,
    cmap = colormap,
    square = True,
    vmax = 1.0,
    linewidths = .1,
    linecolor = 'white',
)

In [None]:
#Run different classification models
#Create survived column in test data (placeholder; overwritten with predictions after the final fit)
test['Survived'] = ''

# Feature matrices and target vector
train_data = train.drop('Survived', axis=1)
test_data = test.drop(['Survived', 'PassengerId'], axis=1)  # Remove 'PassengerId' from test data
target = train['Survived']

#Use K-fold cross validation
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# Every estimator that draws random numbers is given a fixed random_state so that the
# cross-validation scores are identical on every run of the notebook.
classifiers =[
    KNeighborsClassifier(n_neighbors=13),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(n_estimators=13, random_state=0),
    GaussianNB(),
    SVC(),
    ExtraTreeClassifier(random_state=0),
    GradientBoostingClassifier(n_estimators=10, learning_rate=1, max_features=3, max_depth=3, random_state=10),
    AdaBoostClassifier(random_state=0),
    ExtraTreesClassifier(random_state=0)
    ]

#Cross-validate every candidate classifier
def model_fit():
    """Print the mean cross-validated accuracy (in percent) of each entry in `classifiers`.

    Reads the notebook-level `classifiers`, `train_data`, `target` and `k_fold`;
    models are identified in the output by their position in `classifiers`.
    """
    scoring = 'accuracy'
    for position, estimator in enumerate(classifiers):
        fold_scores = cross_val_score(estimator, train_data, target, scoring=scoring, cv=k_fold, n_jobs=1)
        mean_percent = round(np.mean(fold_scores) * 100, 2)
        print("Score of Model", position, ":", mean_percent)
model_fit()

# Final model: a default SVC fitted on the full training set.
# NOTE(review): the choice is hard-coded here rather than derived from the scores printed above.
final_classifier = SVC()
final_classifier.fit(train_data, target)

# Predict the test rows and store the result on the test frame
test_prediction = final_classifier.predict(test_data)
test['Survived'] = test_prediction

In [None]:
# Write the submission file: PassengerId from the test frame plus the predicted Survived values
file_path = '/kaggle/working/submission.csv'

submission_columns = {'PassengerId': test['PassengerId'], 'Survived': test_prediction}
submission = pd.DataFrame(submission_columns)

submission.to_csv(file_path, index=False)
print("Successful Submission")