# Titanic Survivor Prediction
---

# Installations

In [None]:
!pip install -q autoviz
!pip install -q -U --pre pycaret

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from autoviz.classify_method import data_cleaning_suggestions ,data_suggestions

from sklearn.model_selection import cross_val_score

In [None]:
# Load the training and test splits from CSV files located in the current
# working directory.
train = pd.read_csv('trainv.csv')
test  = pd.read_csv('testv.csv')


# EDA

In [None]:
train.head()

In [None]:
train.shape

In [None]:
test.shape

In [None]:
data_cleaning_suggestions(train)

In [None]:
# Columns that contain at least one missing value.
null_cols = train.columns[train.isnull().any()]

# Impute numeric columns with their own mean. The result is assigned back to
# the frame instead of calling fillna(..., inplace=True) on train[col]: that
# form mutates a temporary Series, raises a chained-assignment warning in
# pandas 2.x and leaves `train` untouched once copy-on-write is enabled.
# Non-numeric columns are deliberately skipped here.
for col in null_cols:
    if pd.api.types.is_numeric_dtype(train[col]):
        train[col] = train[col].fillna(train[col].mean())

In [None]:
data_cleaning_suggestions(train)

In [None]:
# Replace every remaining missing value with 0. Non-object columns were
# already mean-imputed in the earlier cell, so what is left at this point sits
# in the object-dtype columns.
train.fillna(0, inplace = True)

In [None]:
data_cleaning_suggestions(train)

In [None]:
# Remove the identifier-like columns from the training frame in place.
train.drop(columns=['Name', 'PassengerId'], inplace=True)

In [None]:
data_cleaning_suggestions(train)

In [None]:
# Remove the same identifier-like columns from the test frame in place.
test.drop(columns=['Name', 'PassengerId'], inplace=True)

In [None]:
train.shape

In [None]:
test.shape

In [None]:
def detect_outliers(data):
    """Report the share of IQR outliers in every non-object column.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of its own column.

    Args:
        data: DataFrame to inspect. Columns with ``object`` dtype are skipped.

    Returns:
        DataFrame indexed by column name with a single ``Outlier_percentage``
        column (0-100), sorted in descending order. The frame is empty when
        ``data`` has no non-object column.
    """
    outlier_percents = {}
    total_rows = len(data)

    for column in data.columns:
        values = data[column]
        if values.dtype == object:
            continue

        # Series.quantile ignores NaN; np.quantile would return NaN bounds
        # and the column would silently be reported with 0% outliers.
        q1, q3 = values.quantile([0.25, 0.75])
        iqr = q3 - q1
        upper_bound = q3 + (1.5 * iqr)
        lower_bound = q1 - (1.5 * iqr)

        outlier_count = int(((values > upper_bound) | (values < lower_bound)).sum())
        # Guard against a division by zero on a frame without rows.
        outlier_percents[column] = (
            outlier_count / total_rows * 100 if total_rows else 0.0
        )

    # Build the report once, after the loop, so that a frame without any
    # non-object column yields an empty report instead of an UnboundLocalError.
    outlier_dataframe = pd.DataFrame(
        data=list(outlier_percents.values()),
        index=list(outlier_percents.keys()),
        columns=['Outlier_percentage'],
    )
    return outlier_dataframe.sort_values(by='Outlier_percentage', ascending=False)

detect_outliers(train)

# Correlation Matrix

In [None]:
# Correlation heatmap of the numeric features. numeric_only=True is needed
# because `train` still holds non-numeric columns at this point (e.g. "Sex"
# is only label-encoded further down), and pandas >= 2.0 raises on them
# instead of silently dropping them.
plt.figure(figsize=(20, 10))
sns.heatmap(train.corr(numeric_only=True), annot=True)

In [None]:
def plots(df, variable):
    """Draw distribution diagnostics for a single non-object column.

    Produces a one-row figure with four panels: a histogram with KDE overlay,
    a KDE plot, a boxplot, and a scatterplot of the values against the
    DataFrame index. Columns with ``object`` dtype are skipped silently.

    Args:
        df: DataFrame that contains the column.
        variable: Name of the column to plot.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    series = df[variable]
    if series.dtype == object:
        return

    # Exactly one axis per plot, so no empty panel is rendered.
    fig, axes = plt.subplots(1, 4, figsize=(24, 4))
    hist_ax, kde_ax, box_ax, scatter_ax = axes

    sns.histplot(series, bins=30, kde=True, ax=hist_ax)
    hist_ax.set_title('Histogram')

    sns.kdeplot(series, ax=kde_ax)
    kde_ax.set_title('KDE Plot')

    sns.boxplot(y=series, ax=box_ax)
    box_ax.set_title('Boxplot')

    # Values plotted against the row index to expose ordering effects.
    sns.scatterplot(x=df.index, y=series, ax=scatter_ax)
    scatter_ax.set_title('Scatterplot')

    plt.tight_layout()
    plt.show()

# Render the diagnostic panels for each column of the training frame;
# plots() itself skips the object-dtype columns.
for column_name in train.columns:
    plots(train, column_name)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the categorical "Sex" column of the training frame into integer labels.
le = LabelEncoder()
le.fit(train["Sex"])
train["Sex"] = le.transform(train["Sex"])
train["Sex"]

# ML Model

In [None]:
train.columns

In [None]:
# Separate the target vector from the feature matrix.
y = train['Survived']
X = train.drop(columns='Survived')

In [None]:
import category_encoders as ce
# Fit a target encoder on (X, y) and replace X with its encoded version.
# NOTE(review): no `cols` argument is given, so the encoder presumably picks
# the categorical columns on its own — confirm against the category_encoders
# documentation.
target_encoder = ce.TargetEncoder()
X = target_encoder.fit_transform(X, y)

In [None]:
from pycaret.classification import *

In [None]:
# Initialise the PyCaret experiment with 'Survived' as the target. Note that
# the full `train` frame is passed here, not the target-encoded X built in an
# earlier cell.
setup(data = train,  target = 'Survived')

In [None]:
compare_models()

In [None]:
lda = create_model('lda')

In [None]:
evaluate_model(lda)

In [None]:
# Hold out 10% of the rows. The hold-out has to be selected with the ORIGINAL
# row labels of the sample: resetting the sample's index first would make
# `drop` remove the first 90% of the rows instead of the sampled ones, leaving
# an "unseen" set that overlaps the sample.
# NOTE(review): `lda` was created after setup() on the whole `train` frame, so
# these rows may not be truly unseen by the model — confirm the intended order.
sampled = train.sample(frac=0.9, random_state=786)
data_unseen = train.drop(sampled.index).reset_index(drop=True)
data = sampled.reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

In [None]:
predict_model(lda)

In [None]:
unseen_predictions = predict_model(lda, data=data_unseen)
unseen_predictions.head()

In [None]:
# Encode "Sex" in the test frame with the encoder `le` that was fitted on the
# training frame, so both frames share one label -> integer mapping. Fitting a
# fresh encoder on the test data could assign different codes, and a label
# that never occurred in training now fails loudly instead of being remapped.
test["Sex"] = le.transform(test["Sex"])
test["Sex"]

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validate the `lda` model with 20 folds on the target-encoded features
# X and the target y.
# NOTE(review): `scores` is not displayed or used in the visible part of this
# notebook.
scores = cross_val_score(lda, X, y, cv=20)

In [None]:
predict_model(lda)

In [None]:
preds = predict_model(lda, data=test)

In [None]:
preds.shape

In [None]:
preds

In [None]:
# Load the sample submission file to use as a template for the output, then
# display its shape.
sub = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
sub.shape

In [None]:
# Clear the placeholder target column of the submission template ...
sub.drop(columns='Survived', inplace=True)
# ... and take the predicted labels out of the prediction frame.
column_to_move = preds.pop('prediction_label')

In [None]:
# Attach the predicted labels to the submission frame as its 'Survived' column.
sub['Survived'] = column_to_move

In [None]:
# Write the submission and read it back from the same path as a sanity check.
# Using one path for both steps keeps the cell working whatever the current
# working directory is; reading from a hard-coded /kaggle/working path only
# works when the notebook happens to run from that folder.
submission_path = 'submission.csv'
sub.to_csv(submission_path, index=False)
sub = pd.read_csv(submission_path)
sub

# Thank You
___