In [None]:
#Importing Necessary Libraries

import pandas as pd

import numpy as np

Data Analysis

In [None]:
# Load the Kaggle Titanic training set into `train`
train_csv_path = "/kaggle/input/titanic/train.csv"
train = pd.read_csv(train_csv_path)

In [None]:
# Column dtypes and non-null counts of the freshly loaded training frame
train.info()

In [None]:
# Preview the first rows of the freshly loaded training frame
train.head()

In [None]:
# List the distinct Name strings; extract_title (defined in the next cell) parses the title out of them
train["Name"].unique()

Feature Engineering

In [None]:
#Creating a function to extract titles from Name Column and Create a new Feature Named Title

def extract_title(df):
    """Return a copy of ``df`` with a ``Title`` column derived from ``Name``.

    The title is the text between the first comma and the following period
    (``"Braund, Mr. Owen Harris"`` -> ``"Mr"``). A leading ``"the "`` is
    dropped, so ``"Rothes, the Countess. of ..."`` yields ``"Countess"`` and is
    grouped with the other rare titles instead of surviving as a separate
    ``"the Countess"`` category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``Title`` column. A name that is missing
        or does not contain the ``", <title>."`` pattern gets a missing
        ``Title`` instead of raising.
    """
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']

    # Uncommon honorifics collapse into 'Rare'; alternate spellings are folded
    # into 'Miss' / 'Mrs'.
    title_map = {title: 'Rare' for title in rare_titles}
    title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Work on a copy so the caller's frame (and anything aliasing it) is left as is.
    result = df.copy()

    # ", " + optional "the " + title (captured, lazily) + "."
    titles = result['Name'].str.extract(r',\s*(?:the\s+)?([^.,]+?)\s*\.', expand=False)

    result['Title'] = titles.replace(title_map)

    return result



# Add the Title feature to the training data
train = extract_title(train)

In [None]:
# Remove the columns that are not used as model features
unused_columns = ["Ticket", "PassengerId", "Cabin", "Name"]
train = train.drop(columns=unused_columns)

In [None]:
# Preview the frame after adding Title and dropping Ticket, PassengerId, Cabin and Name
train.head()

In [None]:
# Re-check dtypes and non-null counts of the remaining columns
train.info()

In [None]:
#Splitting Train data into X_train (features) and Y_train (target)

# Select by column name instead of by position so the split does not silently
# depend on the column order of `train`.
X_train=train.drop(columns="Survived")

# Keep the target as a 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,) and warn when handed a single-column DataFrame.
Y_train=train["Survived"]

In [None]:
# Inspect the target
Y_train

In [None]:
# Inspect the feature frame
X_train

Preprocessing Pipelines

In [None]:
#Importing Necessary Libraries

from sklearn.preprocessing import OneHotEncoder,StandardScaler

from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

In [None]:
# Column groups consumed by the preprocessing pipeline.

# Columns routed through the numeric pipeline
num_feature = [
    "Pclass",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

# Columns routed through the categorical pipeline
cat_feature = [
    "Sex",
    "Embarked",
    "Title",
]

In [None]:
#Creating Numerical and Categorical Pipelines

# Numeric columns: fill missing values with the column median, then standardise.
num_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes a category that was not present
# during fit encode as all zeros at transform time instead of raising, so data
# with an unseen value can still be transformed.
cat_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])


In [None]:
# Route each column group through its own sub-pipeline:
# num_feature -> num_transformer, cat_feature -> cat_transformer.
column_pipelines = [
    ('num', num_transformer, num_feature),
    ('cat', cat_transformer, cat_feature),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [None]:
#Fitting the preprocessor on X_train and transforming X_train in the same step

transformed_X_train=preprocessor.fit_transform(X_train)

Model Selection and Training

In [None]:
#importing necessary libraries

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

In [None]:
#Using default parameters for the models, plus a fixed seed where training is randomised

# Single seed so that a fresh "Restart & Run All" reproduces the same scores.
RANDOM_STATE=42

Log=LogisticRegression()

svm=SVC()

# Tree ensembles draw random numbers while training; without a seed their
# scores change from run to run.
Forest=RandomForestClassifier(random_state=RANDOM_STATE)

# NOTE: despite the name, XGB is scikit-learn's GradientBoostingClassifier,
# not a model from the xgboost library.
XGB=GradientBoostingClassifier(random_state=RANDOM_STATE)

In [None]:
#Fitting Different Models

In [None]:
# Fit the random forest on the preprocessed training matrix
Forest.fit(transformed_X_train,Y_train)

In [None]:
# Fit the gradient boosting model (XGB is a GradientBoostingClassifier) on the preprocessed training matrix
XGB.fit(transformed_X_train,Y_train)

In [None]:
# Fit the logistic regression on the preprocessed training matrix
Log.fit(transformed_X_train,Y_train)

In [None]:
# Fit the support vector classifier on the preprocessed training matrix
svm.fit(transformed_X_train,Y_train)

In [None]:
#Scoring Different Models

In [None]:
# Accuracy on the same data the model was fit on (training accuracy), so an
# optimistic figure; the cross-validation scores further down are the fairer comparison.
Forest.score(transformed_X_train,Y_train)

In [None]:
# Training accuracy of the gradient boosting model (scored on the data it was fit on)
XGB.score(transformed_X_train,Y_train)

In [None]:
# Training accuracy of the logistic regression (scored on the data it was fit on)
Log.score(transformed_X_train,Y_train)

In [None]:
# Training accuracy of the support vector classifier (scored on the data it was fit on)
svm.score(transformed_X_train,Y_train)

Analysis Of Best Model

In [None]:
#Importing Cross Val score

from sklearn.model_selection import cross_val_score

In [None]:
# 4-fold cross-validation scores for each model on the training data.
# NOTE(review): transformed_X_train comes from a preprocessor that was fit on
# every training row, so each fold's held-out rows already contributed to the
# imputation and scaling statistics.
CV_score_svm, CV_score_Forest, CV_score_XGB, CV_score_Log = (
    cross_val_score(estimator, transformed_X_train, Y_train, cv=4)
    for estimator in (svm, Forest, XGB, Log)
)

In [None]:
# Print the per-fold cross-validation scores, one line per model
cv_report = (
    ("Support Vector Classification Score: ", CV_score_svm),
    ("Random Forest Classification Score: ", CV_score_Forest),
    ("Logistic Regression Score: ", CV_score_Log),
    ("Gradient Boost Score: ", CV_score_XGB),
)
for heading, fold_scores in cv_report:
    print(heading, fold_scores)

In [None]:
# Plotting library for the comparison chart
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# One line per model: x is the fold index, y is that fold's cross-validation score
score_series = [
    ('Support Vector', CV_score_svm),
    ('Random Forest', CV_score_Forest),
    ('Logistic Regression', CV_score_Log),
    ('Gradient Boost', CV_score_XGB),
]
for model_label, fold_scores in score_series:
    ax.plot(fold_scores, label=model_label)

# Title, axis labels and legend
ax.set(
    title='Classification Score Comparison',
    xlabel='Fold',
    ylabel='Score',
)
ax.legend()

# Render the figure
plt.show()

Preprocessing The Test data

In [None]:
# Load the Kaggle Titanic test set into `test`
test_csv_path = "/kaggle/input/titanic/test.csv"
test = pd.read_csv(test_csv_path)

In [None]:
#Copying the actual test data before making any changes

# .copy() creates an independent frame. A plain `test_copy=test` would only bind
# a second name to the same object, so in-place changes to `test` would show up
# in `test_copy` as well.
test_copy=test.copy()

In [None]:
# Column dtypes and non-null counts of the freshly loaded test frame
test.info()

In [None]:
#Adding the Title feature to the test data with the same extract_title function used on the training data

test=extract_title(test)

In [None]:
# Remove the same non-feature columns that were dropped from the training data
unused_test_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
test = test.drop(columns=unused_test_columns)

# Preview the remaining columns
test.head()

In [None]:
# Re-check dtypes and non-null counts of the test columns that remain
test.info()

In [None]:
#Applying the preprocessor that was fitted on X_train to the test data (transform only, no re-fitting)

transformed_X_test=preprocessor.transform(test)

In [None]:
#Getting the shape of transformed_X_test

transformed_X_test.shape

Predictions and exporting the final result

In [None]:
# Predict on the test matrix with the fitted SVC; the other fitted models are not used for the submission
final_predictions=svm.predict(transformed_X_test)

In [None]:
# Pair each PassengerId from the preserved test frame with its predicted label
submission_columns = {
    'PassengerId': test_copy['PassengerId'],
    'Survived': final_predictions,
}
output = pd.DataFrame(submission_columns)

In [None]:
#Exporting the dataframe to a CSV file named submission.csv (row index omitted)

output.to_csv("submission.csv",index=False)

print("Your Submission was Sucessfully Saved!!!!")

In [None]:
# Check the submission frame's columns, dtypes and non-null counts
output.info()