<a href="https://www.kaggle.com/code/shriyaa07/titanic-dataset?scriptVersionId=144946221" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Bharat Intern Internship
# Task 1 : Titanic Classification
# Name: Shriya

# **Data Preparation**

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [None]:
titanic = pd.read_csv('/kaggle/input/titanic-dataset/Titanic-Dataset.csv')

In [None]:
titanic

In [None]:
# Reading first 5 rows
titanic.head()

In [None]:
# Reading last 5 rows
titanic.tail()

In [None]:
# Showing no. of rows and columns of dataset
titanic.shape

In [None]:
# checking for columns
titanic.columns

# **Data Preprocessing and Data Cleaning**

In [None]:
# Checking for data types
titanic.dtypes

In [None]:
# checking for duplicated values
titanic.duplicated().sum()

In [None]:
# Missing-value count per column, largest first, restricted to the
# columns that actually contain at least one missing value.
nv = (
    titanic.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
nv

In [None]:
# Checking what percentage of each column's values are missing
titanic.isnull().sum().sort_values(ascending=False)*100/len(titanic)

In [None]:
# The Cabin column has more than 75% null values, so we drop it.
# The result is assigned back instead of using inplace=True, and
# errors='ignore' makes the cell safe to re-run: without it, running the
# cell a second time raises KeyError because 'Cabin' is already gone.
titanic = titanic.drop(columns='Cabin', errors='ignore')
titanic.columns

In [None]:
# Fill missing values in the Age column with the column mean.
# The filled Series is assigned back to the frame: calling
# fillna(..., inplace=True) on titanic['Age'] is a chained in-place write,
# which is deprecated and does not update the frame under pandas
# Copy-on-Write (the warning would also be hidden by filterwarnings('ignore')).
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

# Fill missing values in the Embarked column with its most frequent value.
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

In [None]:
# checking for null values
titanic.isna().sum()

In [None]:
# Finding no. of unique values in each column of dataset
titanic[['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked']].nunique().sort_values()

In [None]:
titanic['Survived'].unique()

In [None]:
titanic['Sex'].unique()

In [None]:
titanic['Pclass'].unique()

In [None]:
titanic['SibSp'].unique()

In [None]:
titanic['Parch'].unique()

In [None]:
titanic['Embarked'].unique()

# **Dropping Some Unnecessary Columns**
Three columns, 'PassengerId', 'Name' and 'Ticket', are unnecessary because they are of no use in data modelling. So, we will drop these 3 columns.

In [None]:
# Drop the identifier-like columns that are not used for modelling.
# Assigning the result back (rather than inplace=True) together with
# errors='ignore' keeps the cell re-runnable; a second in-place drop of the
# same columns would raise KeyError.
titanic = titanic.drop(columns=['PassengerId','Name','Ticket'], errors='ignore')
titanic.columns

In [None]:
# Showing information about the dataset
titanic.info()

In [None]:
# showing info. about numerical columns
titanic.describe()

In [None]:
# showing info. about categorical columns
titanic.describe(include='O')

# Data Visualization

In [None]:
d1 = titanic['Sex'].value_counts()
d1

In [None]:
# Plotting Count plot for sex column
sns.countplot(x=titanic['Sex'])
plt.show()

In [None]:
# Plotting Percentage Distribution of Sex Column
# d1 holds the value counts of the Sex column computed in an earlier cell.
fig, ax = plt.subplots(figsize=(5,5))
ax.pie(d1.values, labels=d1.index, autopct='%.2f%%')
ax.legend()
plt.show()

In [None]:
# Showing Distribution of Sex Column Survived Wise
# At this point Sex has not been label-encoded, so the x axis shows the
# original category labels; hue is the Survived column.
sns.countplot(data=titanic, x='Sex', hue='Survived')
plt.show()

In [None]:
# Showing Distribution of Embarked Sex wise
sns.countplot(x=titanic['Embarked'],hue=titanic['Sex'])
plt.show()

In [None]:
# Plotting CountPlot for Pclass Column
sns.countplot(x=titanic['Pclass'])
plt.show()

In [None]:
# Showing Distribution of Pclass Sex wise
sns.countplot(x=titanic['Pclass'],hue=titanic['Sex'])
plt.show()

In [None]:
# Age Distribution
sns.kdeplot(x=titanic['Age'])
plt.show()

In [None]:
# Plotting CountPlot for Survived Column
print(titanic['Survived'].value_counts())
sns.countplot(x=titanic['Survived'])
plt.show()

In [None]:
# Showing Distribution of Parch Survived Wise
sns.countplot(x=titanic['Parch'],hue=titanic['Survived'])
plt.show()

In [None]:
# Showing Distribution of SibSp Survived Wise
sns.countplot(x=titanic['SibSp'],hue=titanic['Survived'])
plt.show()

In [None]:
# Showing Distribution of Embarked Survived wise
sns.countplot(x=titanic['Embarked'],hue=titanic['Survived'])
plt.show()

In [None]:
# Showing Distribution of Age Survived Wise
sns.kdeplot(x=titanic['Age'],hue=titanic['Survived'])
plt.show()

In [None]:
# Plotting Histplot for Dataset
titanic.hist(figsize=(10,10))
plt.show()

In [None]:
# Plotting Boxplot for dataset
# Checking for outliers
# The frame is passed through the data= keyword so the call does not depend
# on seaborn's positional-argument order, and only the numeric columns are
# selected because a box plot is undefined for the string columns
# (Sex and Embarked are not yet encoded at this point).
sns.boxplot(data=titanic.select_dtypes(include='number'))
plt.show()

In [None]:
# Plotting pairplot
sns.pairplot(titanic)
plt.show()

# Checking the target variable


In [None]:
titanic['Survived'].value_counts()

In [None]:
sns.countplot(x=titanic['Survived'])
plt.show()

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column so every mapping can be inspected
# or inverted later. Re-fitting a single shared encoder would leave only
# the classes of the last column processed.
encoders = {}

# Apply label encoding to each categorical column
for column in ['Sex','Embarked']:
    le = LabelEncoder()
    titanic[column] = le.fit_transform(titanic[column])
    encoders[column] = le
    # Print the learned label -> code mapping so the notes below can be checked.
    print(column, dict(zip(le.classes_.tolist(), range(len(le.classes_)))))

titanic.head()
# LabelEncoder assigns codes in sorted order of the labels, which gives:

# Sex Column

# 0 represents female
# 1 represents Male

# Embarked Column

# 0 represents C
# 1 represents Q
# 2 represents S

# **Data Modelling**

In [None]:
# importing libraries

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# **Selecting the independent and dependent Features**

In [None]:
# Feature columns (independent variables) and the target (dependent variable).
cols = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
x, y = titanic[cols], titanic['Survived']

# Shapes first, then the container types (x is a DataFrame, y is a Series).
for part in (x, y):
    print(part.shape)
for part in (x, y):
    print(type(part))

In [None]:
x.head()

In [None]:
y.head()

# **Train_Test_Split**

In [None]:
# Approximate number of rows that a 10% hold-out takes from the feature
# matrix. The row count is read from x rather than hard-coded, so the figure
# stays correct if the dataset or the cleaning steps change.
print(len(x)*0.10)

In [None]:
# Hold out 10% of the rows for testing; random_state fixes the shuffle so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=1
)
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

# Creating functions to compute the Confusion Matrix and Classification Report, and to generate the Training and Testing Scores (Accuracy)

In [None]:
def cls_eval(ytest,ypred):
    """Print the confusion matrix and the classification report.

    Parameters
    ----------
    ytest : true labels.
    ypred : predicted labels, in the same order as ``ytest``.
    """
    print('Confusion Matrix\n', confusion_matrix(ytest, ypred))
    print('Classification Report\n', classification_report(ytest, ypred))

def mscore(model, xtrain=None, ytrain=None, xtest=None, ytest=None):
    """Print the training and testing score of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``.
    xtrain, ytrain, xtest, ytest : optional data to score on. Any argument
        left as ``None`` falls back to the notebook-level ``x_train``,
        ``y_train``, ``x_test`` or ``y_test``, so ``mscore(model)`` behaves
        exactly as before.
    """
    # The globals are looked up only for arguments that were not supplied,
    # so the function can also be used with a different split.
    xtrain = x_train if xtrain is None else xtrain
    ytrain = y_train if ytrain is None else ytrain
    xtest = x_test if xtest is None else xtest
    ytest = y_test if ytest is None else ytest
    print('Training Score',model.score(xtrain,ytrain))  # Training Accuracy
    print('Testing Score',model.score(xtest,ytest))     # Testing Accuracy

In [None]:
# Building the logistic Regression Model
lr = LogisticRegression(max_iter=1000,solver='liblinear')
lr.fit(x_train,y_train)

In [None]:
# Computing Training and Testing score
mscore(lr)

In [None]:
# Generating Prediction
ypred_lr = lr.predict(x_test)
print(ypred_lr)

In [None]:
# Evaluate the model - confusion matrix, classification Report, Accuracy score
cls_eval(y_test,ypred_lr)
acc_lr = accuracy_score(y_test,ypred_lr)
print('Accuracy Score',acc_lr)

In [None]:
# Building the knnClassifier Model
# KNN classifies by distance between rows, so the features are standardised
# first; otherwise wide-ranging columns such as Fare and Age dominate the
# distance. The scaler sits inside the pipeline, so it is fitted on the
# training rows only and applied automatically in predict()/score().
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=8))
knn.fit(x_train,y_train)

In [None]:
# Computing Training and Testing score
mscore(knn)

In [None]:
# Generating Prediction
ypred_knn = knn.predict(x_test)
print(ypred_knn)

In [None]:
# Evaluate the model - confusion matrix, classification Report, Accuracy score
cls_eval(y_test,ypred_knn)
acc_knn = accuracy_score(y_test,ypred_knn)
print('Accuracy Score',acc_knn)

In [None]:
# Building Support Vector Classifier Model
# The default RBF kernel is sensitive to feature scale, so the features are
# standardised inside a pipeline. The scaler is fitted on the training rows
# only and reused automatically in predict()/score().
svc = make_pipeline(StandardScaler(), SVC(C=1.0))
svc.fit(x_train, y_train)

In [None]:
# Computing Training and Testing score
mscore(svc)

In [None]:
# Generating Prediction
ypred_svc = svc.predict(x_test)
print(ypred_svc)

In [None]:
# Evaluate the model - confusion matrix, classification Report, Accuracy score
cls_eval(y_test,ypred_svc)
acc_svc = accuracy_score(y_test,ypred_svc)
print('Accuracy Score',acc_svc)

In [None]:
# Building the RandomForest Classifier Model
# random_state pins the bootstrap sampling and feature sub-sampling so the
# reported scores are reproducible across runs (same seed as the split).
rfc=RandomForestClassifier(n_estimators=80,criterion='entropy',min_samples_split=5,max_depth=10,random_state=1)
rfc.fit(x_train,y_train)

In [None]:
# Computing Training and Testing score
mscore(rfc)

In [None]:
# Generating Prediction
ypred_rfc = rfc.predict(x_test)
print(ypred_rfc)

In [None]:
# Evaluate the model - confusion matrix, classification Report, Accuracy score
cls_eval(y_test,ypred_rfc)
acc_rfc = accuracy_score(y_test,ypred_rfc)
print('Accuracy Score',acc_rfc)

In [None]:
# Building the DecisionTree Classifier Model
# random_state makes the tree reproducible: features are permuted at each
# split, so equally good splits can otherwise be chosen differently per run.
dt = DecisionTreeClassifier(max_depth=5,criterion='entropy',min_samples_split=10,random_state=1)
dt.fit(x_train, y_train)

In [None]:
# Computing Training and Testing score
mscore(dt)

In [None]:
# Generating Prediction
ypred_dt = dt.predict(x_test)
print(ypred_dt)

In [None]:
# Evaluate the model - confusion matrix, classification Report, Accuracy score
cls_eval(y_test,ypred_dt)
acc_dt = accuracy_score(y_test,ypred_dt)
print('Accuracy Score',acc_dt)

In [None]:
# Building the Adaboost model
# random_state is fixed so the boosted ensemble, and therefore the reported
# scores, are reproducible across runs (same seed as the split).
ada_boost  = AdaBoostClassifier(n_estimators=80,random_state=1)
ada_boost.fit(x_train,y_train)

In [None]:
# Computing the Training and Testing Score
mscore(ada_boost)

In [None]:
# Generating the predictions
ypred_ada_boost = ada_boost.predict(x_test)

In [None]:
# Evaluate the model - confusion matrix, classification Report, Accuracy Score
cls_eval(y_test,ypred_ada_boost)
acc_adab = accuracy_score(y_test,ypred_ada_boost)
print('Accuracy Score',acc_adab)

In [None]:
# Test-set accuracy of every fitted model, keyed by display name.
scores = {
    'Logistic Regression': acc_lr,
    'knn': acc_knn,
    'SVC': acc_svc,
    'Random Forest Classifier': acc_rfc,
    'Decision Tree Classifier': acc_dt,
    'Ada Boost Classifier': acc_adab,
}
models = pd.DataFrame({'Model': list(scores), 'Score': list(scores.values())})

# Best-scoring model first.
models.sort_values(by = 'Score', ascending = False)

In [None]:
# One bar colour per model (six models are compared).
colors = ["blue", "green", "red", "yellow","orange","purple"]

sns.set_style("whitegrid")
plt.figure(figsize=(15,5))
# accuracy_score returns a fraction in [0, 1]; scale it to a percentage so
# the plotted values match the "Accuracy %" axis label.
sns.barplot(x=models['Model'],y=models['Score']*100, palette=colors )
# The axis labels are set after plotting: depending on the seaborn version,
# barplot may replace labels set beforehand with the Series names
# ('Model' / 'Score').
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
plt.show()