<a href="https://colab.research.google.com/github/Omoadonibetty/Machine-Learning-on-Titanic-Dataset/blob/main/titanic_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.model_selection import train_test_split

In [2]:
# Mounting Google Drive to access dataset
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Loading the Titanic dataset
titanic = pd.read_csv('/content/titanic.csv')
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Displaying the first few rows of the dataset
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Dropping unnecessary columns from the dataset
titan = titanic.drop(['Name', 'PassengerId','Ticket', 'Cabin', 'Embarked', 'SibSp', 'Parch'], axis = "columns")
titan

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.2500
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
4,0,3,male,35.0,8.0500
...,...,...,...,...,...
886,0,2,male,27.0,13.0000
887,1,1,female,19.0,30.0000
888,0,3,female,,23.4500
889,1,1,male,26.0,30.0000


In [6]:
## Checking for missing values in the dataset
titan.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [7]:
# getting the median of age
titan['Age'].median()

28.0

In [8]:
# Filling missing values in the 'Age' column with the median age
fill_titan = titan['Age'].fillna(28, inplace = True)
fill_titan

In [9]:
# Extracting dependent variables
dependent = titan["Survived"]
dependent

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [10]:
# Extracting independent variables
independent = titan.drop(['Survived'], axis = 'columns')
independent

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.2500
1,1,female,38.0,71.2833
2,3,female,26.0,7.9250
3,1,female,35.0,53.1000
4,3,male,35.0,8.0500
...,...,...,...,...
886,2,male,27.0,13.0000
887,1,female,19.0,30.0000
888,3,female,28.0,23.4500
889,1,male,26.0,30.0000


In [11]:
# Encoding categorical variable 'Sex' using LabelEncoder
le_sex = LabelEncoder()
le_sex

In [12]:
# fit transforming the variable 'Sex'
independent['le_sex_n'] = le_sex.fit_transform(independent['Sex'])
independent['le_sex_n']

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: le_sex_n, Length: 891, dtype: int64

In [13]:
independent

Unnamed: 0,Pclass,Sex,Age,Fare,le_sex_n
0,3,male,22.0,7.2500,1
1,1,female,38.0,71.2833,0
2,3,female,26.0,7.9250,0
3,1,female,35.0,53.1000,0
4,3,male,35.0,8.0500,1
...,...,...,...,...,...
886,2,male,27.0,13.0000,1
887,1,female,19.0,30.0000,0
888,3,female,28.0,23.4500,0
889,1,male,26.0,30.0000,1


In [14]:
# Dropping the original 'Sex' column and keeping the encoded one
real_independent = independent.drop(["Sex"], axis = 'columns')
real_independent

Unnamed: 0,Pclass,Age,Fare,le_sex_n
0,3,22.0,7.2500,1
1,1,38.0,71.2833,0
2,3,26.0,7.9250,0
3,1,35.0,53.1000,0
4,3,35.0,8.0500,1
...,...,...,...,...
886,2,27.0,13.0000,1
887,1,19.0,30.0000,0
888,3,28.0,23.4500,0
889,1,26.0,30.0000,1


In [15]:
# Splitting the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(real_independent, dependent, test_size= 0.2)
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

712
179
712
179


In [16]:
# Creating a decision tree classifier model
model = tree.DecisionTreeClassifier()
model

In [17]:
# Fitting the model on the training data
model.fit(real_independent, dependent)

In [18]:
# Evaluating the model's accuracy on the entire dataset
model.score(real_independent, dependent)

0.9775533108866442

In [19]:
# Making predictions using the trained model
model.predict([[1,50,10,0]])



array([1])

In [20]:
# Making predictions using the trained model
model.predict([[3,25,1000,1]])



array([0])

In [21]:
# Making predictions using the trained model
model.predict([[2,80,100,1,]])



array([0])

In [22]:
real_independent

Unnamed: 0,Pclass,Age,Fare,le_sex_n
0,3,22.0,7.2500,1
1,1,38.0,71.2833,0
2,3,26.0,7.9250,0
3,1,35.0,53.1000,0
4,3,35.0,8.0500,1
...,...,...,...,...
886,2,27.0,13.0000,1
887,1,19.0,30.0000,0
888,3,28.0,23.4500,0
889,1,26.0,30.0000,1
