In [None]:
import pandas as pd  # tabular data: reading the CSV and manipulating DataFrames
import matplotlib.pyplot as plt  # plotting (not used in the cells visible here)
import numpy as np  # numerical arrays (not used in the cells visible here)
from google.colab import files  # Colab helper for uploading files from the local machine
# Opens Colab's upload dialog; the chosen file (titanic.csv in the recorded run)
# is saved into the working directory so the next cell can read it by name.
uploaded=files.upload()

Saving titanic.csv to titanic.csv


In [None]:
# Load the uploaded CSV into a DataFrame named df.
df = pd.read_csv("titanic.csv")

# Preview the first rows to check the columns that were parsed.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Remove the columns that are not used as model features, leaving only
# Survived, Pclass, Sex and Age.
#
# The result is rebound to df instead of using inplace=True, and
# errors="ignore" skips labels that are already gone. Together these make the
# cell safe to re-run: the original in-place drop raised KeyError on a second
# execution because the columns no longer existed.
df = df.drop(
    columns=['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
    errors='ignore',
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0


In [None]:
# Feature frame: every remaining column except the label.
inputs = df.drop(columns='Survived')

# Label series: the 'Survived' column is what the model will be trained to predict.
target = df['Survived']

# Display the feature frame.
inputs

Unnamed: 0,Pclass,Sex,Age
0,3,male,22.0
1,1,female,38.0
2,3,female,26.0
3,1,female,35.0
4,3,male,35.0
...,...,...,...
886,2,male,27.0
887,1,female,19.0
888,3,female,
889,1,male,26.0


In [None]:
# One-hot encode the text 'Sex' column: one indicator column per distinct value
# ('female' and 'male'), with a row's indicator set for the value it holds.
dummies = pd.get_dummies(inputs['Sex'])

# Show the first 30 encoded rows.
dummies.head(30)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
5,0,1
6,0,1
7,0,1
8,1,0
9,1,0


In [None]:
# Append the indicator columns to the feature frame side by side (column-wise),
# matching rows on the shared index, and store the result back in inputs.
inputs = pd.concat([inputs, dummies], axis=1)

# First 3 rows of the widened frame.
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,female,male
0,3,male,22.0,0,1
1,1,female,38.0,1,0
2,3,female,26.0,1,0


In [None]:
# 'Sex' is the original text column and 'male' is the complement of 'female',
# so both are redundant once 'female' is present. Drop them from inputs in place.
inputs.drop(columns=['Sex', 'male'], inplace=True)
inputs.head(3)

Unnamed: 0,Pclass,Age,female
0,3,22.0,0
1,1,38.0,1
2,3,26.0,1


In [None]:
# List the columns that contain at least one missing value:
# isna() flags every NaN cell, any(axis=0) reduces that to one boolean per column,
# and the boolean mask then selects the matching column names.
inputs.columns[inputs.isna().any(axis=0)]

Index(['Age'], dtype='object')

In [None]:
# Inspect the 'Age' column flagged above; missing ages show up as NaN
# (e.g. row 888 in the recorded output).
inputs['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [None]:
# Fill the missing ages with the median age so that no NaN reaches the model.
# The values are read from df['Age'] and written into inputs['Age']; the two
# frames share the same row index, so the assignment lines up row by row.
# NOTE(review): the median is computed over all rows, before the train/test split,
# so the test rows influence the imputed value — confirm this is acceptable.
inputs['Age'] = df['Age'].fillna(value=df['Age'].median())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# random_state pins the shuffle: without it every run produced a different
# split, so the accuracy and the sample predictions below could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.3, random_state=42
)

In [None]:
# Training features: the rows of inputs (Pclass, Age, female) assigned to the
# training split. The label is not included; it lives in y_train.
X_train

Unnamed: 0,Pclass,Age,female
820,1,52.0,1
157,3,30.0,0
733,2,23.0,0
162,3,26.0,0
1,1,38.0,1
...,...,...,...
261,3,3.0,0
117,2,29.0,0
129,3,45.0,0
328,3,31.0,1


In [None]:
from sklearn.naive_bayes import GaussianNB
# Create an (untrained) Gaussian Naive Bayes classifier.
# Gaussian Naive Bayes models each feature as normally distributed within each class.
# NOTE(review): Pclass and female are discrete, so the Gaussian assumption is
# only an approximation for those two features.
model=GaussianNB()

In [None]:
# Train the classifier: fit() learns from the training features (X_train) and
# their labels (y_train) so the model can later predict labels for unseen rows.
model.fit(X_train,y_train)

In [None]:
# Evaluate on the held-out split: score() predicts labels for X_test and returns
# the accuracy, i.e. the fraction of predictions that match y_test.
model.score(X_test,y_test)

0.7761194029850746

In [None]:
# First 10 rows (by position) of the held-out test features.
X_test.iloc[:10]

Unnamed: 0,Pclass,Age,female
843,3,34.5,0
571,1,53.0,1
477,3,29.0,0
124,1,54.0,0
451,3,28.0,0
788,3,1.0,0
635,2,28.0,1
566,3,19.0,0
607,1,27.0,0
454,3,28.0,0


In [None]:
# Actual outcomes for the first 10 test rows, so they can be compared
# one-to-one with the model's predictions for those same rows.
# This previously showed y_train, whose first 10 labels belong to different
# passengers than the test rows being previewed and predicted.
y_test.iloc[:10]

820    1
157    0
733    0
162    0
1      1
666    0
150    0
780    1
62     0
97     1
Name: Survived, dtype: int64

In [None]:
# Predict the label for each of the first 10 test rows with the trained model.
model.predict(X_test.iloc[:10])

array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0])