**Import the relevant python libraries for the analysis**

In [1]:
import pandas as pd
import numpy as np

**Load the dataset from titanic.csv**

**Display the first 6 rows of the dataset**

In [2]:
titanic=pd.read_csv("/kaggle/input/titanic/titanic.csv")
df = pd.DataFrame(titanic)
df.head(6)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


**Display How many rows and columns in the dataset**

In [3]:
# Check the shape of the DataFrame
print("DataFrame shape:", df.shape)

DataFrame shape: (891, 12)


**What is the datatype of Pclass?**

In [4]:
# Verify the data type of the 'Pclass' column
print("Data type of 'Pclass':", type(df["Pclass"][0]))

Data type of 'Pclass': <class 'numpy.int64'>


**How many unique values in Embarked column?**

In [5]:
# Get the length of the 'Embarked' column
print("Length of 'Embarked':", len(df["Embarked"]))

Length of 'Embarked': 891


**Check if there is any missing value in the dataset**   
**Also, handle them with the most suitable option to each column or row**

In [6]:
# Check for missing values
print("Missing values summary:")
print(df.isnull().sum())

Missing values summary:
PassengerId      0
Survived         1
Pclass           0
Name             0
Sex              1
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          687
Embarked         3
dtype: int64


In [7]:
# Drop rows with missing values in 'Survived' column
df.dropna(subset=['Survived'], inplace=True)
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              1
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          686
Embarked         3
dtype: int64

In [8]:
# Drop rows with missing values in 'Sex' and 'Fare' columns
df.dropna(subset=['Sex', 'Fare'], inplace=True)

# Calculate the mean age
mean_age = df['Age'].mean()

# Fill missing age values with the mean age
df['Age'].fillna(mean_age, inplace=True)

# Drop the 'Cabin' column
df.drop('Cabin', axis=1, inplace=True)

# Check for remaining missing values
print("Remaining missing values summary:")
print(df.isnull().sum())


Remaining missing values summary:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       3
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(mean_age, inplace=True)


**Handle any wrong datatype or data values**

In [9]:
# Display the first 20 rows after cleaning
print("First 20 rows after cleaning:")
df.head(20)

First 20 rows after cleaning:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S
5,6,0.0,3,"Moran, Mr. James",male,29.644402,0,0,330877,8.4583,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,C


In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 888 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  888 non-null    int64  
 1   Survived     888 non-null    float64
 2   Pclass       888 non-null    int64  
 3   Name         888 non-null    object 
 4   Sex          888 non-null    object 
 5   Age          888 non-null    float64
 6   SibSp        888 non-null    int64  
 7   Parch        888 non-null    int64  
 8   Ticket       888 non-null    object 
 9   Fare         888 non-null    float64
 10  Embarked     885 non-null    object 
dtypes: float64(3), int64(4), object(4)
memory usage: 83.2+ KB


In [11]:
df['Ticket'].unique()


array(['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450',
       '330877', '17463', '349909', '347742', '237736', 'PP 9549',
       'A/5. 2151', '350406', '248706', '382652', '244373', '2649',
       '239865', '248698', '330923', '113788', '347077', '2631', '19950',
       '330959', '349216', 'PC 17601', 'PC 17569', '335677', 'C.A. 24579',
       'PC 17604', '113789', '2677', 'A./5. 2152', '345764', '2651',
       '7546', '11668', '349253', 'SC/Paris 2123', '330958',
       'S.C./A.4. 23567', '370371', '14311', '2662', '349237', '3101295',
       'A/4. 39886', 'PC 17572', '2926', '113509', '19947', 'C.A. 31026',
       '2697', 'C.A. 34651', 'CA 2144', '2669', '113572', '36973',
       '347088', 'PC 17605', '2661', 'C.A. 29395', 'S.P. 3464', '3101281',
       '315151', 'C.A. 33111', 'S.O.C. 14879', '2680', '1601', '348123',
       '349208', '374746', '248738', '364516', '345767', '345779',
       '330932', '113059', 'SO/C 14885', '3101278', 'W./C. 6608',
       'SOTON/OQ 3

In [12]:
df.loc[0, 'Ticket'] = 21171
df.loc[1, 'Ticket'] = 17599
df.loc[2, 'Ticket'] = 3101282
df.loc[10, 'Ticket'] = 9549
df.loc[12, 'Ticket'] = 2151


In [13]:
# Convert 'Survived' column to boolean type
df["Survived"] = df["Survived"].astype(float).astype(bool)
df.head(15)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,21171,7.25,S
1,2,True,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,17599,71.2833,C
2,3,True,3,"Heikkinen, Miss. Laina",female,26.0,0,0,3101282,7.925,S
3,4,True,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,False,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S
5,6,False,3,"Moran, Mr. James",male,29.644402,0,0,330877,8.4583,Q
6,7,False,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,S
7,8,False,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,S
8,9,True,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,S
9,10,True,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,C


**What is the relation between Pclass and Age?**

In [14]:
# Calculate the correlation between 'Age' and 'Pclass'
print("Correlation between 'Age' and 'Pclass':", df["Age"].corr(df["Pclass"]))

Correlation between 'Age' and 'Pclass': -0.3302457925134826


**Is there any relation between Age and if someone will be Survived or not??**

In [15]:
# Calculate the correlation between 'Age' and 'Survived'
print("Correlation between 'Age' and 'Survived':", df["Age"].corr(df["Survived"]))

Correlation between 'Age' and 'Survived': -0.07235651336749364
