In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt

In [15]:
# Fetch the Titanic passenger table through seaborn's example-dataset loader.
# The loader pulls the file from seaborn's online data repository, so the
# first call needs network access.
titanic = sns.load_dataset('titanic')

# Preview the first five rows of the raw frame.
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [16]:
# Data dictionary for the Titanic frame: column name -> short explanation.
# Built from ordered (name, text) pairs so the entries print in this order.
column_descriptions = dict([
    ('survived', 'Survival (0 = No, 1 = Yes)'),
    ('pclass', 'Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)'),
    ('sex', 'Sex of the passenger'),
    ('age', 'Age of the passenger in years'),
    ('sibsp', 'Number of siblings/spouses aboard the Titanic'),
    ('parch', 'Number of parents/children aboard the Titanic'),
    ('fare', 'Passenger fare (British pound)'),
    ('embarked', 'Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)'),
    ('class', 'Passenger class (First, Second, Third)'),
    ('who', 'Whether the passenger is a man, woman, or child'),
    ('adult_male', 'True if the passenger is an adult male, False otherwise'),
    ('deck', 'Deck where the passenger was located'),
    ('embark_town', 'Town of embarkation'),
    ('alive', 'Survival status (yes or no)'),
    ('alone', 'True if the passenger was alone, False otherwise'),
])

# Emit one "name: explanation" line per documented column.
for col, desc in column_descriptions.items():
    print(f"{col}: {desc}")

survived: Survival (0 = No, 1 = Yes)
pclass: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
sex: Sex of the passenger
age: Age of the passenger in years
sibsp: Number of siblings/spouses aboard the Titanic
parch: Number of parents/children aboard the Titanic
fare: Passenger fare (British pound)
embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
class: Passenger class (First, Second, Third)
who: Whether the passenger is a man, woman, or child
adult_male: True if the passenger is an adult male, False otherwise
deck: Deck where the passenger was located
embark_town: Town of embarkation
alive: Survival status (yes or no)
alone: True if the passenger was alone, False otherwise


In [None]:
# Build the cleaned frame from the raw `titanic` frame in a single chain, so
# re-running this cell always produces the same result:
#   1. drop rows that exactly duplicate an earlier row,
#   2. drop rows missing any of 'age', 'embarked' or 'deck',
#   3. renumber the index from 0.
# NOTE(review): the recorded output reports 181 remaining rows, 156 of them in
# First class, so these filters (most likely the 'deck' requirement) keep a
# narrow slice of passengers -- confirm this subset is what the analysis needs.
titanic_cleaned = (
    titanic
    .drop_duplicates()
    .dropna(subset=['age', 'embarked', 'deck'])
    .reset_index(drop=True)
)

# A bare expression is rendered only when it is the last statement of a cell,
# so the preview must go through display() to actually show up here.
display(titanic_cleaned.head())

# Basic understanding of the dataset
print("Number of rows:", titanic_cleaned.shape[0])
print("Number of columns:", titanic_cleaned.shape[1])
print("\nColumn names:", titanic_cleaned.columns.tolist())
print("\nData types:\n", titanic_cleaned.dtypes)
print("\nMissing values per column:\n", titanic_cleaned.isnull().sum())
print("\nSummary statistics:\n", titanic_cleaned.describe(include='all'))

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
1,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
2,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
3,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
4,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [None]:
# Rebuild the cleaned frame from the raw `titanic` frame (this is done
# unconditionally): drop duplicate rows, drop rows lacking 'age', 'embarked'
# or 'deck', then renumber the index.
titanic_cleaned = (
    titanic
    .drop_duplicates()
    .dropna(subset=['age', 'embarked', 'deck'])
    .reset_index(drop=True)
)

# --- EDA step 1: general overview of the cleaned Titanic dataset ---

# Preview of the leading rows
display(titanic_cleaned.head())

# Column names, non-null counts and dtypes
print("\nDataset Info:")
titanic_cleaned.info()

# Descriptive statistics for the numeric columns
print("\nSummary Statistics (Numerical):")
display(titanic_cleaned.describe())

# Descriptive statistics for the object, category and boolean columns
non_numeric_kinds = ['object', 'category', 'bool']
print("\nSummary Statistics (Categorical):")
display(titanic_cleaned.describe(include=non_numeric_kinds))

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
1,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
2,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
3,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
4,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     181 non-null    int64   
 1   pclass       181 non-null    int64   
 2   sex          181 non-null    object  
 3   age          181 non-null    float64 
 4   sibsp        181 non-null    int64   
 5   parch        181 non-null    int64   
 6   fare         181 non-null    float64 
 7   embarked     181 non-null    object  
 8   class        181 non-null    category
 9   who          181 non-null    object  
 10  adult_male   181 non-null    bool    
 11  deck         181 non-null    category
 12  embark_town  181 non-null    object  
 13  alive        181 non-null    object  
 14  alone        181 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 16.9+ KB

Summary Statistics (Numerical):


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,181.0,181.0,181.0,181.0,181.0,181.0
mean,0.674033,1.19337,35.687403,0.469613,0.480663,78.972883
std,0.470035,0.517644,15.691057,0.645854,0.757123,76.699584
min,0.0,1.0,0.92,0.0,0.0,0.0
25%,0.0,1.0,24.0,0.0,0.0,29.7
50%,1.0,1.0,36.0,0.0,0.0,57.0
75%,1.0,1.0,48.0,1.0,1.0,90.0
max,1.0,3.0,80.0,3.0,4.0,512.3292



Summary Statistics (Categorical):


Unnamed: 0,sex,embarked,class,who,adult_male,deck,embark_town,alive,alone
count,181,181,181,181,181,181,181,181,181
unique,2,3,3,3,2,7,3,2,2
top,male,S,First,man,False,C,Southampton,yes,False
freq,94,115,156,87,94,51,115,122,104


In [None]:
# Visualize the distribution of survivors vs non-survivors
fig, ax = plt.subplots(figsize=(6, 4))
# Recent seaborn releases deprecate passing `palette` without `hue`, so the
# x variable is also assigned to `hue`; dodge=False keeps a single full-width
# bar per category instead of splitting each position by hue level.
sns.countplot(data=titanic_cleaned, x='survived', hue='survived', dodge=False, palette='Set2', ax=ax)
# A hue legend would only repeat the x tick labels; remove it if one was drawn.
redundant_legend = ax.get_legend()
if redundant_legend is not None:
    redundant_legend.remove()
ax.set_title('Survival Count')
ax.set_xlabel('Survived (0 = No, 1 = Yes)')
ax.set_ylabel('Count')
plt.show()

# Visualize survival by passenger class
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=titanic_cleaned, x='pclass', hue='survived', palette='Set1', ax=ax)
ax.set_title('Survival by Passenger Class')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Count')
# Relabel the legend from the handles seaborn registered for each hue level.
# Passing only `labels=` to legend() pairs the texts with whatever artists come
# first on the axes, which can attach the wrong colour to 'No' / 'Yes'.
handles, raw_labels = ax.get_legend_handles_labels()
readable = {'0': 'No', '1': 'Yes'}
ax.legend(
    handles=handles,
    labels=[readable.get(str(raw), str(raw)) for raw in raw_labels],
    title='Survived',
)
plt.show()

# Stacked histogram of passenger age, coloured by survival outcome
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    data=titanic_cleaned,
    x='age',
    hue='survived',
    multiple='stack',
    bins=20,
    palette='Set2',
    ax=ax,
)
ax.set_title('Age Distribution by Survival')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()