# In [1]:
# Step 1: Import Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Step 2: Data Loading
# Read the Titanic training CSV from disk into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='C:/Users/CPT/Downloads/train.csv')

# Step 3: Data Cleaning
# Report the number of missing values per column before any imputation
print("Missing Values:\n", df.isnull().sum())

# Handle missing values
# Fill missing 'Age' values with the median age.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Age']: that chained in-place form is
# deprecated in pandas 2.x and does not modify df under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing 'Embarked' values with the most common embarkation point
# (mode() returns a Series; [0] takes its first entry)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop 'Cabin' column as it has too many missing values
df.drop(columns=['Cabin'], inplace=True)

# Convert 'Survived' and 'Pclass' to int data types and 'Embarked' to a categorical data type
df['Survived'] = df['Survived'].astype(int)
df['Pclass'] = df['Pclass'].astype(int)
df['Embarked'] = df['Embarked'].astype('category')


# Verify data types
print("\nData Types:\n", df.dtypes)

# Step 4: Exploratory Data Analysis
# In each analysis below, the mean of 'Survived' within a group is reported
# as that group's survival rate.

# Analyze survival rates based on gender
survival_by_gender = df.groupby('Sex')['Survived'].mean()
print("\nSurvival Rate by Gender:\n", survival_by_gender)

# Analyze survival rates based on class
survival_by_class = df.groupby('Pclass')['Survived'].mean()
print("\nSurvival Rate by Class:\n", survival_by_class)

# Analyze survival rates based on age
# pd.cut uses right-closed intervals by default, so the groups are
# (0, 12] Child, (12, 18] Teenager, (18, 60] Adult, (60, 80] Senior.
# Ages outside (0, 80] are left without an AgeGroup (NaN).
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 60, 80], labels=['Child', 'Teenager', 'Adult', 'Senior'])
# 'AgeGroup' is categorical: pass observed=False explicitly so every label is
# always listed (the behaviour this script has relied on) without depending on
# the deprecated implicit default, which raises a FutureWarning on recent pandas.
survival_by_age_group = df.groupby('AgeGroup', observed=False)['Survived'].mean()
print("\nSurvival Rate by Age Group:\n", survival_by_age_group)

# Step 5: Visualization
# Apply Seaborn's "whitegrid" theme to the figures drawn below
sns.set_theme(style="whitegrid")

# One bar plot of 'Survived' per grouping column, each in its own figure:
# gender first, then passenger class, then age group
for group_col, plot_title in (
    ('Sex', 'Survival Rate by Gender'),
    ('Pclass', 'Survival Rate by Passenger Class'),
    ('AgeGroup', 'Survival Rate by Age Group'),
):
    plt.figure(figsize=(10, 5))
    sns.barplot(data=df, x=group_col, y='Survived')
    plt.title(plot_title)
    plt.show()

# Histogram of age distribution: 30 bins with a KDE curve overlaid,
# drawn on an explicitly created 10x5 axes
_, age_ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['Age'], bins=30, kde=True, ax=age_ax)
age_ax.set_title('Age Distribution of Passengers')
plt.show()

# Pie chart of survival rate
plt.figure(figsize=(7, 7))
# value_counts() orders its rows by frequency, so the slice order depends on
# the data while `labels` and `colors` are applied by position. Reindex to a
# fixed [0, 1] order so each label/colour always lands on its own outcome.
# NOTE(review): assumes 'Survived' is coded 0 = did not survive, 1 = survived
# (the usual Titanic encoding) -- confirm against the CSV.
outcome_counts = df['Survived'].value_counts().reindex([0, 1], fill_value=0)
outcome_counts.plot.pie(autopct='%1.1f%%', colors=['lightblue', 'orange'], labels=['Did Not Survive', 'Survived'])
plt.title('Overall Survival Rate')
# Remove the y-axis label that pandas adds to pie plots
plt.ylabel('')
plt.show()

# Step 6: Documentation
# Summary of key insights.
# The statements are generated from the survival rates computed in Step 4
# instead of being hard-coded, so the printed summary cannot disagree with
# the data that was actually loaded.


def _describe_ranking(rates):
    """Return the groups in `rates` ordered from highest to lowest rate.

    `rates` is a pandas Series mapping group label -> survival rate. Groups
    whose rate is NaN (no passengers in the group) are left out. The result
    is a string such as ``"A (74.2%) > B (18.9%)"``.
    """
    ranked = rates.dropna().sort_values(ascending=False)
    return " > ".join(f"{group} ({rate:.1%})" for group, rate in ranked.items())


print("\nSummary of Key Insights:")
print("- Survival rate by gender, highest first:", _describe_ranking(survival_by_gender))
print("- Survival rate by passenger class, highest first:", _describe_ranking(survival_by_class))
print("- Survival rate by age group, highest first:", _describe_ranking(survival_by_age_group))

# Optionally, you can save the DataFrame or visualizations to files, or save the notebook as a report.


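# Output captured when this cell was run:
#   ModuleNotFoundError: No module named 'pandas'
# (pandas was not installed in the environment that executed the cell.)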