In [None]:
# Step 1: Setup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Step 2: Load the Data
# Both CSV files are read from fixed absolute locations on the local machine.
train_csv = r'C:\Users\LENOVO\OneDrive\Desktop\prodigy\task2\train.csv'
test_csv = r'C:\Users\LENOVO\OneDrive\Desktop\prodigy\task2\test.csv'
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)
# Stack the rows (train first, then test) and renumber the index from 0.
combined_df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)

# Step 3: Explore the Data
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
combined_df.info()
# Summary statistics, then the number of missing values per column.
print(combined_df.describe())
print(combined_df.isnull().sum())

# Step 4: Data Cleaning
# Fill missing values by assigning the result back to the column. Calling
# fillna(..., inplace=True) on a column selection is chained assignment: recent
# pandas warns about it, and with Copy-on-Write enabled it operates on a
# temporary and leaves combined_df unchanged.
combined_df['Age'] = combined_df['Age'].fillna(combined_df['Age'].median())
combined_df['Embarked'] = combined_df['Embarked'].fillna(combined_df['Embarked'].mode()[0])
combined_df['Fare'] = combined_df['Fare'].fillna(combined_df['Fare'].median())
combined_df['Cabin'] = combined_df['Cabin'].fillna('Unknown')
# Deck is the first character of Cabin, so rows filled with 'Unknown' get 'U'.
combined_df['Deck'] = combined_df['Cabin'].str[0]

# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself); IsAlone is 1 when FamilySize equals 1, otherwise 0.
combined_df['FamilySize'] = combined_df['SibSp'] + combined_df['Parch'] + 1
combined_df['IsAlone'] = (combined_df['FamilySize'] == 1).astype(int)
# Title is the first run of letters in Name that follows a space and is
# immediately followed by a period. The pattern is a raw string so that \.
# reaches the regex engine unchanged without an invalid-escape warning.
combined_df['Title'] = combined_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Step 5: Exploratory Data Analysis (EDA)
# Count of rows per Survived value, split by Sex.
sns.countplot(data=train_df, x='Survived', hue='Sex')
plt.show()

# Count of rows per Pclass, split by Survived. Survived is cast to str for this
# plot only (presumably so seaborn treats the hue as a categorical label).
survived_as_text = train_df.astype({'Survived': str})
sns.countplot(data=survived_as_text, x='Pclass', hue='Survived')
plt.show()

# Age distribution within each Pclass.
sns.boxplot(data=train_df, x='Pclass', y='Age')
plt.show()

# Count of rows per Survived value, split by Embarked.
sns.countplot(data=train_df, x='Survived', hue='Embarked')
plt.show()

# Mean of Survived per group: first by Sex, then by Pclass.
for group_col in ('Sex', 'Pclass'):
    print(train_df.groupby(group_col)['Survived'].mean())

# 20-bin histograms of Age and then Fare, taken from the combined frame.
for hist_col in ('Age', 'Fare'):
    sns.histplot(data=combined_df[hist_col], bins=20)
    plt.show()

# Step 6: Calculate Correlation Matrix for Numeric Columns Only
# Restrict to numeric dtypes before computing pairwise correlations.
numeric_cols = combined_df.select_dtypes(include=np.number).columns
corr_matrix = combined_df.loc[:, numeric_cols].corr()

# Plot the heatmap
# Any +/-inf entries are mapped to NaN before the annotated heatmap is drawn.
heatmap_values = corr_matrix.replace([np.inf, -np.inf], np.nan)
sns.heatmap(heatmap_values, cmap='coolwarm', annot=True)
plt.show()

# Step 7: Save Cleaned Data (Optional)
# Write the combined frame to the working directory, omitting the row index.
combined_df.to_csv(path_or_buf='cleaned_titanic_data.csv', index=False)
