In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the passenger data from a CSV located in the notebook's working directory.
df = pd.read_csv('titanic.csv')

# Preview the first rows plus a random sample. The fixed seed keeps the
# sampled rows identical on every "Restart Kernel -> Run All".
print(df.head())
print(df.sample(5, random_state=42))

In [None]:
# Structural overview, printed in this order: column dtypes, total number of
# cells, (rows, columns), column labels, and summary statistics.
for overview in (df.dtypes, df.size, df.shape, df.columns, df.describe()):
    print(overview)

In [None]:
# Number of missing values in each column.
missing_counts = df.isna().sum()
print(missing_counts)

In [None]:
# Impute missing values by assigning the filled Series back to its column.
# `df['col'].fillna(..., inplace=True)` is chained assignment: it operates on
# an intermediate Series, which pandas 2.x warns about and which no longer
# updates `df` once Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Cabin: carry the previous row's cabin forward, then use the most frequent
# cabin (computed after the forward fill) for rows that are still missing,
# i.e. leading rows with no earlier value to copy.
# NOTE(review): forward-filling gives a passenger the cabin of whoever is
# listed above them - confirm this is intended rather than a "missing" marker.
cabin_filled = df['Cabin'].ffill()
df['Cabin'] = cabin_filled.fillna(cabin_filled.mode()[0])

# Embarked: replace missing values with the most frequent value.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Display the column labels of the frame.
df.columns

In [None]:
# Remove the PassengerId, Name and Ticket columns in a single call.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])


In [None]:
# Replace the values of Sex and Embarked with their integer category codes
# (convert to the pandas `category` dtype, then keep only `.cat.codes`).
for label_col in ('Sex', 'Embarked'):
    df[label_col] = df[label_col].astype('category').cat.codes

In [None]:
# Spot-check five random rows after cleaning and encoding; the fixed seed keeps
# the displayed rows the same on every run.
df.sample(5, random_state=42)

In [None]:
# from sklearn.preprocessing import MaxAbsScaler
# df[['Age','Fare']] = MaxAbsScaler().fit_transform(df[['Age','Fare']])

In [None]:
# from sklearn.preprocessing import PowerTransformer
# df['Fare'] = PowerTransformer().fit_transform(df['Fare'].values.reshape(-1,1))

In [None]:
# One variable: a boxplot of Age (left) and a boxplot of Fare (right).
fig, (age_ax, fare_ax) = plt.subplots(1, 2, figsize=(12, 5))

sns.boxplot(data=df, y='Age', ax=age_ax)
age_ax.set_title('Boxplot of Age')

sns.boxplot(data=df, y='Fare', ax=fare_ax)
fare_ax.set_title('Boxplot of Fare')
fare_ax.set_yscale('log')  # Fare is drawn on a logarithmic y-axis

plt.tight_layout()
plt.show()

In [None]:
# Two variables: Age grouped by Sex, Pclass and Survived, one panel each,
# all panels sharing the same y-axis.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

for panel, group_col in zip(axes, ('Sex', 'Pclass', 'Survived')):
    sns.boxplot(data=df, x=group_col, y='Age', hue=group_col, ax=panel)
    panel.set_title(f'Age vs {group_col}')

plt.tight_layout()
plt.show()

In [None]:
# Two variables: Fare grouped by Sex, Pclass and Survived, one panel each,
# drawn on a logarithmic, shared y-axis.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

for panel, group_col in zip(axes, ('Sex', 'Pclass', 'Survived')):
    sns.boxplot(data=df, x=group_col, y='Fare', hue=group_col, ax=panel)
    panel.set_yscale('log')
    panel.set_title(f'Fare vs {group_col} (Log Scale)')

plt.tight_layout()
plt.show()

In [None]:
# Three variables: Age grouped by Sex (left) and by Pclass (right), with each
# group split by Survived via the hue.
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

for panel, group_col in zip(axes, ('Sex', 'Pclass')):
    sns.boxplot(data=df, x=group_col, y='Age', hue='Survived', ax=panel)
    panel.set_title(f'Age vs {group_col} vs Survived')

plt.tight_layout()
plt.show()

In [None]:
# Three variables: Fare grouped by Sex (left) and by Pclass (right), with each
# group split by Survived via the hue; y-axis is logarithmic and shared.
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

for panel, group_col in zip(axes, ('Sex', 'Pclass')):
    sns.boxplot(data=df, x=group_col, y='Fare', hue='Survived', ax=panel)
    panel.set_yscale('log')
    panel.set_title(f'Fare vs {group_col} vs Survived (Log Scale)')

plt.tight_layout()
plt.show()

In [None]:
import numpy as np  # Make sure to import numpy

def removeOutliers(df, columns=None, factor=1.5):
    """Return a copy of ``df`` without the rows that lie outside the IQR fences.

    For each inspected column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles of that
    column in the *unfiltered* input. A row is kept only if its value lies
    within the fences (bounds included) for every inspected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    columns : iterable of column labels, optional
        Columns to check. When omitted, every numeric column is checked
        (``df.select_dtypes(include=[np.number])``).
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows that passed every check, with their
        original index labels.

    Notes
    -----
    * A row whose value is missing in an inspected column is removed, because
      the range comparison is not satisfied for missing values.
    * When a column has ``Q1 == Q3`` the fences collapse to that single value,
      so every row with a different value is removed. Pass ``columns`` to
      keep such columns (e.g. flags or category codes) out of the check.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    # Work on a copy so the caller's frame is left untouched.
    cleaned_df = df.copy()

    for var in columns:
        # Quartiles come from the original data, so the fences of one column
        # do not depend on rows already removed because of another column.
        q1 = df[var].quantile(0.25)
        q3 = df[var].quantile(0.75)
        iqr = q3 - q1

        low = q1 - factor * iqr
        high = q3 + factor * iqr

        # `between` is inclusive on both ends by default.
        cleaned_df = cleaned_df[cleaned_df[var].between(low, high)]

    return cleaned_df


In [None]:
# Apply the IQR filter to the continuous measurements only. Running it over
# every numeric column would also filter on the target (Survived) and on the
# integer-encoded / count columns, and any column whose 25th and 75th
# percentiles coincide would lose every row that differs from that value.
continuous_cols = ['Age', 'Fare']
kept_index = removeOutliers(df[continuous_cols]).index

# Select the surviving rows from the full frame so all columns are retained.
df1 = df.loc[kept_index]

print(f"Removed {len(df) - len(df1)} of {len(df)} rows as outliers")
print(df1)

### Observations from All Graphs:

1. **Single Variable Boxplots:**
   - **Age:** A few outliers at older ages, with most ages concentrated in the 20-50 range.
   - **Fare:** A large range of fares, with high-value outliers indicating first-class passengers.

2. **Two Variable Boxplots (Age vs. Categories):**
   - **Sex:** Females tend to have a higher survival rate than males across different ages.
   - **Pclass:** First-class passengers (Pclass=1) are more likely to survive compared to second and third-class.
   - **Survived:** Survivors are younger, with a notable difference in age between those who survived vs. those who didn’t.

3. **Two Variable Boxplots (Fare vs. Categories):**
   - **Sex:** Females tend to pay lower fares, but still have higher survival rates.
   - **Pclass:** First-class passengers paid much higher fares, which correlates with higher survival rates.
   - **Survived:** Higher fares correlate with higher survival rates, particularly in first class.

4. **Three Variable Boxplots (Age, Fare vs. Categories and Survival):**
   - **Sex vs. Age & Survival:** Women, particularly younger women, had higher survival rates.
   - **Pclass vs. Age & Survival:** First-class passengers survived at a higher rate, especially in the younger age groups.
   - **Fare vs. Sex/Pclass & Survival:** Higher fare-paying passengers, especially in first-class, had a better chance of survival.

