In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load the dataset into the Python environment
# NOTE: the path below is a placeholder. Point it at the real Titanic CSV
# before running; otherwise pd.read_csv raises FileNotFoundError.
titanic_df = pd.read_csv('path/to/your/titanic_dataset.csv')

# 2. Make 'PassengerId' as the index column
titanic_df.set_index('PassengerId', inplace=True)

# 3. Check the basic details of the dataset
print("Basic details of the dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
titanic_df.info()
print("\nSummary statistics:")
print(titanic_df.describe())

def _impute_columns(frame, columns, fitted_imputer):
    """Return ``frame[columns]`` with missing values filled by ``fitted_imputer``.

    The imputer is fitted on the selected columns here. The result keeps the
    original index and column labels so it can be re-joined with the rest of
    the frame without misalignment.
    """
    if len(columns) == 0:
        # Nothing to impute; SimpleImputer rejects zero-column input.
        return frame[columns].copy()
    return pd.DataFrame(
        fitted_imputer.fit_transform(frame[columns]),
        columns=columns,
        index=frame.index,
    )


def _plot_boxplots(frame, columns):
    """Draw one horizontal boxplot per column of ``frame`` on a 2x2 grid."""
    plt.figure(figsize=(14, 8))
    for position, column in enumerate(columns, 1):
        plt.subplot(2, 2, position)
        sns.boxplot(x=frame[column])
        plt.title(f'Boxplot for {column}')
    plt.tight_layout()
    plt.show()


# 4. Fill in all the missing values present in all the columns in the dataset
# The 'mean' strategy only works on numeric data, so numeric and non-numeric
# columns are imputed separately: mean for numbers, most frequent value for
# everything else (e.g. text columns).
numeric_columns = titanic_df.select_dtypes(include='number').columns
categorical_columns = titanic_df.select_dtypes(exclude='number').columns

imputer = SimpleImputer(strategy='mean')  # You can choose another strategy if needed
categorical_imputer = SimpleImputer(strategy='most_frequent')

titanic_df_filled = pd.concat(
    [
        _impute_columns(titanic_df, numeric_columns, imputer),
        _impute_columns(titanic_df, categorical_columns, categorical_imputer),
    ],
    axis=1,
)[titanic_df.columns]  # restore the original column order

# 5. Check and handle outliers in at least 3 columns in the dataset
columns_with_outliers = ['Age', 'Fare', 'SibSp']

# Visualize outliers using boxplots
_plot_boxplots(titanic_df_filled, columns_with_outliers)

# Handle outliers (you can choose another method based on your analysis)
# Values outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are capped to
# the nearest fence rather than dropped, so the row count is unchanged.
for column in columns_with_outliers:
    q1 = titanic_df_filled[column].quantile(0.25)
    q3 = titanic_df_filled[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    titanic_df_filled[column] = titanic_df_filled[column].clip(
        lower=lower_bound, upper=upper_bound
    )

# Visualize boxplots after handling outliers
_plot_boxplots(titanic_df_filled, columns_with_outliers)

# 6. Do Min-Max scaling on the feature set (Take 'Survived' as target)
scaler = MinMaxScaler()
features = titanic_df_filled.drop('Survived', axis=1)
numeric_feature_columns = features.select_dtypes(include='number').columns

# Min-Max scaling is only defined for numbers; non-numeric feature columns are
# carried through unchanged.
scaled_numeric = pd.DataFrame(
    scaler.fit_transform(features[numeric_feature_columns]),
    columns=numeric_feature_columns,
    index=features.index,
)
features_scaled = pd.concat(
    [scaled_numeric, features.drop(columns=numeric_feature_columns)],
    axis=1,
)[features.columns]  # restore the original feature order

# Both pieces share the same index, so the target lines up row-for-row.
titanic_scaled = pd.concat([features_scaled, titanic_df_filled['Survived']], axis=1)

# Display the first few rows of the scaled dataset
print("\nScaled Dataset:")
print(titanic_scaled.head())


FileNotFoundError: ignored