## TASK 1

<p>Clean a dataset by removing missing values and outliers.</p>

In [5]:
import zipfile
import pandas as pd
import os

# Archive to unpack and the directory its members are written to
zip_path = 'titanic.zip'
extracted_path = './'

# Extract every member of the archive into `extracted_path`
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extracted_path)

# Build the CSV locations first, then read both splits into DataFrames
train_data_path, test_data_path = (
    os.path.join(extracted_path, csv_name) for csv_name in ('train.csv', 'test.csv')
)
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Helpers to handle missing values and remove outliers
def _remove_iqr_outliers(df, column, k=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive).
    Rows whose value in ``column`` is missing are dropped too, because NaN
    fails both comparisons.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    within_fences = (df[column] >= q1 - k * iqr) & (df[column] <= q3 + k * iqr)
    return df[within_fences]


def clean_dataset(df, is_train=True):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps:
      * drop the 'Cabin' column (skipped silently if it is absent),
      * fill missing 'Age' values with the column median,
      * training data: fill missing 'Embarked' values with the most frequent
        value, then drop rows whose 'Age' or 'Fare' falls outside the
        1.5 * IQR fences (applied to 'Age' first, then to 'Fare' on what is left),
      * non-training data: fill missing 'Fare' values with the column median;
        no rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Age', 'Fare' and (for training data) 'Embarked'.
    is_train : bool, default True
        Selects the training or the non-training cleaning path described above.

    Returns
    -------
    pandas.DataFrame
        A new frame. The original row index is kept (it is not reset).

    Raises
    ------
    KeyError
        If a column required by the selected path is missing.
    """
    # Work on a copy so the caller's frame is not mutated and the cell can be
    # re-run on the same input with the same result.
    cleaned = df.drop(columns=['Cabin'], errors='ignore').copy()

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form does not update the frame under copy-on-write.
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].median())

    if is_train:
        embarked_modes = cleaned['Embarked'].mode()
        # mode() is empty when every value is missing; nothing to fill with then.
        if not embarked_modes.empty:
            cleaned['Embarked'] = cleaned['Embarked'].fillna(embarked_modes.iloc[0])

        # Outliers are only removed from the training data.
        for column in ('Age', 'Fare'):
            cleaned = _remove_iqr_outliers(cleaned, column)
    else:
        cleaned['Fare'] = cleaned['Fare'].fillna(cleaned['Fare'].median())

    return cleaned

# Destinations for the cleaned CSV files, inside the extraction directory
cleaned_train_data_path = os.path.join(extracted_path, 'cleaned_train.csv')
cleaned_test_data_path = os.path.join(extracted_path, 'cleaned_test.csv')

# Run the cleaning routine on each split (training path vs. non-training path)
clean_train_df = clean_dataset(train_df, is_train=True)
clean_test_df = clean_dataset(test_df, is_train=False)

# Write both cleaned frames to disk without the row index
clean_train_df.to_csv(path_or_buf=cleaned_train_data_path, index=False)
clean_test_df.to_csv(path_or_buf=cleaned_test_data_path, index=False)

# Report where the files were written
print(
    f"Cleaned training dataset saved to {cleaned_train_data_path}",
    f"Cleaned test dataset saved to {cleaned_test_data_path}",
    sep="\n",
)


Cleaned training dataset saved to ./cleaned_train.csv
Cleaned test dataset saved to ./cleaned_test.csv


## TASK 2


<p>Calculate summary statistics (mean, median, mode, standard deviation) for a dataset.</p>



In [6]:
# Function to calculate summary statistics
def summary_statistics(df):
    """Compute mean, median, mode and standard deviation per numeric column.

    Only columns of dtype ``float64`` or ``int64`` are considered.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per selected column of ``df`` and the columns 'mean',
        'median', 'mode' and 'std_dev'. When a column has several modes the
        first one returned by ``Series.mode()`` is reported; when a column
        has no non-missing values its mode is NaN.
    """
    stats = {}
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        values = df[column]
        modes = values.mode()
        stats[column] = {
            'mean': values.mean(),
            'median': values.median(),
            # mode() is empty for an all-missing column; report NaN instead of raising
            'mode': modes.iloc[0] if not modes.empty else float('nan'),
            'std_dev': values.std(),
        }
    # Outer keys become columns, so transpose to get one row per input column
    return pd.DataFrame(stats).transpose()

# Output locations for the summary tables, inside the extraction directory
train_summary_stats_path = os.path.join(extracted_path, 'train_summary_statistics.csv')
test_summary_stats_path = os.path.join(extracted_path, 'test_summary_statistics.csv')

# Summarise the numeric columns of each cleaned split
train_summary_stats = summary_statistics(clean_train_df)
test_summary_stats = summary_statistics(clean_test_df)

# Show each table under its heading
print("Summary Statistics for Cleaned Training Dataset:", train_summary_stats, sep="\n")
print("\nSummary Statistics for Cleaned Test Dataset:", test_summary_stats, sep="\n")

# Write the tables to CSV; the index (column names of the input) is kept
train_summary_stats.to_csv(path_or_buf=train_summary_stats_path)
test_summary_stats.to_csv(path_or_buf=test_summary_stats_path)

# Report where the files were written
print(
    f"Summary statistics for training dataset saved to {train_summary_stats_path}",
    f"Summary statistics for test dataset saved to {test_summary_stats_path}",
    sep="\n",
)


Summary Statistics for Cleaned Training Dataset:
                   mean  median   mode     std_dev
PassengerId  447.883008  447.50   1.00  259.549777
Survived       0.334262    0.00   0.00    0.472061
Pclass         2.512535    3.00   3.00    0.715818
Age           28.079387   28.00  28.00   10.012369
SibSp          0.413649    0.00   0.00    0.855001
Parch          0.317549    0.00   0.00    0.784792
Fare          17.193204   11.75   8.05   13.244507

Summary Statistics for Cleaned Test Dataset:
                    mean     median    mode     std_dev
PassengerId  1100.500000  1100.5000  892.00  120.810458
Pclass          2.265550     3.0000    3.00    0.841838
Age            29.599282    27.0000   27.00   12.703770
SibSp           0.447368     0.0000    0.00    0.896760
Parch           0.392344     0.0000    0.00    0.981429
Fare           35.576535    14.4542    7.75   55.850103
Summary statistics for training dataset saved to ./train_summary_statistics.csv
Summary statistics for te