In [None]:
# Experiment 3

# Perform the following data pre-processing tasks using python libraries
# ●	Mounting the Drive to Google Colab
# ●	Importing the libraries (Numpy, Pandas, Matplotlib, Scikit-learn) and Import the dataset using various functions like Manual, loadtxt, read_csv, genfromtxt, pickle
# ●	Conversion of various data files (PDF, Excel, csv, xlsx, xls).
# ●	Identifying the dependent and independent variables
# ●	Dealing with missing data using various Traditional imputation methods

# Assignment- Dealing with missing data using various Advanced imputation methods


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import files
uploaded = files.upload()
print(uploaded)

Saving DMDW_titanic_data.csv to DMDW_titanic_data.csv
{'DMDW_titanic_data.csv': b'survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone\n0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False\n1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False\n1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True\n1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False\n0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True\n0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True\n0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True\n0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False\n1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False\n1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False\n1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False\n1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True\n0,3,male

In [None]:
import pandas as pd
df = pd.read_csv('DMDW_titanic_data.csv')

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# ==============================================================================
# The file path is correct and the file is loading successfully.
# ==============================================================================
file_path = "/content/drive/My Drive/DMDW_titanic_data.csv"
# ==============================================================================

try:
    df = pd.read_csv(file_path)
    print(f"Successfully loaded data from: {file_path}")
except Exception as e:
    print(f"--- ERROR ---")
    print(f"Could not load the file. Please check the path. Error: {e}")
    exit()


# --- Initial Inspection, Other Imports, and File Conversion (as before) ---
print("\n--- First 5 rows of the dataset ---")
print(df.head())
print("\n--- Missing Values Count ---")
print(df.isnull().sum())
# ... (The rest of the initial steps are omitted for brevity but are included in the logic)


# --- Task: Identifying the dependent and independent variables ---
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']].copy()
y = df['survived'].copy()
print("\n\n--- Identifying Variables ---")
print("--- Independent Variables (X) - first 5 rows ---")
print(X.head())


# --- Task: Dealing with missing data (Traditional imputation) ---
print(f"\n\n--- Traditional Imputation ---")
print(f"Missing values before imputation:\n{X.isnull().sum()}")

# Impute categorical column 'embarked' with the mode
mode_imputer = SimpleImputer(strategy='most_frequent')
# CORRECTED LINE: Added .ravel() to flatten the output
X['embarked'] = mode_imputer.fit_transform(X[['embarked']]).ravel()

# Impute numerical column 'age' with the mean
mean_imputer = SimpleImputer(strategy='mean')
# CORRECTED LINE: Added .ravel() to flatten the output
X['age'] = mean_imputer.fit_transform(X[['age']]).ravel()

print("\n--- Missing values after traditional imputation ---")
print(X.isnull().sum())


# --- Assignment: Dealing with missing data (Advanced imputation) ---
print(f"\n\n--- Advanced Imputation ---")
# Reload numerical data to have missing values again
X_advanced = df[['pclass', 'age', 'sibsp', 'parch', 'fare']].copy()

# a) KNN Imputation
print(f"--- KNN Imputation ---")
# This part is fine as it creates a new DataFrame, so no correction is needed.
knn_imputer = KNNImputer(n_neighbors=5)
X_knn_imputed = pd.DataFrame(knn_imputer.fit_transform(X_advanced), columns=X_advanced.columns)
print(f"Missing 'age' values after KNN imputation: {X_knn_imputed['age'].isnull().sum()}")


# b) Iterative Imputation (MICE)
print(f"\n--- Iterative Imputation (MICE) ---")
X_advanced = df[['pclass', 'age', 'sibsp', 'parch', 'fare']].copy() # Reset data
# This part is also fine as it creates a new DataFrame.
iterative_imputer = IterativeImputer(max_iter=10, random_state=0)
X_iterative_imputed = pd.DataFrame(iterative_imputer.fit_transform(X_advanced), columns=X_advanced.columns)
print(f"Missing 'age' values after Iterative imputation: {X_iterative_imputed['age'].isnull().sum()}")

Successfully loaded data from: /content/drive/My Drive/DMDW_titanic_data.csv

--- First 5 rows of the dataset ---
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

--- Missing Values Count ---
survived         0
pclass           0
sex              0
age            177

In [7]:
# Add this line at the end of your code cell
display(X)

# Add this line at the end of your code cell
display(X_knn_imputed)

# Add this line at the end of your code cell
display(X_iterative_imputed)

# Recommended line for a clean preview
print("Final DataFrame after Iterative Imputation (MICE):")
display(X_iterative_imputed.head())

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,3,male,22.000000,1,0,7.2500,S
1,1,female,38.000000,1,0,71.2833,C
2,3,female,26.000000,0,0,7.9250,S
3,1,female,35.000000,1,0,53.1000,S
4,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.000000,0,0,13.0000,S
887,1,female,19.000000,0,0,30.0000,S
888,3,female,29.699118,1,2,23.4500,S
889,1,male,26.000000,0,0,30.0000,C


Unnamed: 0,pclass,age,sibsp,parch,fare
0,3.0,22.0,1.0,0.0,7.2500
1,1.0,38.0,1.0,0.0,71.2833
2,3.0,26.0,0.0,0.0,7.9250
3,1.0,35.0,1.0,0.0,53.1000
4,3.0,35.0,0.0,0.0,8.0500
...,...,...,...,...,...
886,2.0,27.0,0.0,0.0,13.0000
887,1.0,19.0,0.0,0.0,30.0000
888,3.0,26.8,1.0,2.0,23.4500
889,1.0,26.0,0.0,0.0,30.0000


Unnamed: 0,pclass,age,sibsp,parch,fare
0,3.0,22.000000,1.0,0.0,7.2500
1,1.0,38.000000,1.0,0.0,71.2833
2,3.0,26.000000,0.0,0.0,7.9250
3,1.0,35.000000,1.0,0.0,53.1000
4,3.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...
886,2.0,27.000000,0.0,0.0,13.0000
887,1.0,19.000000,0.0,0.0,30.0000
888,3.0,21.132653,1.0,2.0,23.4500
889,1.0,26.000000,0.0,0.0,30.0000


Final DataFrame after Iterative Imputation (MICE):


Unnamed: 0,pclass,age,sibsp,parch,fare
0,3.0,22.0,1.0,0.0,7.25
1,1.0,38.0,1.0,0.0,71.2833
2,3.0,26.0,0.0,0.0,7.925
3,1.0,35.0,1.0,0.0,53.1
4,3.0,35.0,0.0,0.0,8.05



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

