# Fourth Project

In [1]:
# 📦 Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils.validation import check_is_fitted, check_array

# 📥 Step 2: Load Dataset
# Point the path at your own copy of the CSV if it lives somewhere else.
df = pd.read_csv("/content/Titanic-Dataset.csv")
print("Initial Shape:", df.shape)
display(df.head())

# 🧹 Step 3: Basic Data Cleaning
# Remove the columns considered too sparse or irrelevant for modelling.
df = df.drop(columns=["Cabin", "Ticket", "Name", "PassengerId"])

# Report how many values are missing in each remaining column.
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# 🎯 Step 4: Split Features and Target
# "Survived" is the target; every other remaining column is a feature.
y = df["Survived"]
X = df.drop(columns="Survived")

# Partition the feature columns by dtype so each group can get its own
# preprocessing pipeline later on.
num_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(X.select_dtypes(include=["object"]).columns)

# ⚙️ Step 5: Custom Label Encoder for ColumnTransformer
class LabelEncoderWrapper(BaseEstimator, TransformerMixin):
    """Apply an independent ``LabelEncoder`` to every column of a 2-D input.

    ``LabelEncoder`` itself works on a single 1-D array, so it cannot be
    dropped directly into a ``Pipeline``/``ColumnTransformer`` step that
    receives several columns at once.  This wrapper fits one encoder per
    column and returns the integer codes as a float array.

    Attributes set by ``fit``:
        encoders_: list with one fitted ``LabelEncoder`` per input column.
        classes_: list with the ``classes_`` array of each encoder, in
            column order.
    """

    @staticmethod
    def _as_2d_array(X):
        """Return *X* as a 2-D numpy array, raising ``ValueError`` otherwise."""
        # np.asarray lets DataFrames and nested lists through as well as the
        # ndarrays normally handed over by an upstream pipeline step.
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"Expected a 2-D array of shape (n_samples, n_columns), "
                f"got an array with {X.ndim} dimension(s)."
            )
        return X

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per column of *X*.

        Args:
            X: 2-D array-like of shape (n_samples, n_columns).
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            The fitted transformer (``self``).

        Raises:
            ValueError: If *X* is not 2-dimensional.
        """
        X = self._as_2d_array(X)
        self.encoders_ = [LabelEncoder().fit(X[:, i]) for i in range(X.shape[1])]
        self.classes_ = [encoder.classes_ for encoder in self.encoders_]
        return self

    def transform(self, X):
        """Encode every column of *X* with the encoder fitted for it.

        Args:
            X: 2-D array-like with the same number of columns seen in ``fit``.

        Returns:
            A float numpy array with the same shape as *X* holding the
            per-column label codes.

        Raises:
            sklearn.exceptions.NotFittedError: If called before ``fit``.
            ValueError: If *X* is not 2-dimensional or its column count
                differs from the one seen in ``fit``.  Errors raised by
                ``LabelEncoder.transform`` itself are propagated unchanged.
        """
        check_is_fitted(self, "encoders_")
        X = self._as_2d_array(X)

        # Without this check, an input with fewer columns would be encoded
        # silently using the wrong subset of encoders.
        if X.shape[1] != len(self.encoders_):
            raise ValueError(
                f"X has {X.shape[1]} column(s), but this transformer was "
                f"fitted on {len(self.encoders_)} column(s)."
            )

        X_transformed = np.empty(X.shape, dtype=float)
        for i, encoder in enumerate(self.encoders_):
            X_transformed[:, i] = encoder.transform(X[:, i])
        return X_transformed

# 🧼 Step 6: Define Pipelines
# Numeric features: fill gaps with the column mean, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then
# encode each column with the custom per-column label encoder.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", LabelEncoderWrapper()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# 🏗️ Step 7: Combine into ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)

# 🔁 Step 8: Apply Preprocessing
X_cleaned = preprocessor.fit_transform(X)

# 🧾 Step 9: Create Final Cleaned DataFrame
# Columns are labelled in the order the transformers were registered:
# numeric block first, then the categorical block.
processed_feature_names = [*num_cols, *cat_cols]

processed_df = pd.DataFrame(
    X_cleaned, columns=processed_feature_names
).assign(Survived=y.values)

# 🖨️ Display Sample
print("\n✅ Final Cleaned Dataset:")
display(processed_df.head())

# 💾 Step 10: Save Cleaned Data
processed_df.to_csv("cleaned_titanic_data.csv", index=False)
print("\n📁 Cleaned data saved as 'cleaned_titanic_data.csv'")

Initial Shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



Missing Values:
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

✅ Final Cleaned Dataset:


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked,Survived
0,0.827377,-0.592481,0.432793,-0.473674,-0.502445,1.0,2.0,0
1,-1.566107,0.638789,0.432793,-0.473674,0.786845,0.0,0.0,1
2,0.827377,-0.284663,-0.474545,-0.473674,-0.488854,0.0,2.0,1
3,-1.566107,0.407926,0.432793,-0.473674,0.42073,0.0,2.0,1
4,0.827377,0.407926,-0.474545,-0.473674,-0.486337,1.0,2.0,0



📁 Cleaned data saved as 'cleaned_titanic_data.csv'
