<a href="https://colab.research.google.com/github/Uttamdevsharma/ML_ALL_MODEL/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Sample dataset
data = {
    'ID': range(1, 11),
    'Age': [25, 30, np.nan, 22, 45, 30, 28, 50, np.nan, 35],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male'],
    'City': ['New York', 'London', 'Paris', 'New York', 'London', 'Paris', 'New York', 'London', 'Paris', 'New York'],
    'Income': [50000, 60000, 75000, 45000, 90000, 62000, 53000, 150000, 70000, np.nan],
    'Experience_Years': [3, 7, 2, 1, 10, 5, 4, 15, 6, 8],
    'Has_Degree': ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes'],
    'Favorite_Color': ['Blue', 'Green', 'Red', 'blue', 'Green', 'RED', 'Blue', 'Green', 'Red', np.nan]
}

In [5]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,ID,Age,Gender,City,Income,Experience_Years,Has_Degree,Favorite_Color
0,1,25.0,Male,New York,50000.0,3,Yes,Blue
1,2,30.0,Female,London,60000.0,7,No,Green
2,3,,Male,Paris,75000.0,2,Yes,Red
3,4,22.0,Female,New York,45000.0,1,Yes,blue
4,5,45.0,Male,London,90000.0,10,No,Green


In [6]:
df['Favorite_Color'] = df['Favorite_Color'].str.lower()

In [7]:
df.head()

Unnamed: 0,ID,Age,Gender,City,Income,Experience_Years,Has_Degree,Favorite_Color
0,1,25.0,Male,New York,50000.0,3,Yes,blue
1,2,30.0,Female,London,60000.0,7,No,green
2,3,,Male,Paris,75000.0,2,Yes,red
3,4,22.0,Female,New York,45000.0,1,Yes,blue
4,5,45.0,Male,London,90000.0,10,No,green


In [8]:
df = df.drop(columns=['ID'])

In [9]:
df.head()

Unnamed: 0,Age,Gender,City,Income,Experience_Years,Has_Degree,Favorite_Color
0,25.0,Male,New York,50000.0,3,Yes,blue
1,30.0,Female,London,60000.0,7,No,green
2,,Male,Paris,75000.0,2,Yes,red
3,22.0,Female,New York,45000.0,1,Yes,blue
4,45.0,Male,London,90000.0,10,No,green


In [10]:
# Identify feature types
numeric_features = ['Age', 'Income', 'Experience_Years']
categorical_features = ['Gender', 'City', 'Has_Degree', 'Favorite_Color']

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [12]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


In [13]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [15]:
from sklearn.compose import ColumnTransformer

In [16]:
transformer=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
]

In [22]:
from sklearn.compose import ColumnTransformer

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])



In [25]:
# Apply transformations
X_processed = preprocessor.fit_transform(df)

# Extract final feature names
ohe_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
all_feature_names = list(numeric_features) + list(ohe_feature_names)

# Convert to DataFrame
X_processed_df = pd.DataFrame(X_processed, columns=all_feature_names)
print("\n−−− Fully Preprocessed Data (Features) −−−")
print(X_processed_df)


−−− Fully Preprocessed Data (Features) −−−
        Age    Income  Experience_Years  Gender_Female  Gender_Male  \
0 -0.909174 -0.751495         -0.782619            0.0          1.0   
1 -0.303058 -0.405184          0.227212            1.0          0.0   
2 -0.303058  0.114283         -1.035076            0.0          1.0   
3 -1.272844 -0.924651         -1.287534            1.0          0.0   
4  1.515291  0.633749          0.984585            0.0          1.0   
5 -0.303058 -0.335922         -0.277703            1.0          0.0   
6 -0.545505 -0.647602         -0.530161            0.0          1.0   
7  2.121407  2.711616          2.246873            1.0          0.0   
8 -0.303058 -0.058873         -0.025246            1.0          0.0   
9  0.303058 -0.335922          0.479670            0.0          1.0   

   City_London  City_New York  City_Paris  Has_Degree_No  Has_Degree_Yes  \
0          0.0            1.0         0.0            0.0             1.0   
1          1.0        

In [27]:
from sklearn.preprocessing import LabelEncoder

# Define the target variable 'y' (assuming 'Has_Degree' is the target)
y = df['Has_Degree']

# Encode target if needed
label_encoder = LabelEncoder()
y_processed = label_encoder.fit_transform(y)

print("\n−−− Processed Target Variable −−−")
print(y_processed)
print(f"Classes: {label_encoder.classes_}")


−−− Processed Target Variable −−−
[1 0 1 1 0 1 1 1 0 1]
Classes: ['No' 'Yes']
