<a href="https://colab.research.google.com/github/NishchalSingh-Sudo/DataScienceEcosystem/blob/main/Dummy_Data_Preprocessing_Pipeline_and_Correction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

# Build a small synthetic dataset: 100 random rows plus 2 hand-written rows
# that inject one missing value and one extreme value per affected column.
np.random.seed(0)  # fixed seed so the generated values are reproducible

# The random draws below consume the seeded global stream in this order
# (normal -> randint -> choice); reordering them would change the data.
dummy_data = dict(
    Feature1=np.random.normal(100, 10, 100).tolist() + [np.nan, 200],  # ~N(100, 10), then one NaN and one outlier
    Feature2=np.random.randint(0, 100, 102).tolist(),  # random integers in [0, 100)
    Category=['A', 'B', 'C', 'D'] * 25 + [np.nan, 'A'],  # four labels cycled, then one NaN and one 'A'
    Target=np.random.choice([0, 1], 102).tolist(),  # binary target variable
)

# Wrap the column dictionary in a DataFrame
df_dummy = pd.DataFrame(data=dummy_data)

# Preview the first five rows
df_dummy.head()

Unnamed: 0,Feature1,Feature2,Category,Target
0,117.640523,32,A,1
1,104.001572,70,B,1
2,109.78738,85,C,0
3,122.408932,31,D,1
4,118.67558,13,A,0


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from scipy import stats

# 1. Load the data
def load_data(df):
    """Return an independent copy of ``df``.

    Working on a copy keeps the caller's DataFrame untouched by the
    later preprocessing steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``df``.
    """
    duplicate = df.copy(deep=True)
    return duplicate

# 2. Handle missing values
def _fill_column(col):
    """Fill the gaps of a single column: mean for numeric data, mode otherwise."""
    if not col.isna().any():
        return col  # nothing to fill
    if pd.api.types.is_numeric_dtype(col):
        fill_value = col.mean()
    else:
        modes = col.mode(dropna=True)
        if modes.empty:
            # Every value is missing, so there is no "most frequent" value to use.
            return col
        fill_value = modes.iloc[0]
    if pd.isna(fill_value):
        # e.g. an all-NaN numeric column: the mean is NaN, filling would be a no-op.
        return col
    return col.fillna(fill_value)


def handle_missing_values(df):
    """Impute missing values column by column.

    Numeric columns are filled with their mean; every other column is
    filled with its most frequent value (the first mode).

    The numeric check uses ``pd.api.types.is_numeric_dtype`` rather than
    ``dtype != 'object'`` so that non-numeric, non-object columns (for
    example ``category`` or pandas string dtypes) are imputed with the mode
    instead of failing in ``mean()``. Columns that are entirely missing are
    returned unchanged instead of raising on an empty mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Data that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index; ``df`` is not modified.
    """
    return df.apply(_fill_column)

# 3. Remove outliers using Z-score
def remove_outliers(df, z_thresh=3):
    """Drop rows whose numeric features lie too far from the column mean.

    A row is removed when the absolute z-score of any numeric column
    (other than ``'Target'``) is greater than or equal to ``z_thresh``.

    Undefined z-scores are treated as "not an outlier". They arise for
    constant columns (zero standard deviation) and for missing values.
    Without this, a single constant or NaN-containing column would make
    every comparison False and silently drop all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    z_thresh : float, default 3
        Rows are kept only if every defined |z| is strictly below this value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` restricted to the kept rows; the original index
        labels are preserved (the index is not reset).
    """
    df_copy = df.copy()
    numeric_cols = df_copy.select_dtypes(include='number').columns.drop('Target', errors='ignore')  # exclude Target

    # Nothing to score: no rows, or no numeric feature columns.
    if df_copy.empty or len(numeric_cols) == 0:
        return df_copy

    values = df_copy[numeric_cols].to_numpy(dtype=float, na_value=np.nan)
    # nan_policy='omit' computes mean/std from the non-missing entries only.
    # The errstate guard silences the 0/0 warning for constant columns.
    with np.errstate(invalid='ignore', divide='ignore'):
        z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))

    keep = (np.isnan(z_scores) | (z_scores < z_thresh)).all(axis=1)
    return df_copy.loc[keep]

# 4. Scale numeric features (excluding Target)
def scale_data(df):
    """Min-max scale the numeric feature columns, leaving ``'Target'`` untouched.

    Each numeric column other than ``'Target'`` is transformed with
    scikit-learn's ``MinMaxScaler`` (default settings). Non-numeric columns
    are passed through unchanged.

    If the frame has no rows or no numeric feature columns, a plain copy is
    returned. ``MinMaxScaler.fit_transform`` raises ``ValueError`` on such
    input, which can happen when an earlier step filtered out every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the numeric feature columns rescaled.
    """
    df_copy = df.copy()
    numeric_cols = df_copy.select_dtypes(include='number').columns.drop('Target', errors='ignore')

    # Guard: the scaler cannot be fitted on zero samples or zero features.
    if df_copy.empty or len(numeric_cols) == 0:
        return df_copy

    df_copy[numeric_cols] = MinMaxScaler().fit_transform(df_copy[numeric_cols])
    return df_copy

# 5. Encode categorical variables
def encode_categorical(df, categorical_cols):
    """Replace the listed categorical columns with integer label codes.

    A fresh ``LabelEncoder`` is fitted for each column, so codes are
    assigned independently per column. The fitted encoders are not
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    categorical_cols : iterable of column labels
        Columns to encode; each must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns encoded; ``df`` is not modified.
    """
    encoded = df.copy()
    for name in categorical_cols:
        # One encoder per column: the label -> code mapping is column-specific.
        encoded[name] = LabelEncoder().fit_transform(encoded[name])
    return encoded

# ====== Pipeline Execution ======

# Run the preprocessing steps in order on a copy of df_dummy (the original
# DataFrame built earlier): impute -> drop outliers -> scale -> encode.
df_preprocessed = (
    load_data(df_dummy)
    .pipe(handle_missing_values)
    .pipe(remove_outliers)
    .pipe(scale_data)
    .pipe(encode_categorical, ['Category'])
)

# Show result
print(df_preprocessed.head())


   Feature1  Feature2  Category  Target
0  0.895142  0.329897         0       1
1  0.612337  0.721649         1       1
2  0.732307  0.876289         2       0
3  0.994016  0.319588         3       1
4  0.916604  0.134021         0       0


In [8]:
def save_data(df, path):
    """Write ``df`` to ``path`` as a CSV file.

    Defined here because no ``save_data`` helper exists anywhere else in the
    notebook, so the call below raised ``NameError`` on a fresh kernel.

    The row index is not written: after outlier removal it is just the
    leftover positional labels and carries no information.
    NOTE(review): the original helper's definition is not visible, so
    ``index=False`` is an assumption -- confirm it matches the intended format.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to persist.
    path : str or path-like
        Destination file.
    """
    df.to_csv(path, index=False)


# Save the cleaned and preprocessed DataFrame to a CSV file
save_data(df_preprocessed, 'preprocessed_dummy_data.csv')

print('Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv')

Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv


In [9]:
# Sanity check: number of missing values left in each column
missing_per_column = df_preprocessed.isna().sum()
print(missing_per_column)

Feature1    0
Feature2    0
Category    0
Target      0
dtype: int64


In [10]:
# Preview the first five rows of the preprocessed data
preview_rows = df_preprocessed.head(n=5)
print(preview_rows)

   Feature1  Feature2  Category  Target
0  0.895142  0.329897         0       1
1  0.612337  0.721649         1       1
2  0.732307  0.876289         2       0
3  0.994016  0.319588         3       1
4  0.916604  0.134021         0       0


In [11]:
# Summary statistics for every column of the preprocessed data
summary_stats = df_preprocessed.describe()
print(summary_stats)

         Feature1    Feature2    Category      Target
count  101.000000  101.000000  101.000000  101.000000
mean     0.541968    0.474533    1.485149    0.534653
std      0.208995    0.279868    1.127953    0.501285
min      0.000000    0.000000    0.000000    0.000000
25%      0.397837    0.288660    0.000000    0.000000
50%      0.554594    0.422680    1.000000    1.000000
75%      0.680542    0.711340    2.000000    1.000000
max      1.000000    1.000000    3.000000    1.000000


In [12]:
# List the column labels of the preprocessed data
column_index = df_preprocessed.columns
print(column_index)

Index(['Feature1', 'Feature2', 'Category', 'Target'], dtype='object')
