<a href="https://colab.research.google.com/github/VishalMaheshwaran/MLDA-/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv("student_exam_scores.csv", encoding="latin1")

# Display the first few rows of the dataset
print(data.head())

# Drop student_id (not useful for ML)
data = data.drop(columns=["student_id"])

# Numeric features (all remaining except target)
num_features = ["hours_studied", "sleep_hours", "attendance_percent", "previous_scores"]

# Numeric transformer
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Preprocessor (only numeric)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features)
    ]
)
preprocessor.set_output(transform="pandas")

# Apply transformations to features
X = preprocessor.fit_transform(data)

# Output (target) → exam_score
y = data[["exam_score"]]

print("Processed Features:")
print(X.head())

print("Target Variable:")
print(y.head())

# 2. Feature Engineering example: efficiency score (previous vs current)
X["efficiency_score"] = data["exam_score"] / (data["previous_scores"] + 1)

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display processed training data
print("Training Features:")
print(X_train.head())
print("Training Target:")
print(y_train.head())

  student_id  hours_studied  sleep_hours  attendance_percent  previous_scores  \
0       S001            8.0          8.8                72.1               45   
1       S002            1.3          8.6                60.7               55   
2       S003            4.0          8.2                73.7               86   
3       S004            3.5          4.8                95.1               66   
4       S005            9.1          6.4                89.8               71   

   exam_score  
0        30.2  
1        25.0  
2        35.8  
3        34.0  
4        40.3  
Processed Features:
   num__hours_studied  num__sleep_hours  num__attendance_percent  \
0            0.520154          1.458426                -0.192061   
1           -1.561083          1.324503                -0.994074   
2           -0.722376          1.056656                -0.079498   
3           -0.877692         -1.220042                 1.426035   
4            0.861850         -0.148655                 1