In [None]:
import pandas as pd

# Location of the raw student-success dataset.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not run elsewhere until it is pointed at a local copy of the CSV.
dataset_path = r'C:\Users\Hp\Desktop\work\machine-learning-assignment-1-group-2\student_success_dataset.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(dataset_path)

# Show the first rows as a quick sanity check
df.head()

In [None]:
# Count the missing values in each column
df.isna().sum()

# Normalize/standardize numerical features

In [None]:
from sklearn.preprocessing import StandardScaler

# Numerical features to standardize (column names as used in the dataset)
numerical_cols = [
    'age',
    'previous_gpa',
    'attendance_rate',
    'participation_score',
    'assignment_completion',
    'study_hours_per_week',
    'digital_literacy',
    'previous_course_failures',
    'extracurricular_hours',
    'work_hours_per_week',
    'distance_from_campus',
]

# Fit the scaler on these columns and write the standardized values back into
# `df` (later cells read the scaled values from `df`).
# NOTE(review): the scaler is fitted on every row of `df`; if a train/test
# split happens later, confirm that fitting on the full data is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values

### Scaling Numerical Features
- StandardScaler was applied to numerical features to normalize them.
- This helps linear models perform better by ensuring all numerical features have zero mean and unit variance.

# Encode categorical variables

In [None]:
# Categorical features whose distinct levels are inspected before encoding
categorical_cols = [
    'gender',
    'socioeconomic_status',
    'family_support',
    'study_resources',
    'program_of_study',
    'first_generation_student',
]

# Report the distinct values found in each of those columns
for col in categorical_cols:
    unique_values = df[col].unique()
    print(f"Unique categories in '{col}': {unique_values}")

In [None]:
# Step 1: Encode on a copy so the scaled frame `df` is left untouched and this
# cell can be re-run safely.
df_encoded = df.copy()

# Step 2: Manual integer codes for the multi-level categorical columns.
# NOTE(review): the gender codes are arbitrary labels, yet a linear model will
# treat them as ordered magnitudes -- confirm this is intended.
gender_map = {'Male': 1, 'Female': 0, 'Non-binary': 2}
socioeconomic_map = {'Low': 0, 'Middle': 1, 'High': 2}
family_support_map = {'Low': 0, 'Medium': 1, 'High': 2}

manual_mappings = {
    'gender': gender_map,
    'socioeconomic_status': socioeconomic_map,
    'family_support': family_support_map,
}

for col, mapping in manual_mappings.items():
    # Series.map() silently turns any value absent from the mapping into NaN,
    # so fail loudly on unexpected categories instead of corrupting the column.
    # Values that are already missing are ignored here and stay missing.
    unmapped = set(df_encoded[col].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Column '{col}' has categories with no mapping: "
            f"{sorted(map(str, unmapped))}"
        )
    df_encoded[col] = df_encoded[col].map(mapping)

# Step 3: One-hot encode only the binary variable; drop_first=True omits the
# indicator column for the first level.
df_encoded = pd.get_dummies(df_encoded, columns=['first_generation_student'], drop_first=True)

# Preview the encoded frame (a bare last expression gets the rich notebook display).
df_encoded.head()

### Encoding Categorical Variables
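- `gender`, `socioeconomic_status`, and `family_support` were converted to integer codes using manual mappings (for example `Low` → 0, `Middle`/`Medium` → 1, `High` → 2).
- `first_generation_student` was one-hot encoded with `pd.get_dummies`; `drop_first=True` was used to avoid multicollinearity.
- `study_resources` and `program_of_study` are not encoded in this notebook and keep their original categorical values.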

# Document all preprocessing decisions

In [None]:
import os
# Write the encoded frame to a CSV in the working directory, without the index column
output_csv = 'student2_success_preprocessed.csv'
df_encoded.to_csv(output_csv, index=False)