In [22]:
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.read_csv("C:/Users/VIGNEESH GPL/BigData/data2/student_performance.csv")

In [23]:
# Dataset
print("Shape:", df.shape)
print("\nOriginal Columns:", df.columns.tolist())

# Normalize Column Names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

Shape: (14003, 16)

Original Columns: ['StudyHours', 'Attendance', 'Resources', 'Extracurricular', 'Motivation', 'Internet', 'Gender', 'Age', 'LearningStyle', 'OnlineCourses', 'Discussions', 'AssignmentCompletion', 'ExamScore', 'EduTech', 'StressLevel', 'FinalGrade']


In [24]:
# Handle Missing or Encoded Categorical Data / Replace empty strings with NaN
df.replace("", pd.NA, inplace=True)

# Fill missing values in 'learningstyle', 'stresslevel', 'finalgrade' with most common value
for col in ["learningstyle", "stresslevel", "finalgrade"]:
    if col in df.columns:
        if df[col].isna().sum() > 0:
            mode_value = df[col].mode(dropna=True)[0]
            df[col].fillna(mode_value, inplace=True)
            print(f"Filled missing values in '{col}' with mode: {mode_value}")

# Assign Variable
decode_maps = {
    "gender": {0: "Male", 1: "Female"},
    "resources": {0: "None", 1: "Some", 2: "Many"},
    "extracurricular": {0: "No", 1: "Yes"},
    "motivation": {0: "Low", 1: "Medium", 2: "High"},
    "discussions": {0: "Rarely", 1: "Often"},
    "edutech": {0: "No", 1: "Yes"},
    "internet": {0: "No", 1: "Yes"}, 
    "learningstyle": {1: "Visual", 2: "Auditory", 3: "Kinesthetic"},
    "stresslevel": {0: "Low", 1: "Medium", 2: "High"},
    "finalgrade": {0: "Fail", 1: "Average", 2: "Good", 3: "Excellent"}
}

for col, mapping in decode_maps.items():
    if col in df.columns:
        df[col] = df[col].map(mapping)

if "learningstyle" in df.columns:
    df["learningstyle"].fillna(df["learningstyle"].mode(dropna=True)[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["learningstyle"].fillna(df["learningstyle"].mode(dropna=True)[0], inplace=True)


In [26]:
# Summary
print(df.dtypes)
print(df.head(10))

# Save Preprocessed Dataset
output_path = r"C:\Users\VIGNEESH GPL\BigData\data2\student_performance_preprocessed.csv"
df.to_csv(output_path, index=False)
print(f"\nPreprocessed dataset saved successfully to:\n{output_path}")

studyhours               int64
attendance               int64
resources               object
extracurricular         object
motivation              object
internet                object
gender                  object
age                      int64
learningstyle           object
onlinecourses            int64
discussions             object
assignmentcompletion     int64
examscore                int64
edutech                 object
stresslevel             object
finalgrade              object
dtype: object
   studyhours  attendance resources extracurricular motivation internet  \
0          19          64      Some              No        Low      Yes   
1          19          64      Some              No        Low      Yes   
2          19          64      Some              No        Low      Yes   
3          19          64      Some             Yes        Low      Yes   
4          19          64      Some             Yes        Low      Yes   
5          19          64      Some     

In [18]:
# After saving preprocessed dataset
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Student Performance Data Report", explorative=True)
profile.to_file("C:/Users/VIGNEESH GPL/BigData/data2/preprocessing_report.html")
print("Profiling report generated successfully.")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|          | 0/16 [00:00<?, ?it/s]
[A%|███▊      | 6/16 [00:00<00:00, 23.48it/s]
100%|██████████| 16/16 [00:00<00:00, 43.83it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report generated successfully.
