# Train-test split and per-subject training data preparation

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the complete student-performance table
df = pd.read_csv('student_performance_dataset.csv')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Report the size of each partition and its share of the full dataset
print(f"Training set shape: {train_df.shape} ({len(train_df)/len(df)*100:.1f}%)")
print(f"Testing set shape: {test_df.shape} ({len(test_df)/len(df)*100:.1f}%)")

# Persist both partitions (training first, then testing) without the index column
for split_path, split_frame in (('train_dataset.csv', train_df), ('test_dataset.csv', test_df)):
    split_frame.to_csv(split_path, index=False)
print("Train and test datasets saved as 'train_dataset.csv' and 'test_dataset.csv'")

Training set shape: (724, 52) (80.0%)
Testing set shape: (181, 52) (20.0%)
Train and test datasets saved as 'train_dataset.csv' and 'test_dataset.csv'


In [3]:
import pandas as pd
import os
from nbformat import v4 as nbformat_v4
from nbformat import write as nb_write

# Build one training CSV and one EDA notebook per subject.
#
# For each subject a "<subject>_model/" folder is created containing:
#   * "<subject>_train_data.csv" - the cleaned training features plus a target column 'Y'
#   * "<subject>_eda.ipynb"      - an empty notebook (only created if it does not exist yet)

# Load the training dataset
train_df = pd.read_csv('train_dataset.csv')

# List of columns to drop (unnecessary for modeling)
columns_to_drop = [
    'Student ID', 'Div-1', 'Div-2', 'Div-3',
    'Mentor-1', 'Mentor-2', 'Mentor-3',
    'Communication Theory', 'Law Theory',
    'DE Practical', 'FSD Practical', 'Python Practical'
]

# errors='ignore' lets the cell run even when some of the listed columns are absent
train_df_cleaned = train_df.drop(columns=columns_to_drop, errors='ignore')

# Define the four subjects and their theory columns
subjects = {
    'math': 'Math-3 Theory',
    'de': 'DE Theory',
    'fsd': 'FSD Theory',
    'python': 'Python Theory'
}

# Every theory column is removed from the feature set of every subject:
# the other subjects' columns to avoid leakage, and the subject's own column
# because it is re-added as 'Y' - leaving it in would put an exact copy of the
# target among the features.
theory_columns = list(subjects.values())

# Create folders and process data for each subject
for subject, target_column in subjects.items():
    # Create folder for the subject (no error if it already exists)
    folder_name = f"{subject}_model"
    os.makedirs(folder_name, exist_ok=True)

    # Features without any theory column, plus the subject's theory marks as 'Y'.
    # drop() returns a new frame, so train_df_cleaned itself is never modified.
    subject_df = (
        train_df_cleaned
        .drop(columns=theory_columns, errors='ignore')
        .assign(Y=train_df_cleaned[target_column])
    )

    # Save the DataFrame to a CSV file in the subject's folder
    csv_path = os.path.join(folder_name, f"{subject}_train_data.csv")
    subject_df.to_csv(csv_path, index=False)
    print(f"Saved {subject}_train_data.csv in {folder_name}/")

    # Create an empty Jupyter Notebook for EDA, but never overwrite an existing
    # one: re-running this cell must not wipe EDA work that was already saved.
    nb_path = os.path.join(folder_name, f"{subject}_eda.ipynb")
    if os.path.exists(nb_path):
        print(f"Kept existing {subject}_eda.ipynb in {folder_name}/")
    else:
        nb = nbformat_v4.new_notebook()
        # Notebook files are JSON text; write them explicitly as UTF-8
        with open(nb_path, 'w', encoding='utf-8') as f:
            nb_write(nb, f)
        print(f"Created {subject}_eda.ipynb in {folder_name}/")

print("Processing complete. Four folders created with respective CSV files and EDA notebooks.")

Saved math_train_data.csv in math_model/
Created math_eda.ipynb in math_model/
Saved de_train_data.csv in de_model/
Created de_eda.ipynb in de_model/
Saved fsd_train_data.csv in fsd_model/
Created fsd_eda.ipynb in fsd_model/
Saved python_train_data.csv in python_model/
Created python_eda.ipynb in python_model/
Processing complete. Four folders created with respective CSV files and EDA notebooks.
