<a href="https://colab.research.google.com/github/Rashilajayasinghe/DomainSpecified_AI_Assistant/blob/main/Stage3_4_Data_Adding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random

# 1. Load the original dataset
# The path is specific to the Colab runtime; the filename must match the
# uploaded file exactly.
df_original = pd.read_csv('/content/sample_data/fin_synthetic_samples_cleaned (4).csv')
original_columns = df_original.columns.tolist()

# 2. Define the High-Performance parameters
SEED = 42            # fixed seed so Restart & Run All regenerates the same synthetic rows
N_SYNTHETIC = 1000   # number of high-performance records to generate
random.seed(SEED)    # seeds the global `random` state used by this and later cells

grades_pool = ['A', 'B']
stages = ['Stage 3', 'Stage 4']
behaviors = [
    'Balancing multiple courses or projects',
    'Not knowing how to prioritize task',
    'Starting tasks too late',
    'Forgetting deadlines or exam dates'
]

# Value pools sampled from the original data. They do not change while records
# are generated, so they are computed once here instead of on every iteration,
# and held as plain lists so random.choice works on an ordinary sequence.
course_decision_pool = list(df_original['choosing_new_courses_decision '].unique())
task_management_pool = list(df_original['Mng_academic_tasks'].unique())

# Grade columns are discovered from the CSV header rather than hard-coded.
grade_columns = [col for col in original_columns if col.startswith('Grade_CM')]

# 3. Generate the High-Performance records
new_records = []
for _ in range(N_SYNTHETIC):
    stage = random.choice(stages)
    # Keys mirror the CSV column names exactly, including the leading/trailing
    # spaces some of them carry.
    rec = {
        'Stage': stage,
        'hours_per_week ': random.choice(['5 - 10 hrs', 'More than 10hrs']),
        'academic_stress': random.choice([1, 2]),
        'choosing_new_courses_decision ': random.choice(course_decision_pool),
        'Mng_academic_tasks': random.choice(task_management_pool),
        'struggle_with_managing ': 'Rarely',
        'challenges_handling_workload': random.choice(behaviors),  # Only one behavior chosen
        ' start_assignments_closer_deadline': 'Rarely/Never',
        'Skill_Programming': random.randint(4, 5),
        'Skill_Math': random.randint(4, 5),
        'Skill_DS_Algo': random.randint(4, 5),
        'Skill_Technical_Comm': random.randint(4, 5),
        'Skill_Web_Development': random.randint(4, 5),
        'Predicted_Success_DL': round(random.uniform(88.0, 98.0), 2)
    }

    # Fill Grade columns with high marks
    for col in grade_columns:
        if '460' in col and stage == 'Stage 3':  # Year 4 courses for Stage 3 student
            rec[col] = 'NOT_TAKEN'
        else:
            rec[col] = random.choice(grades_pool)

    new_records.append(rec)

# 4. Create the synthetic DataFrame and Merge
df_synthetic = pd.DataFrame(new_records)

# Every original column must have been generated; fail with the offending
# names spelled out rather than an opaque indexing error.
missing_columns = [col for col in original_columns if col not in df_synthetic.columns]
if missing_columns:
    raise KeyError(
        f"Synthetic records are missing columns present in the original CSV: {missing_columns}"
    )

# Reorder to the original column order (keys not in the original are dropped here)
df_synthetic = df_synthetic[original_columns]
df_merged = pd.concat([df_original, df_synthetic], ignore_index=True)



In [None]:
import random

# Stage 3 and Stage 4 rows take exactly one of CM3602 / CM3603: one column
# receives a grade drawn from grades_pool, the other is marked 'NOT_TAKEN'.
# NOTE(review): the mask selects every Stage 3/4 row in df_merged, so any such
# rows that came from the original CSV are overwritten too -- confirm that
# this is intended and not meant for the synthetic rows only.
mask = df_merged['Stage'].isin(['Stage 3', 'Stage 4'])
n_selected = int(mask.sum())

# Draw the values row by row (same order of random calls as a per-row update),
# but collect them so each column is written back in a single assignment
# instead of two scalar .loc writes per row.
cm3602_values = []
cm3603_values = []
for _ in range(n_selected):
    # Randomly pick one course to be 'taken' and the other 'not taken'
    if random.choice([True, False]):  # True means CM3602 is taken, False means CM3603 is taken
        cm3602_values.append(random.choice(grades_pool))
        cm3603_values.append('NOT_TAKEN')
    else:
        cm3603_values.append(random.choice(grades_pool))
        cm3602_values.append('NOT_TAKEN')

# Skip the write entirely when no row matches, so nothing is touched.
if n_selected > 0:
    df_merged.loc[mask, 'Grade_CM3602'] = cm3602_values
    df_merged.loc[mask, 'Grade_CM3603'] = cm3603_values

print("Columns Grade_CM3602 and Grade_CM3603 have been updated in df_merged based on the specified condition for Stage 3 and Stage 4 records.")

Columns Grade_CM3602 and Grade_CM3603 have been updated in df_merged based on the specified condition for Stage 3 and Stage 4 records.


In [None]:
# Stage 4 rows take exactly one of CM4606 / CM4603: one column receives a
# grade drawn from grades_pool, the other is marked 'NOT_TAKEN'.
# NOTE(review): the mask selects every Stage 4 row in df_merged, so any such
# rows that came from the original CSV are overwritten too -- confirm that
# this is intended and not meant for the synthetic rows only.
mask_stage4 = df_merged['Stage'] == 'Stage 4'
n_stage4 = int(mask_stage4.sum())

# Draw the values row by row (same order of random calls as a per-row update),
# but collect them so each column is written back in a single assignment
# instead of two scalar .loc writes per row.
cm4606_values = []
cm4603_values = []
for _ in range(n_stage4):
    # Randomly pick one course to be 'taken' and the other 'not taken'
    if random.choice([True, False]):  # True means CM4606 is taken, False means CM4603 is taken
        cm4606_values.append(random.choice(grades_pool))
        cm4603_values.append('NOT_TAKEN')
    else:
        cm4603_values.append(random.choice(grades_pool))
        cm4606_values.append('NOT_TAKEN')

# Skip the write entirely when no row matches, so nothing is touched.
if n_stage4 > 0:
    df_merged.loc[mask_stage4, 'Grade_CM4606'] = cm4606_values
    df_merged.loc[mask_stage4, 'Grade_CM4603'] = cm4603_values

print("Columns Grade_CM4606 and Grade_CM4603 have been updated in df_merged based on the specified condition for Stage 4 records.")

Columns Grade_CM4606 and Grade_CM4603 have been updated in df_merged based on the specified condition for Stage 4 records.


In [None]:
# 5. Write the merged dataset to CSV (the DataFrame index is not written)
output_csv = 'final_synthetic_dataset_new.csv'
df_merged.to_csv(path_or_buf=output_csv, index=False)

# 6. (Optional) When running in Google Colab, offer the file as a browser download
try:
    from google.colab import files
    files.download(output_csv)
    print("Download triggered.")
except ImportError:
    # google.colab could not be imported, so only report where the file was written
    print("File saved as 'final_synthetic_dataset_new.csv' in your local directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download triggered.
