In [7]:
import pandas as pd
import random
# Load the raw exam-score table from the comma-separated file and preview it.
df = pd.read_csv(filepath_or_buffer='diem_thi_thpt_2024.csv', sep=',')

df.head(n=5)

Unnamed: 0,sbd,toan,ngu_van,ngoai_ngu,vat_li,hoa_hoc,sinh_hoc,lich_su,dia_li,gdcd,ma_ngoai_ngu
0,1000001,8.4,6.75,8.0,6.0,5.25,5.0,,,,N1
1,1000002,8.6,8.5,7.2,,,,7.25,6.0,8.0,N1
2,1000003,8.2,8.75,8.2,,,,7.25,7.25,8.75,N1
3,1000004,4.8,8.5,7.4,,,,7.0,6.0,7.5,N1
4,1000005,8.6,9.0,7.8,,,,9.0,8.75,8.5,N1


In [8]:
#Cleaning the data
def vietnamese_to_english(column_name):
    """Translate a Vietnamese column name into its English counterpart.

    Args:
        column_name: Column label to translate.

    Returns:
        The English label when `column_name` has an entry in the lookup
        table; otherwise `column_name` itself, unchanged.
    """
    pairs = (
        ('sbd', 'student_id'),
        ('toan', 'math'),
        ('ngu_van', 'literature'),
        ('ngoai_ngu', 'english'),
        ('vat_li', 'physics'),
        ('hoa_hoc', 'chemistry'),
        ('sinh_hoc', 'biology'),
        ('lich_su', 'history'),
        ('dia_li', 'geography'),
        ('gdcd', 'civic_education'),
        ('ma_ngoai_ngu', 'foreign_language_code'),
    )
    lookup = dict(pairs)
    if column_name in lookup:
        return lookup[column_name]
    # Unknown labels pass through untouched.
    return column_name

# Relabel every column through the translation helper, keeping column order.
df.columns = list(map(vietnamese_to_english, df.columns))

In [9]:
# Function to check if a student is in social science
def is_social_science(row):
    """Return True when `row` has non-missing civic education, history and geography scores.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.
    """
    required = ('civic_education', 'history', 'geography')
    # all() stops at the first missing score, checking in the listed order.
    return all(pd.notna(row[subject]) for subject in required)

# Function to check if a student is in natural sciences
def is_natural_science(row):
    """Return True when `row` has non-missing chemistry, physics and biology scores.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.
    """
    required = ('chemistry', 'physics', 'biology')
    # all() stops at the first missing score, checking in the listed order.
    return all(pd.notna(row[subject]) for subject in required)

# Split students by track with a vectorised null check instead of a row-wise
# df.apply(..., axis=1): the mask is True only where all three track subjects
# are present. This yields the same rows, avoids a Python-level call per row,
# and still produces a boolean Series when df is empty.
# .copy() gives each track its own frame, independent of df.
social_science = df[df[['civic_education', 'history', 'geography']].notna().all(axis=1)].copy()
natural_science = df[df[['chemistry', 'physics', 'biology']].notna().all(axis=1)].copy()

In [4]:
# Define subject columns for each category
# Columns kept for both tracks (identifier plus the three shared exams).
common_subjects = [
    'student_id',
    'math',
    'literature',
    'english',
]
# Track-specific exam columns.
science_subjects = [
    'physics',
    'chemistry',
    'biology',
]
social_subjects = [
    'history',
    'geography',
    'civic_education',
]

# Keep only the shared columns plus each track's own subjects, in that order.
natural_science = natural_science[[*common_subjects, *science_subjects]]
social_science = social_science[[*common_subjects, *social_subjects]]

In [10]:
# Count missing values per column for the social-science frame.
social_science.isna().sum(axis=0)

student_id                    0
math                       1849
literature                   84
english                    8030
physics                  583106
chemistry                583106
biology                  583106
history                       0
geography                     0
civic_education               0
foreign_language_code      8030
dtype: int64

In [11]:
# Count missing values per column for the natural-science frame.
natural_science.isna().sum(axis=0)

student_id                    0
math                          7
literature                 2714
english                   12178
physics                       0
chemistry                     0
biology                       0
history                  339787
geography                339787
civic_education          339787
foreign_language_code     12178
dtype: int64

In [12]:
# Handle English missing values
# Track-specific exam columns (read again by the rounding step further down).
science_subjects = [
    'physics',
    'chemistry',
    'biology',
]
social_subjects = [
    'history',
    'geography',
    'civic_education',
]

def handle_english(df, seed=None):
    """Fill missing 'english' scores with randomly chosen placeholder values.

    Every missing entry in the 'english' column is replaced by a value drawn
    uniformly from a fixed list of candidate scores (8.4 to 9.4 in 0.2 steps).
    Non-missing entries are left untouched.

    Args:
        df: DataFrame with an 'english' column. It is modified in place.
        seed: Optional seed for a private random generator, making the
            imputed values reproducible. When None (the default), values are
            drawn from the module-level `random` generator.

    Returns:
        The same DataFrame object that was passed in.
    """
    possible_values = [8.4, 8.6, 8.8, 9.0, 9.2, 9.4]
    # A private generator keeps a seeded call from disturbing global RNG state;
    # without a seed the shared `random` module is used.
    rng = random if seed is None else random.Random(seed)

    missing = df['english'].isna()
    n_missing = int(missing.sum())
    # Draw only for rows that need a value, in row order, rather than
    # running a lambda over the entire column.
    if n_missing:
        df.loc[missing, 'english'] = [rng.choice(possible_values) for _ in range(n_missing)]
    return df

# Seed the module-level RNG first: handle_english draws its placeholder
# scores through `random`, so without a fixed seed the saved CSVs would
# differ on every Restart & Run All.
random.seed(42)
social_science = handle_english(social_science)
natural_science = handle_english(natural_science)

In [13]:
# Fill missing values with 7.00 (Pure Estimate)
# Every value still missing in either frame becomes the placeholder 7.00;
# both frames are modified in place.
natural_science.fillna(value=7.00, inplace=True)
social_science.fillna(value=7.00, inplace=True)

In [14]:
# Round all scores to 2 decimal places
score_columns = ['math', 'literature', 'english'] + social_subjects + science_subjects
# The loop variable is deliberately not called `df`: that name holds the raw
# table loaded at the top of the notebook, and rebinding it here would leave
# it pointing at natural_science after this cell runs.
for frame in (social_science, natural_science):
    for col in score_columns:
        # Each frame may carry only a subset of the score columns.
        if col in frame.columns:
            frame[col] = frame[col].round(2)

# Save the data
# The paths are named once so the confirmation message below always reports
# the files that were actually written (it previously named different files).
social_science_path = 'social_science_students.csv'
natural_science_path = 'natural_science_students.csv'
social_science.to_csv(social_science_path, index=False)
natural_science.to_csv(natural_science_path, index=False)
print(f"\nData has been saved to '{social_science_path}' and '{natural_science_path}'")


Data has been saved to 'social_studies_students.csv' and 'natural_sciences_students.csv'
