### Import Lybrary

In [14]:
import pandas as pd
import numpy as np

In [None]:
file_list = [
    r'C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv',
    r'C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv',
    r'C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv',
    r'C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv' 
]
df_national_examination_board = pd.read_excel(r"C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\danh_sach_hoi_dong_thi.xlsx")

class NationalHighSchoolExamScore:
    def __init__(self, file_paths,df_national_examination_board):
        self.file_paths = file_paths
        self.dataframes = []
        self.df_national_examination_board = df_national_examination_board

    def read_data(self):
        for path in self.file_paths:
            try:
                df = pd.read_csv(path)
                self.dataframes.append((path, df))
                print(f"Complete reading the file: {path}")
            except Exception as e:
                print(f"Unable to read file {path}: {e}")

    def check_data(self):
        if not self.dataframes:
            print("No data available")
            return

        for (path, df) in self.dataframes:
            print(f"\nCheck data: {path}")
            print(f"Row numbers: {df.shape[0]}, Column numbers: {df.shape[1]}")
            print("Columns:", list(df.columns))
            print("Duplicate Values:", df.duplicated().sum())
            print("-" * 60)

    def remove_duplicate(self):
        if not self.dataframes:
            print("No data to process.")
            return

        for i, (path, df) in enumerate(self.dataframes):
            before = df.shape[0]
            df_cleaned = df.drop_duplicates()
            after = df_cleaned.shape[0]
            self.dataframes[i] = (path, df_cleaned)
            print(f"\nRemove {before - after} duplicate rows from file: {path}")
            
    def drop_specific_columns(self, columns_to_drop_by_file):
        if not self.dataframes:
            print("No data to process.")
            return
        
        for i, (path, df) in enumerate(self.dataframes):
            for keyword, columns in columns_to_drop_by_file.items():
                if keyword in path:
                    existing_cols = [col for col in columns if col in df.columns]
                    df = df.drop(columns=existing_cols)
                    self.dataframes[i] = (path, df)
                    print(f"\nDropped columns from file {path}: {existing_cols if existing_cols else 'No matching columns found'}")
                    break 
    def rename_columns(self, rename_rules_by_file):
        if not self.dataframes:
            print("No data to process.")
            return

        for i, (path, df) in enumerate(self.dataframes):
            for keyword, rename_map in rename_rules_by_file.items():
                if keyword in path:
                    existing_renames = {old: new for old, new in rename_map.items() if old in df.columns}
                    df = df.rename(columns=existing_renames)
                    self.dataframes[i] = (path, df)
                    print(f"\nRenamed columns in file {path}: {existing_renames if existing_renames else 'No matching columns to rename'}")
                    break
    def add_column_code_year_khtn_khxh(self):
        if not self.dataframes:
            print("No data to process.")
            return

        start_year = 2020

        for i, (path, df) in enumerate(self.dataframes):
            print(f"\nProcessing file: {path}")

            # Thêm cột 'code' nếu chưa có
            if 'code' not in df.columns:
                if 'sbd' in df.columns:
                    try:
                        df['code'] = df['sbd'].astype(str).str[:2].astype(int)
                        print("Added column 'code'")
                    except Exception as e:
                        print(f"Error adding 'code': {e}")
                else:
                    print("'sbd' column not found.")
            else:
                print("'code' already exists.")

            # Thêm cột 'year' nếu chưa có
            if 'year' not in df.columns:
                df['year'] = start_year + i
                print(f"Added column 'year' = {start_year + i}")

            # Thêm cột 'khtn'
            cols_khtn = ['vat_li', 'hoa_hoc', 'sinh_hoc']
            if all(col in df.columns for col in cols_khtn):
                df['khtn'] = df[cols_khtn].sum(axis=1)
                df['khtn'] = df['khtn'].where(df[cols_khtn].notnull().all(axis=1))
                print("Added column 'khtn'")
            else:
                print(f"Missing KHTN columns: {', '.join([col for col in cols_khtn if col not in df.columns])}")

            # Thêm cột 'khxh'
            cols_khxh = ['lich_su', 'dia_li', 'gdcd']
            if all(col in df.columns for col in cols_khxh):
                df['khxh'] = df[cols_khxh].sum(axis=1)
                df['khxh'] = df['khxh'].where(df[cols_khxh].notnull().all(axis=1))
                print("Added column 'khxh'")
            else:
                print(f"Missing KHXH columns: {', '.join([col for col in cols_khxh if col not in df.columns])}")

            # Cập nhật lại
            self.dataframes[i] = (path, df)



    def check_data_column_year(self):
        if not self.dataframes:
            print("No data to process")
            return
        for path, df in self.dataframes:
            print(df["year"].unique())

    def reorder_all_columns(self):
            if not self.dataframes:
                print("No data to process.")
                return

            desired_order = [
                'sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc',
                'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu','code', 'khtn',
                'khxh', 'year'
            ]

            for i, (path, df) in enumerate(self.dataframes):
                existing = [col for col in desired_order if col in df.columns]
                remaining = [col for col in df.columns if col not in desired_order]
                new_order = existing + remaining
                df = df[new_order]
                self.dataframes[i] = (path, df)
                print(f"Successfully reordered columns for the file: {path}")
        
    def concat_all(self, save_path="C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\processed.csv"):
        if not self.dataframes:
            print("No data available to concatenate.")
            return None

        try:
            all_dfs = [df for _, df in self.dataframes]
            df_concat = pd.concat(all_dfs, ignore_index=True)
            df_concat.to_csv(save_path, index=False, encoding='utf-8-sig')
            print(f"Successfully concatenated {len(all_dfs)} DataFrames.")
            print(f"File saved to: {save_path}")
            return df_concat
        except Exception as e:
            print(f"Error during concatenation: {e}")
            return None



In [16]:
data = NationalHighSchoolExamScore(file_list,df_national_examination_board)

#### Read Data CSV

In [17]:
data.read_data()

  df = pd.read_csv(path)


Complete reading the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Complete reading the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Complete reading the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Complete reading the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv


#### Check Information Data

In [18]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1857877, Column numbers: 16
Columns: ['SBD', 'Tên', 'Ngày Sinh', 'Giới tính', 'Toán', 'Văn', 'Lý', 'Hoá', 'Sinh', 'Lịch Sử', 'Địa Lý', 'GDCD', 'Ngoại Ngữ', 'Year', 'code', 'province']
Duplicate Values: 4591
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 11
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ma_ngoai_ngu']
Duplicate Values: 0
-----------

#### Remove Lines Duplicate

In [19]:
data.remove_duplicate()


Remove 4591 duplicate rows from file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv

Remove 0 duplicate rows from file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv

Remove 0 duplicate rows from file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv

Remove 0 duplicate rows from file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv


#### Check Information Data

In [20]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 16
Columns: ['SBD', 'Tên', 'Ngày Sinh', 'Giới tính', 'Toán', 'Văn', 'Lý', 'Hoá', 'Sinh', 'Lịch Sử', 'Địa Lý', 'GDCD', 'Ngoại Ngữ', 'Year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 11
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ma_ngoai_ngu']
Duplicate Values: 0
--------------

#### Drop Columns Specific

In [21]:
columns_to_drop = {
    "2020_2021": ["Tên", "Ngày Sinh", "Giới tính"],
    "2023": ["ma_ngoai_ngu"],
    "2024": ["ma_ngoai_ngu"]
}
data.drop_specific_columns(columns_to_drop)


Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv: ['Tên', 'Ngày Sinh', 'Giới tính']

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv: No matching columns found

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv: ['ma_ngoai_ngu']

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv: ['ma_ngoai_ngu']


#### Check Information Data

In [22]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 13
Columns: ['SBD', 'Toán', 'Văn', 'Lý', 'Hoá', 'Sinh', 'Lịch Sử', 'Địa Lý', 'GDCD', 'Ngoại Ngữ', 'Year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

C

#### Rename Column

In [23]:
data.rename_columns({
    "2020_2021": {"SBD": "sbd", "Toán": "toan", "Văn": "ngu_van", "Ngoại Ngữ": "ngoai_ngu", "Lý": "vat_li", "Hoá": "hoa_hoc", 
                  "Sinh": "sinh_hoc", "Lịch Sử": "lich_su", "Địa Lý": "dia_li", "GDCD": "gdcd", "Year": "year"}
})



Renamed columns in file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv: {'SBD': 'sbd', 'Toán': 'toan', 'Văn': 'ngu_van', 'Ngoại Ngữ': 'ngoai_ngu', 'Lý': 'vat_li', 'Hoá': 'hoa_hoc', 'Sinh': 'sinh_hoc', 'Lịch Sử': 'lich_su', 'Địa Lý': 'dia_li', 'GDCD': 'gdcd', 'Year': 'year'}


#### Check Information Data

In [24]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 13
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
-----------------------------------------------

#### Add Columns Code And Year

In [25]:
data.add_column_code_year_khtn_khxh()


📄 Processing file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
ℹ️ 'code' already exists.
✅ Added column 'khtn'
✅ Added column 'khxh'

📄 Processing file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
✅ Added column 'code'
✅ Added column 'year' = 2021
✅ Added column 'khtn'
✅ Added column 'khxh'

📄 Processing file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
✅ Added column 'code'
✅ Added column 'year' = 2022
✅ Added column 'khtn'
✅ Added column 'khxh'

📄 Processing file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv
✅ Added column 'code'
✅ Added column 'year' = 2023
✅ Added column 'khtn'
✅ Added column 'khxh'


#### Check Information Data

In [26]:
data.check_data_column_year()

[2020 2021]
[2021]
[2022]
[2023]


#### Reorder The Columns

In [27]:
data.reorder_all_columns()

Successfully reordered columns for the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Successfully reordered columns for the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Successfully reordered columns for the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Successfully reordered columns for the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv


#### Check Information Data

In [28]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 15
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'code', 'khtn', 'khxh', 'year', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 14
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'code', 'khtn', 'khxh', 'year']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 14
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'code', 'khtn', 'kh

In [29]:
data.drop_specific_columns({
    "2020_2021": ["province"]
})


Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv: ['province']


#### Concat Table

In [30]:
df_national_high_school_exam_score = data.concat_all()

Successfully concatenated 4 DataFrames.
File saved to: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\processed.csv


#### Check Information Data

In [31]:
df_national_high_school_exam_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932392 entries, 0 to 4932391
Data columns (total 14 columns):
 #   Column     Dtype  
---  ------     -----  
 0   sbd        int64  
 1   toan       float64
 2   ngu_van    float64
 3   vat_li     float64
 4   hoa_hoc    float64
 5   sinh_hoc   float64
 6   lich_su    float64
 7   dia_li     float64
 8   gdcd       float64
 9   ngoai_ngu  float64
 10  code       int64  
 11  khtn       float64
 12  khxh       float64
 13  year       int64  
dtypes: float64(11), int64(3)
memory usage: 526.8 MB


#### Rename column danh_sach_hoi_dong_thi.xlsx

In [32]:
df_national_examination_board.rename(columns={
    "Mã hội đồng": "code",
    "Tên hội đồng thi": "national examination board",
    "Tên Tỉnh": "province"
}, inplace=True)

df_national_examination_board.to_csv(r"C:\FPT Polytechnic\Graduation_Project\Data\Processed\national_examination_board.csv", index=False)
df_region_of_vietnam = pd.read_excel(r"C:\FPT Polytechnic\Graduation_Project\Data\Raw\Regions_of_VietNam.xlsx")
df_national_examination_board_transform = pd.merge(df_national_examination_board, df_region_of_vietnam, how="left", left_on='province', right_on='Province')
df_national_examination_board_transform.info()
df_national_examination_board_transform.drop(columns='Province', inplace=True)
df_national_examination_board_transform.rename(columns={'Regions': 'regions'}, inplace=True)
df_national_examination_board_transform.to_csv(r'C:\FPT Polytechnic\Graduation_Project\Data\Processed\national_examination_board_transform.csv', index=False)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   code                        64 non-null     int64 
 1   national examination board  64 non-null     object
 2   province                    64 non-null     object
 3   Province                    63 non-null     object
 4   Regions                     63 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.6+ KB


In [33]:
df_national_high_school_exam_score

Unnamed: 0,sbd,toan,ngu_van,vat_li,hoa_hoc,sinh_hoc,lich_su,dia_li,gdcd,ngoai_ngu,code,khtn,khxh,year
0,18014547,6.4,6.75,,,,4.75,7.00,6.50,4.2,18,,18.25,2020
1,18014530,7.6,6.00,,,,3.75,7.75,7.75,2.8,18,,19.25,2020
2,18014521,4.8,4.75,,,,4.00,6.50,,,18,,,2020
3,18014517,8.0,7.00,,,,8.25,8.00,9.50,5.8,18,,25.75,2020
4,18014523,8.2,6.50,8.00,8.5,5.0,,,,4.0,18,21.50,,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4932387,64007282,4.2,6.00,,,,5.25,6.25,7.50,3.0,64,,19.00,2023
4932388,64007283,5.2,6.00,,,,6.25,5.75,7.75,3.6,64,,19.75,2023
4932389,64007284,7.2,8.00,6.25,7.0,8.0,,,,7.0,64,21.25,,2023
4932390,64007285,5.2,5.25,,,,4.75,6.50,8.75,4.4,64,,20.00,2023


In [34]:
df_national_high_school_exam_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932392 entries, 0 to 4932391
Data columns (total 14 columns):
 #   Column     Dtype  
---  ------     -----  
 0   sbd        int64  
 1   toan       float64
 2   ngu_van    float64
 3   vat_li     float64
 4   hoa_hoc    float64
 5   sinh_hoc   float64
 6   lich_su    float64
 7   dia_li     float64
 8   gdcd       float64
 9   ngoai_ngu  float64
 10  code       int64  
 11  khtn       float64
 12  khxh       float64
 13  year       int64  
dtypes: float64(11), int64(3)
memory usage: 526.8 MB
