In [None]:
import pandas as pd

file_list = [
    r'C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv',
    r'C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv',
    r'C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv',
    r'C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv' 
]

class NationalHighSchoolExamScore:
    def __init__(self, file_paths):
        self.file_paths = file_paths
        self.dataframes = []

    def read_data(self):
        for path in self.file_paths:
            try:
                df = pd.read_csv(path)
                self.dataframes.append((path, df))
                print(f"Complete reading the file: {path}")
            except Exception as e:
                print(f"Unable to read file {path}: {e}")

    def check_data(self):
        if not self.dataframes:
            print("No data available")
            return

        for (path, df) in self.dataframes:
            print(f"\nCheck data: {path}")
            print(f"Row numbers: {df.shape[0]}, Column numbers: {df.shape[1]}")
            print("Columns:", list(df.columns))
            print("Duplicate Values:", df.duplicated().sum())
            print("-" * 60)

    def remove_duplicate(self):
        if not self.dataframes:
            print("No data to process.")
            return

        for i, (path, df) in enumerate(self.dataframes):
            before = df.shape[0]
            df_cleaned = df.drop_duplicates()
            after = df_cleaned.shape[0]
            self.dataframes[i] = (path, df_cleaned)
            print(f"\nRemove {before - after} duplicate rows from file: {path}")
            
    def drop_specific_columns(self, columns_to_drop_by_file):
        if not self.dataframes:
            print("No data to process.")
            return
        
        for i, (path, df) in enumerate(self.dataframes):
            for keyword, columns in columns_to_drop_by_file.items():
                if keyword in path:
                    existing_cols = [col for col in columns if col in df.columns]
                    df = df.drop(columns=existing_cols)
                    self.dataframes[i] = (path, df)
                    print(f"\nDropped columns from file {path}: {existing_cols if existing_cols else 'No matching columns found'}")
                    break 
    def rename_columns(self, rename_rules_by_file):
        if not self.dataframes:
            print("No data to process.")
            return

        for i, (path, df) in enumerate(self.dataframes):
            for keyword, rename_map in rename_rules_by_file.items():
                if keyword in path:
                    existing_renames = {old: new for old, new in rename_map.items() if old in df.columns}
                    df = df.rename(columns=existing_renames)
                    self.dataframes[i] = (path, df)
                    print(f"\nRenamed columns in file {path}: {existing_renames if existing_renames else 'No matching columns to rename'}")
                    break
    def add_column_code_year(self):
        if not self.dataframes:
            print("No data to process.")
            return

        for i, (path, df) in enumerate(self.dataframes):
            if 'code' not in df.columns:
                if 'sbd' in df.columns:
                    try:
                        df['code'] = df['sbd'].astype(str).str[:2]
                        df['code'] = df['code'].astype(int)
                        print(f"Add column 'code' to file: {path}")
                    except Exception as e:
                        print(f"Processing error 'code' in file {path}: {e}")
                else:
                    print(f"'sbd' column not found in file: {path}")
            else:
                print(f"'code' already exists in file: {path}")


            if 'year' not in df.columns:
                df['year'] = 2022 + i - 1
            self.dataframes[i] = (path, df)


    def check_data_column_year(self):
        if not self.dataframes:
            print("No data to process")
            return
        for path, df in self.dataframes:
            print(df["year"].unique())

    def merge_table(self):
        df_danh_sạch_hoi_dong_thi = pd.read_excel(r"C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\danh_sach_hoi_dong_thi.xlsx")
        if not self.dataframes:
            print("No data to process")
            return
        
        for i, (path, df) in enumerate(self.dataframes):
            if 'province' not in df.columns:
                 df_merged = pd.merge(df, df_danh_sạch_hoi_dong_thi, left_on='code', right_on='Mã hội đồng', how='left')
                 self.dataframes[i] = (path, df_merged)

data = NationalHighSchoolExamScore(file_list)

In [17]:
data.read_data()

  df = pd.read_csv(path)


Complete reading the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Complete reading the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Complete reading the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Complete reading the file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv


In [18]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1857877, Column numbers: 16
Columns: ['SBD', 'Tên', 'Ngày Sinh', 'Giới tính', 'Toán', 'Văn', 'Lý', 'Hoá', 'Sinh', 'Lịch Sử', 'Địa Lý', 'GDCD', 'Ngoại Ngữ', 'Year', 'code', 'province']
Duplicate Values: 4591
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 11
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ma_ngoai_ngu']
Duplicate Values: 0
-----------

In [19]:
data.remove_duplicate()


Remove 4591 duplicate rows from file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv

Remove 0 duplicate rows from file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv

Remove 0 duplicate rows from file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv

Remove 0 duplicate rows from file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv


In [20]:
data.check_data()  


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 16
Columns: ['SBD', 'Tên', 'Ngày Sinh', 'Giới tính', 'Toán', 'Văn', 'Lý', 'Hoá', 'Sinh', 'Lịch Sử', 'Địa Lý', 'GDCD', 'Ngoại Ngữ', 'Year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 11
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ma_ngoai_ngu']
Duplicate Values: 0
--------------

In [21]:
columns_to_drop = {
    "2020_2021": ["Tên", "Ngày Sinh", "Giới tính"],
    "2023": ["ma_ngoai_ngu"],
    "2024": ["ma_ngoai_ngu"]
}
data.drop_specific_columns(columns_to_drop)



Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv: ['Tên', 'Ngày Sinh', 'Giới tính']

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv: No matching columns found

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv: ['ma_ngoai_ngu']

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv: ['ma_ngoai_ngu']


In [22]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 13
Columns: ['SBD', 'Toán', 'Văn', 'Lý', 'Hoá', 'Sinh', 'Lịch Sử', 'Địa Lý', 'GDCD', 'Ngoại Ngữ', 'Year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

C

In [23]:
data.rename_columns({
    "2020_2021": {"SBD": "sbd", "Toán": "toan", "Văn": "ngu_van", "Ngoại Ngữ": "ngoai_ngu", "Lý": "vat_li", "Hoá": "hoa_hoc", 
                  "Sinh": "sinh_hoc", "Lịch Sử": "lich_su", "Địa Lý": "dia_li", "GDCD": "gdcd", "Year": "year"}
})



Renamed columns in file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv: {'SBD': 'sbd', 'Toán': 'toan', 'Văn': 'ngu_van', 'Ngoại Ngữ': 'ngoai_ngu', 'Lý': 'vat_li', 'Hoá': 'hoa_hoc', 'Sinh': 'sinh_hoc', 'Lịch Sử': 'lich_su', 'Địa Lý': 'dia_li', 'GDCD': 'gdcd', 'Year': 'year'}


In [24]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 13
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 10
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd']
Duplicate Values: 0
-----------------------------------------------

In [25]:
data.add_column_code_year()

'code' already exists in file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Add column 'code' to file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Add column 'code' to file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Add column 'code' to file: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv


In [26]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 13
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 12
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'code', 'year']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 12
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'code', 'year']
Duplicate Values: 0
---------------

In [27]:
data.check_data_column_year()

[2020 2021]
[2022]
[2023]
[2024]


In [28]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 13
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 12
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'code', 'year']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 12
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'code', 'year']
Duplicate Values: 0
---------------

In [None]:
data.merge_table()

In [30]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 13
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 15
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'code', 'year', 'Mã hội đồng', 'Tên hội đồng thi', 'Tên Tỉnh']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 15
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'cod

In [None]:
columns_to_drop = {
    "2022": ["Mã hội đồng", "Tên hội đồng thi","code"],
    "2023": ["Mã hội đồng", "Tên hội đồng thi","code"],
    "2024": ["Mã hội đồng", "Tên hội đồng thi","code"]
}
data.drop_specific_columns(columns_to_drop)


Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv: No matching columns found

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv: ['Mã hội đồng', 'Tên hội đồng thi']

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv: ['Mã hội đồng', 'Tên hội đồng thi']

Dropped columns from file C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2024.csv: ['Mã hội đồng', 'Tên hội đồng thi']


In [32]:
data.check_data()


Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_2020_2021.csv
Row numbers: 1853286, Column numbers: 13
Columns: ['sbd', 'toan', 'ngu_van', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'ngoai_ngu', 'year', 'code', 'province']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2022.csv
Row numbers: 995441, Column numbers: 13
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'code', 'year', 'Tên Tỉnh']
Duplicate Values: 0
------------------------------------------------------------

Check data: C:\FPT Polytechnic\Project Tự Làm\Điểm thi thpt 2020 - 2024\diem_thi_thpt_2023.csv
Row numbers: 1022060, Column numbers: 13
Columns: ['sbd', 'toan', 'ngu_van', 'ngoai_ngu', 'vat_li', 'hoa_hoc', 'sinh_hoc', 'lich_su', 'dia_li', 'gdcd', 'code', 'year', 'Tên Tỉnh']
Duplicate V

In [9]:
import pandas as pd

In [10]:
df_concat = pd.read_csv(r"C:\FPT Polytechnic\Graduation_Project\Data\Processed\processe.csv")
df_concat2 = pd.read_csv(r"C:\FPT Polytechnic\Graduation_Project\Data\Processed\processed.csv")
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932392 entries, 0 to 4932391
Data columns (total 12 columns):
 #   Column     Dtype  
---  ------     -----  
 0   sbd        int64  
 1   toan       float64
 2   ngu_van    float64
 3   vat_li     float64
 4   hoa_hoc    float64
 5   sinh_hoc   float64
 6   lich_su    float64
 7   dia_li     float64
 8   gdcd       float64
 9   ngoai_ngu  float64
 10  year       int64  
 11  province   object 
dtypes: float64(9), int64(2), object(1)
memory usage: 451.6+ MB


In [11]:
df_concat2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932392 entries, 0 to 4932391
Data columns (total 12 columns):
 #   Column     Dtype  
---  ------     -----  
 0   sbd        int64  
 1   toan       float64
 2   ngu_van    float64
 3   vat_li     float64
 4   hoa_hoc    float64
 5   sinh_hoc   float64
 6   lich_su    float64
 7   dia_li     float64
 8   gdcd       float64
 9   ngoai_ngu  float64
 10  year       int64  
 11  province   object 
dtypes: float64(9), int64(2), object(1)
memory usage: 451.6+ MB
