In [10]:
import pandas as pd
import numpy as np
import re

# Read the three raw CSV exports (faculty roster, course survey, student comments)
# in one pass; the tuple order matches the unpacking order on the left.
faculty, survey, comments = map(
    pd.read_csv,
    (
        "../data_raw/faculty_info.csv",
        "../data_raw/course_survey.csv",
        "../data_raw/student_comments.csv",
    ),
)

# Preview the first rows of each frame as a tuple (cell output).
tuple(frame.head() for frame in (faculty, survey, comments))


(  faculty_id      full_name     department
 0     GV100     pham  thi d  khoa kinh  te
 1      gv101  Hoang   van e   Khoa Kinh tế
 2     GV102     LE   VAN  C    Bộ môn Toán
 3      GV103  Hoang   van e  khoa kinh  te
 4      gv104    pham  thi d      Khoa CNTT,
   survey_id faculty_id course_code student_level rating_score survey_term
 0   SRV1000      GV106       DA200        nam  2         3.5      2024-2 
 1   SRV1001      GV120      CT 102    Liên thông          7.0   HK1- 2024
 2   SRV1002      GV111       CT101    Liên thông         3.5    HK1- 2024
 3   SRV1003      gv123       CT101           K18          7.0     2024-2 
 4   SRV1004      GV105       da200         Năm 1         3.5       2024-1,
   survey_id            comment_text sentiment_label
 0   SRV1038    Nội dung mon hoc kho        Tich cuc
 1   SRV1044   Thay co  cham bai ky        tich  cuc
 2   SRV1079  Giang vien day de hieu       tich  cuc
 3   SRV1014     Bài giảng hơi nhanh       tich  cuc
 4   SRV1099    Nội

In [11]:
def normalize_text(x):
    """Collapse whitespace in a single value.

    Parameters
    ----------
    x : any scalar
        Value to clean; non-string values are converted with ``str``.

    Returns
    -------
    str or float
        ``np.nan`` when ``x`` is missing according to ``pd.isna``; otherwise the
        text with every run of whitespace replaced by one space and with
        leading/trailing whitespace removed.
    """
    if pd.isna(x):
        return np.nan
    # Collapse first, then trim: any leading/trailing run is a single space by then.
    return re.sub(r"\s+", " ", str(x)).strip()

In [12]:
# Canonicalize faculty IDs: upper-case, then drop every character outside A-Z / 0-9
# (e.g. " gv101 " -> "GV101").
# astype(str) turns a missing ID into the literal text "nan", which would survive
# as the bogus ID "NAN"; the final .where() puts NaN back for originally-missing rows.
faculty["faculty_id"] = (
    faculty["faculty_id"]
    .astype(str)
    .str.upper()
    .str.strip()
    .str.replace(r"[^A-Z0-9]", "", regex=True)
    .where(faculty["faculty_id"].notna())
)

# Collapse stray whitespace in names, then Title-Case each word
# (e.g. "pham  thi d" -> "Pham Thi D"). Missing names stay NaN.
faculty["full_name"] = (
    faculty["full_name"]
    .apply(normalize_text)
    .str.title()
)

def clean_department(x):
    """Map a raw department label onto its canonical name.

    Matching is done on the whitespace-normalized, lower-cased text and accepts
    both the accented and the unaccented Vietnamese spelling of each keyword, so
    "khoa kinh  te" and "Khoa Kinh tế" end up with the same label.

    Parameters
    ----------
    x : any scalar
        Raw department value.

    Returns
    -------
    str or float
        ``np.nan`` for missing input; "Khoa CNTT", "Khoa Kinh Tế" or
        "Bộ Môn Toán" when a known keyword is found; otherwise the normalized
        text in Title Case.
    """
    if pd.isna(x):
        return np.nan

    x = normalize_text(x).lower()

    # A bare "cntt" is accepted the same way the bare abbreviation "kt" is below;
    # without it the fallback would produce "Cntt".
    if (
        x == "cntt"
        or "khoa cntt" in x
        or "cong nghe thong tin" in x
        or "công nghệ thông tin" in x
    ):
        return "Khoa CNTT"
    if "kinh te" in x or "kinh tế" in x or x == "kt":
        return "Khoa Kinh Tế"
    if "toan" in x or "toán" in x:
        return "Bộ Môn Toán"

    # Unknown department: keep the text, only normalize its casing.
    return x.title()

# Canonicalize every department label, one value at a time.
faculty["department"] = faculty["department"].map(clean_department)

# Preview the cleaned faculty table (cell output).
faculty.head()


Unnamed: 0,faculty_id,full_name,department
0,GV100,Pham Thi D,Khoa Kinh Tế
1,GV101,Hoang Van E,Khoa Kinh Tế
2,GV102,Le Van C,Bộ Môn Toán
3,GV103,Hoang Van E,Khoa Kinh Tế
4,GV104,Pham Thi D,Khoa CNTT


In [13]:
# Canonicalize faculty IDs the same way as in the faculty table: upper-case and
# keep only A-Z / 0-9 (e.g. "gv123" -> "GV123").
# astype(str) turns a missing ID into the text "nan" (-> "NAN" after upper-casing),
# so originally-missing rows are masked back to NaN at the end.
survey["faculty_id"] = (
    survey["faculty_id"]
    .astype(str)
    .str.upper()
    .str.strip()
    .str.replace(r"[^A-Z0-9]", "", regex=True)
    .where(survey["faculty_id"].notna())
)

# Course codes: upper-case and remove all inner whitespace ("CT 102" -> "CT102").
# The pattern is an explicit regex so tabs and repeated spaces are removed as well,
# and the result does not depend on the pandas default for `regex`.
# Missing codes stay NaN instead of becoming the text "NAN".
survey["course_code"] = (
    survey["course_code"]
    .astype(str)
    .str.upper()
    .str.replace(r"\s+", "", regex=True)
    .where(survey["course_code"].notna())
)

def clean_student_level(x):
    """Map a raw student-level label onto a canonical category.

    The text is whitespace-normalized and lower-cased before matching, so every
    keyword below is written in lower case. Accented and unaccented Vietnamese
    spellings are both accepted.

    Parameters
    ----------
    x : any scalar
        Raw student level value.

    Returns
    -------
    str or float
        ``np.nan`` for missing input; "Năm 1", "Năm 2" or "Liên Thông" when a
        known keyword is found; otherwise the normalized text in Title Case.
    """
    if pd.isna(x):
        return np.nan
    x = normalize_text(x).lower()

    if "nam 1" in x or "năm 1" in x:
        return "Năm 1"
    if "nam 2" in x or "năm 2" in x:
        return "Năm 2"
    if "lien thong" in x or "liên thông" in x:
        return "Liên Thông"
    # K18 / K17 are mapped to year 1 / year 2 (presumably intake cohort codes —
    # verify the mapping whenever the survey year changes).
    if "k18" in x:
        return "Năm 1"
    if "k17" in x:
        return "Năm 2"

    # Unknown level: keep the text, only normalize its casing.
    return x.title()

# Collapse the raw student_level labels onto the canonical categories, value by value.
survey["student_level"] = survey["student_level"].map(clean_student_level)


In [14]:
def clean_rating(x):
    """Parse a raw rating and bring it onto a 0-5 scale.

    Parameters
    ----------
    x : any scalar
        Raw rating; a decimal comma is accepted ("4,5").

    Returns
    -------
    float
        ``np.nan`` when the value is missing, is not a plain non-negative
        decimal number, is 0, or is greater than 10. Values in (5, 10] are
        halved; everything else is kept. The result is rounded to 2 decimals.
    """
    if pd.isna(x):
        return np.nan

    text = str(x).strip().replace(",", ".")

    # Only plain digits with an optional fractional part are accepted;
    # signs, units or fractions such as "4/5" are rejected.
    if re.match(r"^\d+(\.\d+)?$", text) is None:
        return np.nan

    score = float(text)

    if score <= 0 or score > 10:
        return np.nan

    # Scores above 5 are treated as being on a 10-point scale.
    if score > 5:
        score = score / 2

    return round(score, 2)

# Parse and rescale every raw rating, value by value.
survey["rating_score"] = survey["rating_score"].map(clean_rating)

# Summary statistics of the cleaned ratings (cell output).
survey["rating_score"].describe()


count    120.000000
mean       4.062500
std        0.493923
min        3.500000
25%        3.500000
50%        4.000000
75%        4.500000
max        5.000000
Name: rating_score, dtype: float64

In [15]:
def clean_term(x):
    """Normalize a survey term label to the form ``HK<semester>-<year>``.

    Two input layouts are recognized at the start of the text:

    * year first:     "2024-2", "2024/1", "2024 2"          -> "HK2-2024", ...
    * semester first: "HK1-2024", "hk1- 2024", "HK 1 / 2024" -> "HK1-2024"

    Parameters
    ----------
    x : any scalar
        Raw term value.

    Returns
    -------
    str or float
        ``np.nan`` for missing input; the canonical "HK<d>-<yyyy>" label when
        one of the layouts matches; otherwise the whitespace-normalized text
        unchanged.
    """
    if pd.isna(x):
        return np.nan

    x = normalize_text(x)

    # Year first. The (?!\d) guard keeps a year range such as "2024-2025" from
    # being read as semester 2 of 2024.
    m = re.match(r"(\d{4})[-_/ ]*(\d)(?!\d)", x)
    if m:
        return f"HK{m.group(2)}-{m.group(1)}"

    # Semester first, in any letter case and with optional spaces/separators
    # around the dash ("HK1- 2024" occurs in the raw data).
    m = re.match(r"HK\s*(\d)[-_/\s]*(\d{4})(?!\d)", x, flags=re.IGNORECASE)
    if m:
        return f"HK{m.group(1)}-{m.group(2)}"

    # Unrecognized layout: return the normalized text as is.
    return x

# Bring every survey term label onto the canonical form, value by value.
survey["survey_term"] = survey["survey_term"].map(clean_term)


In [16]:
# Missing comments become empty strings first, then whitespace is collapsed in
# every comment; the column therefore contains no NaN afterwards.
comments["comment_text"] = comments["comment_text"].fillna("").map(normalize_text)

def clean_sentiment(x):
    """Map a raw sentiment label onto one of four canonical Vietnamese labels.

    The text is whitespace-normalized and lower-cased, then compared for exact
    equality against the accepted spellings (accented Vietnamese, unaccented
    Vietnamese, English, and a short English form) of each class.

    Parameters
    ----------
    x : any scalar
        Raw sentiment label.

    Returns
    -------
    str
        "Tích cực", "Tiêu cực" or "Trung lập" for a recognized label;
        "Không xác định" for missing or unrecognized input.
    """
    if pd.isna(x):
        return "Không xác định"

    x = normalize_text(x).lower()

    if x in ("tich cuc", "tích cực", "positive", "pos"):
        return "Tích cực"
    if x in ("tieu cuc", "tiêu cực", "negative", "neg"):
        return "Tiêu cực"
    # The accented spelling and the short form are accepted here too, mirroring
    # the positive/negative lists above.
    if x in ("trung lap", "trung lập", "neutral", "neu"):
        return "Trung lập"

    return "Không xác định"

# Canonicalize every sentiment label, value by value.
comments["sentiment_label"] = comments["sentiment_label"].map(clean_sentiment)

# Class distribution after cleaning (cell output).
comments["sentiment_label"].value_counts()


sentiment_label
Tích cực          36
Không xác định    26
Tiêu cực          19
Trung lập          9
Name: count, dtype: int64

In [17]:

# Keep only the survey rows that still have a rating after cleaning; the copy
# makes survey_clean independent of the original frame.
survey_clean = survey[survey["rating_score"].notna()].copy()

# Row/column counts before and after the filter.
print("Trước:", survey.shape)
print("Sau:", survey_clean.shape)


Trước: (120, 6)
Sau: (120, 6)


In [18]:
# Write the cleaned tables without the pandas index column.
faculty.to_csv("../data_clean/faculty_info_clean.csv", index=False)
# Export survey_clean (rows without a valid rating removed), not the unfiltered
# `survey` frame; otherwise the filtering step has no effect on the output file.
survey_clean.to_csv("../data_clean/course_survey_clean.csv", index=False)
comments.to_csv("../data_clean/student_comments_clean.csv", index=False)
