In [12]:
import pandas as pd
import numpy as np

faculty = pd.read_csv("../data_raw/faculty_info.csv")
survey = pd.read_csv("../data_raw/course_survey.csv")
comments = pd.read_csv("../data_raw/student_comments.csv")

faculty.head(), survey.head(), comments.head()


(  faculty_id      full_name     department
 0     GV100     pham  thi d  khoa kinh  te
 1      gv101  Hoang   van e   Khoa Kinh tế
 2     GV102     LE   VAN  C    Bộ môn Toán
 3      GV103  Hoang   van e  khoa kinh  te
 4      gv104    pham  thi d      Khoa CNTT,
   survey_id faculty_id course_code student_level rating_score survey_term
 0   SRV1000      GV106       DA200        nam  2         3.5      2024-2 
 1   SRV1001      GV120      CT 102    Liên thông          7.0   HK1- 2024
 2   SRV1002      GV111       CT101    Liên thông         3.5    HK1- 2024
 3   SRV1003      gv123       CT101           K18          7.0     2024-2 
 4   SRV1004      GV105       da200         Năm 1         3.5       2024-1,
   survey_id            comment_text sentiment_label
 0   SRV1038    Nội dung mon hoc kho        Tich cuc
 1   SRV1044   Thay co  cham bai ky        tich  cuc
 2   SRV1079  Giang vien day de hieu       tich  cuc
 3   SRV1014     Bài giảng hơi nhanh       tich  cuc
 4   SRV1099    Nội

In [13]:
def clean_faculty_id(x):
    if pd.isna(x):
        return np.nan
    return str(x).strip().upper()

def clean_text(x):
    if pd.isna(x):
        return np.nan
    return " ".join(str(x).strip().title().split())

def clean_department(x):
    if pd.isna(x):
        return np.nan

    x = str(x).lower().strip()

    mapping = {
        "cntt": "Khoa CNTT",
        "cong nghe thong tin": "Khoa CNTT",
        "kt": "Khoa Kinh Tế",
        "kinh te": "Khoa Kinh Tế",
        "toan": "Bộ Môn Toán",
    }

    for k, v in mapping.items():
        if k in x:
            return v
    
    return x.title()

faculty["faculty_id"] = faculty["faculty_id"].apply(clean_faculty_id)
faculty["full_name"] = faculty["full_name"].apply(clean_text)
faculty["department"] = faculty["department"].apply(clean_department)

faculty.head()


Unnamed: 0,faculty_id,full_name,department
0,GV100,Pham Thi D,Khoa Kinh Te
1,GV101,Hoang Van E,Khoa Kinh Tế
2,GV102,Le Van C,Bộ Môn Toán
3,GV103,Hoang Van E,Khoa Kinh Te
4,GV104,Pham Thi D,Khoa CNTT


In [14]:
def clean_course_code(x):
    if pd.isna(x):
        return np.nan
    return str(x).replace(" ", "").upper()

def clean_student_level(x):
    if pd.isna(x):
        return np.nan
    x = str(x).strip().upper()
    
    mapping = {
        "K17": "K17",
        "K18": "K18",
        "NAM1": "Năm 1",
        "NAM2": "Năm 2",
        "LIENTHONG": "Liên thông"
    }

    for k, v in mapping.items():
        if k in x:
            return v

    return x.title()

def clean_rating(x):
    try:
        x = str(x).replace(",", ".")
        x = float(x)

        # Nếu thang 10 → chuyển về thang 5
        if x > 10:
            return np.nan
        if x > 5:
            return round(x / 2, 2)
        return x

    except:
        return np.nan

def clean_term(x):
    if pd.isna(x):
        return np.nan
    
    x = str(x).replace(" ", "")

    if "-" in x:
        parts = x.split("-")
        if len(parts) == 2 and parts[0].isdigit():
            return f"HK{parts[0]}-{parts[1]}"

    return x

survey["faculty_id"] = survey["faculty_id"].apply(clean_faculty_id)
survey["course_code"] = survey["course_code"].apply(clean_course_code)
survey["student_level"] = survey["student_level"].apply(clean_student_level)
survey["rating_score"] = survey["rating_score"].apply(clean_rating)
survey["survey_term"] = survey["survey_term"].apply(clean_term)

survey.head()


Unnamed: 0,survey_id,faculty_id,course_code,student_level,rating_score,survey_term
0,SRV1000,GV106,DA200,Nam 2,3.5,HK2024-2
1,SRV1001,GV120,CT102,Liên Thông,3.5,HK1-2024
2,SRV1002,GV111,CT101,Liên Thông,3.5,HK1-2024
3,SRV1003,GV123,CT101,K18,3.5,HK2024-2
4,SRV1004,GV105,DA200,Năm 1,3.5,HK2024-1


In [15]:
def clean_sentiment(x):
    if pd.isna(x):
        return np.nan

    x = str(x).lower().strip()

    if x in ["tichcuc", "positive", "pos", "tc"]:
        return "Tích cực"
    if x in ["tieucuc", "negative", "neg", "nc"]:
        return "Tiêu cực"
    if x in ["trunglap", "neutral", "tb"]:
        return "Trung lập"

    return x.title()

comments["sentiment_label"] = comments["sentiment_label"].apply(clean_sentiment)
comments["comment_text"] = comments["comment_text"].fillna("").str.strip()

comments.head()


Unnamed: 0,survey_id,comment_text,sentiment_label
0,SRV1038,Nội dung mon hoc kho,Tich Cuc
1,SRV1044,Thay co cham bai ky,Tich Cuc
2,SRV1079,Giang vien day de hieu,Tich Cuc
3,SRV1014,Bài giảng hơi nhanh,Tich Cuc
4,SRV1099,Nội dung mon hoc kho,Tich Cuc


In [16]:
survey = survey.dropna(subset=["rating_score"])

comments["sentiment_label"] = comments["sentiment_label"].fillna("Không xác định")


In [17]:
faculty.to_csv("../data_clean/faculty_info_clean.csv", index=False)
survey.to_csv("../data_clean/course_survey_clean.csv", index=False)
comments.to_csv("../data_clean/student_comments_clean.csv", index=False)
