In [13]:
import functools
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# 1. Load the data
file_path = 'clean_data.csv'  # replace with the path to your file
clean_data = pd.read_csv(file_path)

# 2. Create the One-Hot columns, or confirm that they are already present
if "NEU_Colleges" not in clean_data.columns:
    # No raw column to encode: the encoded columns must already exist
    one_hot_columns = [
        name for name in clean_data.columns if name.startswith("NEU_Colleges_")
    ]
    if len(one_hot_columns) == 0:
        raise ValueError("Neither the 'NEU_Colleges' column nor its One-Hot encoded columns were found!")
else:
    # Raw column is present: encode it (drop_first=True omits the first category)
    clean_data = pd.get_dummies(
        clean_data,
        columns=["NEU_Colleges"],
        prefix="NEU_Colleges",
        drop_first=True,
    )
    print("Generated One-Hot encoding columns.")

# Report the One-Hot columns that are now in the frame
print(f"One-Hot columns: {[col for col in clean_data.columns if col.startswith('NEU_Colleges_')]}")

# 3. Dynamically identify the target text columns (tags and comments)
# One-Hot indicator columns are excluded up front: a category label that merely
# contains "tag" or "comment" (e.g. "...Heritage...") must not be treated as
# free text, otherwise its boolean values would be turned into strings.
one_hot_prefix = "NEU_Colleges_"
text_candidates = [col for col in clean_data.columns if not col.startswith(one_hot_prefix)]
actual_tags_column = [col for col in text_candidates if 'tag' in col.lower()]
actual_comments_column = [col for col in text_candidates if 'comment' in col.lower()]

# Final target list: tags first, then comments. dict.fromkeys() drops repeats
# while preserving order, so a column whose name matches both keywords is only
# listed (and later preprocessed/printed) once.
target_columns = list(dict.fromkeys(actual_tags_column + actual_comments_column))
if not target_columns:
    raise ValueError("No columns related to 'tags' or 'comments' were found in the data!")

print(f"Identified target columns for preprocessing: {target_columns}")

# 4. Lower-case and clean the text of the target columns

# Translation table that deletes every ASCII punctuation character.
# Built once at definition time because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text, stop_words=None):
    """Normalize a piece of text for downstream analysis.

    The text is lower-cased, ASCII punctuation (``string.punctuation``) is
    removed, the result is split on whitespace, and stop words are dropped.
    The remaining words are joined with single spaces.

    Args:
        text: The value to clean. ``None`` and float NaN are treated as
            missing and yield ``''``; any other non-string value is converted
            with ``str()`` first.
        stop_words: Optional container of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached, so applying this function row by row does not re-read
            the corpus on every call).

    Returns:
        The cleaned text as a single space-separated string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        stop_words = _english_stop_words()

    text = text.lower()  # convert to lower case
    text = text.translate(_PUNCTUATION_TABLE)  # strip punctuation
    words = [word for word in text.split() if word not in stop_words]  # drop stop words
    return ' '.join(words)

# Apply the preprocessing to the target columns only
for col in target_columns:
    # Fill missing values BEFORE casting to str. Casting first turns NaN into
    # the literal string 'nan', which fillna('') can no longer detect, so
    # missing cells would end up containing the bogus token "nan".
    clean_data[col] = clean_data[col].fillna('').astype(str).apply(preprocess_text)

# Inspect the result for the target columns
print("\nAfter preprocessing target columns:")
print(clean_data[target_columns].head())

# 5. Save the final result
output_file = 'clean_data_with_tags_comments_and_one_hot.csv'
clean_data.to_csv(output_file, index=False)

print(f"\nProcessed data saved to '{output_file}'.")
clean_data.head()

[nltk_data] Downloading package stopwords to /Users/fandu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Generated One-Hot encoding columns.
One-Hot columns: ['NEU_Colleges_College of Arts, Media, and Design (CAMD)', 'NEU_Colleges_College of Engineering (COE)', 'NEU_Colleges_College of Professional Studies (CPS)', 'NEU_Colleges_College of Science (COS)', 'NEU_Colleges_College of Social Sciences and Humanities (CSSH)', 'NEU_Colleges_D’Amore-McKim School of Business (DMSB)', 'NEU_Colleges_Khoury College of Computer Sciences', 'NEU_Colleges_School of Law']
Identified target columns for preprocessing: ['Popular Tags']

After preprocessing target columns:
                                        Popular Tags
0  amazing lectures inspirational accessible outs...
1  tough grader participation matters group proje...
2         group projects lots homework lecture heavy
3  respected caring amazing lectures would take i...
4  get ready read hilarious lots homework tough g...

Processed data saved to 'clean_data_with_tags_comments_and_one_hot.csv'.


Unnamed: 0,First Name,Middle Name,Last Name,ID,Department,Institution Name,Institution ID,Number of Ratings,Average Rating (Out of 5),Would Take Again (Percent),...,Popular Tags,Reviews,"NEU_Colleges_College of Arts, Media, and Design (CAMD)",NEU_Colleges_College of Engineering (COE),NEU_Colleges_College of Professional Studies (CPS),NEU_Colleges_College of Science (COS),NEU_Colleges_College of Social Sciences and Humanities (CSSH),NEU_Colleges_D’Amore-McKim School of Business (DMSB),NEU_Colleges_Khoury College of Computer Sciences,NEU_Colleges_School of Law
0,Anand,,Asthagiri,2894891,Biomedical,Northeastern University,696,1.0,5.0,1.0,...,amazing lectures inspirational accessible outs...,Attendance Clarity (color) Easy (color) ...,False,False,False,True,False,False,False,False
1,Faizul,,Huq,2854044,Business,Northeastern University,696,7.0,2.6,0.43,...,tough grader participation matters group proje...,Attendance Clarity (color) Easy (color) ...,False,False,False,False,False,True,False,False
2,sakib,,miazi,2989886,Computer Science,Northeastern University,696,1.0,1.0,0.0,...,group projects lots homework lecture heavy,Attendance Clarity (color) Easy (color) ...,False,False,False,False,False,False,True,False
3,Mohammad,,Abderrazzaq,1268571,Arabic,Northeastern University,696,20.0,4.5,1.0,...,respected caring amazing lectures would take i...,Attendance Clarity (color) Easy (color) Hel...,False,False,False,False,True,False,False,False
4,Mehdi,,Abedi,2162443,Engineering,Northeastern University,696,21.0,3.5,0.62,...,get ready read hilarious lots homework tough g...,Attendance Clarity (color) Easy (color)...,False,True,False,False,False,False,False,False
