In [1]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string

# Download the NLTK resources used below (tokenizer model, WordNet, stop-word lists)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialise the lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Prepare the output folder. exist_ok=True makes this idempotent on re-runs and
# avoids the check-then-create race of a separate os.path.exists() test.
output_folder = 'preprocessed'
os.makedirs(output_folder, exist_ok=True)

# Text preprocessing function (lemmatization variant)
def preprocess_text(input_text):
    """Normalise raw text into a lowercase, space-separated string of lemmas.

    Pipeline: lowercase -> strip ASCII punctuation and digits -> tokenize ->
    drop English stop words -> lemmatize.

    Stop words are removed *before* lemmatization. Lemmatizing first mangles
    some function words (e.g. 'was' comes back as 'wa', 'has' as 'ha'), and
    the mangled forms no longer match the stop-word list, so they would leak
    into the output.

    Args:
        input_text (str): Raw document text.

    Returns:
        str: The surviving lemmas joined by single spaces; an empty string
        when no token survives.
    """
    # Lowercase up front: the WordNet lookup is case-sensitive, so a
    # capitalised token such as 'Cats' would otherwise be left unlemmatized.
    lowered_text = input_text.lower()

    # Remove punctuation and digits
    cleaned_text = lowered_text.translate(str.maketrans('', '', string.punctuation + string.digits))

    # Tokenize
    word_tokens = word_tokenize(cleaned_text)

    # Drop stop words while the tokens are still in their surface form
    filtered_tokens = [word for word in word_tokens if word not in stop_words]

    # Lemmatize what is left
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_tokens)

# Preprocess every .txt file found in the input folder
input_folder = 'dataset'
for filename in os.listdir(input_folder):
    # Anything that is not a .txt file is left untouched
    if not filename.endswith(".txt"):
        continue

    # Read the raw document; the input is decoded as ISO-8859-1
    with open(os.path.join(input_folder, filename), 'r', encoding='ISO-8859-1') as file:
        text = file.read()
    preprocessed_text = preprocess_text(text)

    # Write the result under the same file name in the output folder, as UTF-8
    output_filename = os.path.join(output_folder, filename)
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write(preprocessed_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Croya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Croya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Croya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string


# Download the NLTK resources used below (tokenizer model, WordNet, stop-word lists)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialise the lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Prepare the output folder. exist_ok=True makes this idempotent on re-runs and
# avoids the check-then-create race of a separate os.path.exists() test.
# NOTE(review): this is the same folder the previous cell writes to, and the
# loop below reuses the input file names, so running this cell overwrites the
# previous cell's results -- confirm that is intended.
output_folder = 'preprocessed'
os.makedirs(output_folder, exist_ok=True)
    


# Text preprocessing function (stemming variant).
# NOTE: this redefines the preprocess_text from the previous cell; once this
# cell has run, the lemmatization variant is no longer reachable by that name.
def preprocess_text(input_text):
    """Normalise raw text into a lowercase, space-separated string of stems.

    Pipeline: lowercase -> strip ASCII punctuation and digits -> tokenize ->
    drop English stop words -> Porter-stem.

    Stop words are removed *before* stemming. Stemming first rewrites many
    function words (e.g. 'this' -> 'thi', 'was' -> 'wa', 'very' -> 'veri'),
    and the rewritten forms no longer match the stop-word list, so they would
    leak into the output.

    Args:
        input_text (str): Raw document text.

    Returns:
        str: The surviving stems joined by single spaces; an empty string
        when no token survives.
    """
    stemmer = PorterStemmer()

    # Lowercase up front so the stop-word comparison below is a plain lookup
    lowered_text = input_text.lower()

    # Remove punctuation and digits
    cleaned_text = lowered_text.translate(str.maketrans('', '', string.punctuation + string.digits))

    # Tokenize
    word_tokens = word_tokenize(cleaned_text)

    # Drop stop words while the tokens are still in their surface form
    filtered_tokens = [word for word in word_tokens if word not in stop_words]

    # Stem what is left
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return ' '.join(stemmed_tokens)


# Preprocess every .txt file found in the input folder
input_folder = 'dataset'
for filename in os.listdir(input_folder):
    # Anything that is not a .txt file is left untouched
    if not filename.endswith(".txt"):
        continue

    # Read the raw document; the input is decoded as ISO-8859-1
    with open(os.path.join(input_folder, filename), 'r', encoding='ISO-8859-1') as file:
        text = file.read()
    preprocessed_text = preprocess_text(text)

    # Write the result under the same file name in the output folder, as UTF-8
    output_filename = os.path.join(output_folder, filename)
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write(preprocessed_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Croya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Croya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Croya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
