In [1]:
import numpy as np
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
def read_file():
    '''
    Load the normal and spam training texts from the local ``data`` folder.

    Every line of ``data/normal.txt`` is labelled 1.0 and every line of
    ``data/spam.txt`` is labelled 0.0. Lines keep their trailing newline.
    :return: (corpus, labels) - the normal lines followed by the spam lines,
             and the float labels in the same order
    '''
    with open("data/normal.txt", encoding="utf8") as ham_file:
        with open("data/spam.txt", encoding="utf8") as junk_file:
            ham_lines = list(ham_file)
            junk_lines = list(junk_file)

    texts = ham_lines + junk_lines
    # 1.0 marks a normal message, 0.0 marks spam.
    targets = [1.0] * len(ham_lines) + [0.0] * len(junk_lines)
    return texts, targets

In [3]:
def divide_datasets(corpus, labels, test_data_proportion=0.3):
    '''
    Split the corpus and its labels into a training part and a test part.
    :param corpus: the text samples
    :param labels: the labels, aligned one-to-one with corpus
    :param test_data_proportion: value handed to train_test_split as test_size
    :return: training data, test data, training labels, test labels
    '''
    # random_state is fixed so that every run produces the same partition.
    partition = train_test_split(
        corpus,
        labels,
        test_size=test_data_proportion,
        random_state=10,
    )
    train_texts, test_texts, train_targets, test_targets = partition
    return train_texts, test_texts, train_targets, test_targets

In [4]:
def bow_extractor(corpus, ngram_range=(1, 1)):
    '''
    Fit a bag-of-words (term count) model on the corpus, tokenising with jieba.
    :param corpus: the raw text documents
    :param ngram_range: (min_n, max_n) passed through to CountVectorizer
    :return: the fitted vectorizer and the extracted features.
    '''
    options = dict(min_df=1, ngram_range=ngram_range, binary=False, tokenizer=jieba.lcut)
    bow_model = CountVectorizer(**options)
    bow_matrix = bow_model.fit_transform(corpus)
    return bow_model, bow_matrix

In [5]:
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    '''
    Fit a TF-IDF model on the corpus, tokenising with jieba.

    IDF weighting is enabled without smoothing and the output uses L2 norm.
    :param corpus: the raw text documents
    :param ngram_range: (min_n, max_n) passed through to TfidfVectorizer
    :return: the fitted vectorizer and the extracted features.
    '''
    tfidf_model = TfidfVectorizer(
        min_df=1,
        norm='l2',
        smooth_idf=False,
        use_idf=True,
        ngram_range=ngram_range,
        tokenizer=jieba.lcut,
    )
    tfidf_matrix = tfidf_model.fit_transform(corpus)
    return tfidf_model, tfidf_matrix

In [6]:
def show_metrics(true_labels, predicted_labels):
    '''
    Print accuracy, precision, recall and F1, each rounded to 4 decimals.

    Precision, recall and F1 are computed with average='weighted'.
    :param true_labels: the ground-truth labels
    :param predicted_labels: the labels predicted by the classifier
    '''
    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    print('accuracy:', np.round(accuracy, 4))

    # The remaining scores share one call signature, so they are table-driven.
    weighted_scorers = (
        ('precision:', metrics.precision_score),
        ('recall:', metrics.recall_score),
        ('F1:', metrics.f1_score),
    )
    for caption, scorer in weighted_scorers:
        score = scorer(true_labels, predicted_labels, average='weighted')
        print(caption, np.round(score, 4))

In [7]:
def evaluate_model(classifier,train_features,train_labels,test_features,test_labels):
    '''
    Fit the classifier on the training data and report its test metrics.
    :param classifier: object exposing fit(features, labels) and predict(features)
    :param train_features: features used for fitting
    :param train_labels: labels used for fitting
    :param test_features: features to predict on
    :param test_labels: ground-truth labels the predictions are compared with
    :return: the predictions made for test_features
    '''
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features)
    # Prints accuracy / precision / recall / F1 for this classifier.
    show_metrics(test_labels, predicted)
    return predicted

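## Test data helper (`final_test`)
- The test data is stored in CSV format (`./data/test.csv`, column `content`).
- Reads the CSV and keeps only the Chinese characters of each entry.
- Randomly selects 10 samples from the test set (or all rows if there are fewer than 10).
- Prints the selected samples before they are passed to the classifiers.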

In [8]:
import re
import random

# Helper that keeps only the Chinese characters of a text.
def extract_chinese(text):
    '''
    Strip everything except Chinese characters from text.

    Characters in the range U+4E00..U+9FA5 are kept in their original order;
    everything else (Latin letters, digits, punctuation, whitespace) is dropped.
    :param text: the raw text; a non-string value (e.g. None, or the NaN that
                 pandas uses for an empty CSV cell) is treated as empty text
    :return: a string made only of the kept characters, '' if there are none
    '''
    if not isinstance(text, str):
        # re.findall raises TypeError on None/NaN, which would abort a whole
        # Series.apply over a column that contains a single missing value.
        return ''
    pattern = r'[\u4e00-\u9fa5]+'
    return ''.join(re.findall(pattern, text))



In [9]:
def final_test(test_path='./data/test.csv', sample_size=10, seed=None):
    '''
    Pick a few random texts from the test CSV for a manual spot check.

    The CSV must contain a ``content`` column. Each entry is reduced to its
    Chinese characters, the chosen samples are printed, and their processed
    text is returned.
    :param test_path: path of the CSV file to read
    :param sample_size: number of rows to draw; capped at the number of rows
    :param seed: optional seed for a reproducible selection; with None the
                 module-level random generator is used
    :return: list with the processed content of the selected rows
    '''
    # read test data
    df_test = pd.read_csv(test_path)
    print(f'test data shape: {df_test.shape}')

    # only keep chinese part; empty cells are read as NaN (a float), which
    # extract_chinese cannot handle, so non-string cells become ''
    df_test['processed_content'] = df_test['content'].apply(
        lambda cell: extract_chinese(cell) if isinstance(cell, str) else ''
    )

    # never ask for more rows than the file holds
    if len(df_test) < sample_size:
        sample_size = len(df_test)
        print(f'\ndatabase only have {len(df_test)} records, will use all')

    # a dedicated generator keeps a seeded run from touching global random state
    rng = random if seed is None else random.Random(seed)
    sample_indices = rng.sample(range(len(df_test)), sample_size)
    selected_samples = df_test['processed_content'].iloc[sample_indices].tolist()

    print(f'\nrandomly selected {sample_size} samples:')
    for number, content in enumerate(selected_samples, start=1):
        print(f'sample {number}:')
        print(content)
        print('-' * 50)

    return selected_samples

In [10]:
def main():
    '''
    Train and evaluate two Naive Bayes spam classifiers, then try them out.

    One classifier is trained on bag-of-words features and one on TF-IDF
    features. Both are scored on the held-out split, and finally both
    predict labels for the random samples returned by final_test().
    '''
    # prepare training data set
    corpus, labels = read_file()
    train_X, test_X, train_Y, test_Y = divide_datasets(corpus, labels)

    # convert training data into vectors in Bow;
    # the test data is transformed with the vectorizer fitted on the training data
    bow_vectorizer, bow_train_features = bow_extractor(train_X)
    bow_test_features = bow_vectorizer.transform(test_X)

    # convert training data into vectors in Tfidf
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(train_X)
    tfidf_test_features = tfidf_vectorizer.transform(test_X)

    # construct the Naive Bayes classifier objects for Bow and Tfidf
    mnb_bow = MultinomialNB()
    mnb_tfidf = MultinomialNB()

    # train the Bow model and evaluate it
    print("Naive Bayes classifier with BOW:")
    evaluate_model(classifier=mnb_bow,
                   train_features=bow_train_features,
                   train_labels=train_Y,
                   test_features=bow_test_features,
                   test_labels=test_Y)

    # train the Tfidf model and evaluate it
    print("Naive Bayes classifier with Tfidf:")
    evaluate_model(classifier=mnb_tfidf,
                   train_features=tfidf_train_features,
                   train_labels=train_Y,
                   test_features=tfidf_test_features,
                   test_labels=test_Y)

    # enter some new data to get a feel for the accuracy of the program
    # (named new_samples so the builtin input() is not shadowed)
    new_samples = final_test()

    bow_input_features = bow_vectorizer.transform(new_samples)
    tfidf_input_features = tfidf_vectorizer.transform(new_samples)
    # labels follow read_file(): 1.0 = normal, 0.0 = spam
    print('bow input predictions: ',mnb_bow.predict(bow_input_features))
    print('tfidf input predictions: ',mnb_tfidf.predict(tfidf_input_features))
    
# Run the whole pipeline: train, evaluate, then classify random test samples.
main()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/07/3p41fxw138b_lz7hdy4spqjh0000gn/T/jieba.cache
Loading model cost 1.010 seconds.
Prefix dict has been built successfully.


Naive Bayes classifier with BOW:
accuracy: 0.9897
precision: 0.9897
recall: 0.9897
F1: 0.9897
Naive Bayes classifier with Tfidf:
accuracy: 0.9863
precision: 0.9864
recall: 0.9863
F1: 0.9863
test data shape: (12924, 1)

randomly selected 10 samples:
sample 1:
以下不能正确显示请点此
--------------------------------------------------
sample 2:
提你希望得到的工资水平呵呵这方面我也没有经验犯错了仅仅第二天下午我就接到了电话通知面试时间这次换了个声音很好听人也很客气之后的所有事情都是与他联系的可惜最后还是只闻其声未曾谋面面试就在几天后我上网看了看智联招聘的文章其中有很多英语面试问题集锦看的时候没过脑子面试的时候蔡发现怎么这么像啊当时就了没好好准备到了亮马大厦层发现已经有两个在等了着装都很随意其中一个穿短花裙上面好多褶子头发也没有梳理好还穿着很要命的肉色短丝袜
--------------------------------------------------
sample 3:
活动安排香山邮局周六晚三岔口西扎营小游戏睡觉周日自然醒早餐或午餐老望京植物园约下午活动级别费用自己的交通费以及气罐的均摊费用共约元左右食物和饮水早餐午餐周六晚上的小水最少吧背不动了倒掉背上去了洗脸刷牙也不用太省了装备要求帐篷报名时注明个数和大小炉头自备气罐费用头灯登山鞋长衣长裤等最好带登山杖打草惊蛇用一些常见药品请参见其他负重活动的帖子免责声明任何户外活动都存在风险参加者请熟读绿野公约常见问答里面有出了任何事情皆与他人无关秉承绿野自助精神活动中请不要随便帮助他人或轻易请求他人的帮助报名方式跟帖或私信报名可以不留电话但请注明性别帐篷和炉头气罐的情况加注只在扎营地或者在废弃窑洞生火其他时间禁止开火全程禁止吸烟请自带垃圾袋收走不可降解的个人垃圾迟到者自行追赶或单独行动返回参考链接
--------------------------------------------------
samp