In [1]:
import numpy as np
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
def read_file(normal_path="data/normal.txt", spam_path="data/spam.txt"):
    '''
    Read the training data files from the local disk.

    Every line of a file is one sample. Lines are returned exactly as
    ``readlines()`` yields them, i.e. including their trailing line terminator.

    :param normal_path: path of the UTF-8 file holding the normal samples
    :param spam_path: path of the UTF-8 file holding the spam samples
    :return: training data, labels -- normal samples come first and are
             labelled 1.0, spam samples follow and are labelled 0.0
    '''
    with open(normal_path, encoding="utf8") as normal_f, open(spam_path, encoding="utf8") as spam_f:
        normal_data = normal_f.readlines()
        spam_data = spam_f.readlines()

    # Labels are plain Python floats, aligned index by index with the corpus.
    corpus = normal_data + spam_data
    labels = [1.0] * len(normal_data) + [0.0] * len(spam_data)

    return corpus, labels

In [3]:
def divide_datasets(corpus, labels, test_data_proportion=0.3):
    '''
    Split the samples and their labels into a training part and a test part.

    :param corpus: the samples to split
    :param labels: the labels, aligned with ``corpus``
    :param test_data_proportion: value forwarded as ``test_size`` to ``train_test_split``
    :return: training data, test data, training labels, test labels
    '''
    # The random_state is pinned so that every run produces the same split.
    split_parts = train_test_split(
        corpus,
        labels,
        test_size=test_data_proportion,
        random_state=10,
    )
    train_docs, test_docs, train_targets, test_targets = split_parts
    return train_docs, test_docs, train_targets, test_targets

In [4]:
def bow_extractor(corpus, ngram_range=(1, 2)):
    '''
    Fit a bag-of-words ``CountVectorizer`` on ``corpus`` and vectorize it.

    :param corpus: the documents the vectorizer is fitted on
    :param ngram_range: n-gram range handed to the vectorizer
    :return: the fitted vectorizer and the extracted features
    '''
    bow_options = dict(
        min_df=1,
        ngram_range=ngram_range,
        binary=False,
        tokenizer=jieba.lcut,  # word segmentation is delegated to jieba
    )
    bow_vectorizer = CountVectorizer(**bow_options)
    bow_features = bow_vectorizer.fit_transform(corpus)
    return bow_vectorizer, bow_features

In [5]:
def tfidf_extractor(corpus, ngram_range=(1, 2)):
    '''
    Fit a ``TfidfVectorizer`` on ``corpus`` and vectorize it.

    The vectorizer is configured with l2 normalisation, idf weighting
    enabled and idf smoothing disabled.

    :param corpus: the documents the vectorizer is fitted on
    :param ngram_range: n-gram range handed to the vectorizer
    :return: the fitted vectorizer and the extracted features
    '''
    tfidf_options = dict(
        min_df=1,
        norm='l2',
        smooth_idf=False,
        use_idf=True,
        ngram_range=ngram_range,
        tokenizer=jieba.lcut,  # word segmentation is delegated to jieba
    )
    tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
    tfidf_features = tfidf_vectorizer.fit_transform(corpus)
    return tfidf_vectorizer, tfidf_features

In [6]:
def show_metrics(true_labels, predicted_labels):
    '''
    Print accuracy, precision, recall and F1 for a set of predictions.

    Precision, recall and F1 use the 'weighted' average; every score is
    rounded to 4 decimals before it is printed.

    :param true_labels: the expected labels
    :param predicted_labels: the labels produced by the classifier
    '''
    weighted = {'average': 'weighted'}
    # (caption, scoring function, extra keyword arguments), in print order.
    report = (
        ('accuracy:', metrics.accuracy_score, {}),
        ('precision:', metrics.precision_score, weighted),
        ('recall:', metrics.recall_score, weighted),
        ('F1:', metrics.f1_score, weighted),
    )
    for caption, scorer, options in report:
        score = scorer(true_labels, predicted_labels, **options)
        print(caption, np.round(score, 4))

In [7]:
def evaluate_model(classifier,train_features,train_labels,test_features,test_labels):
    '''
    Fit ``classifier`` on the training set and score it on the test set.

    :param classifier: object exposing ``fit`` and ``predict``
    :param train_features: features the classifier is fitted on
    :param train_labels: labels the classifier is fitted on
    :param test_features: features the predictions are made for
    :param test_labels: expected labels for ``test_features``
    :return: the predictions made for ``test_features``
    '''
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features)
    # The scores are printed as a side effect; only the predictions are returned.
    show_metrics(test_labels, predicted)
    return predicted

## Test Data Function
- The test data is in CSV format
- Reads the CSV and extracts the Chinese content
- Randomly selects 10 samples from the test set
- Displays the test samples before testing

In [8]:
import re
import random

# Helper that keeps only the Chinese content of a text.
def extract_chinese(text):
    '''
    Return the characters of ``text`` that fall in the range U+4E00..U+9FA5,
    concatenated in their original order.

    :param text: the text to filter; anything that is not a ``str`` (for
                 example ``None`` or the NaN pandas uses for a missing cell)
                 is treated as having no content
    :return: the kept characters as one string, '' when there are none
    '''
    # re.findall raises TypeError on non-string input, which would abort a
    # whole DataFrame.apply over a column containing a missing value.
    if not isinstance(text, str):
        return ''
    pattern = r'[\u4e00-\u9fa5]+'
    return ''.join(re.findall(pattern, text))



In [9]:
def final_test(test_path='./data/test.csv', sample_size=10, seed=None):
    '''
    Draw a random sample of test messages, print them and return them.

    The CSV file is expected to have a ``content`` column. Each cell is
    reduced to its Chinese characters with ``extract_chinese`` before sampling.

    :param test_path: path of the CSV file holding the test data
    :param sample_size: number of samples to draw; capped at the number of rows
    :param seed: optional seed making the draw reproducible; when None the
                 module-level ``random`` generator is used
    :return: list with the processed text of the selected samples
    '''
    # read test data
    df_test = pd.read_csv(test_path)
    print(f'test data shape: {df_test.shape}')

    # only keep chinese part; missing cells are read as NaN, so blank them
    # first to hand extract_chinese nothing but strings
    processed = df_test['content'].fillna('').astype(str).apply(extract_chinese)

    # never ask for more samples than there are rows
    if len(processed) < sample_size:
        sample_size = len(processed)
        print(f'\ndatabase only have {len(df_test)} records, will use all')

    rng = random if seed is None else random.Random(seed)
    sample_positions = rng.sample(range(len(processed)), sample_size)
    selected_samples = processed.iloc[sample_positions].tolist()

    print(f'\nrandomly selected {sample_size} samples:')
    for number, sample in enumerate(selected_samples, start=1):
        print(f'sample {number}:')
        print(sample)
        print('-' * 50)

    return selected_samples

In [10]:
def main():
    '''
    Train and evaluate two Naive Bayes classifiers, then try them on new data.

    One classifier is trained on bag-of-words features and one on Tfidf
    features. Both are scored on the held-out split, and finally both predict
    the labels of the samples returned by ``final_test``.
    '''
    # prepare the training data set
    corpus, labels = read_file()
    train_X, test_X, train_Y, test_Y = divide_datasets(corpus, labels)

    # (title used in the report, tag used for the final predictions, extractor)
    pipelines = (
        ('BOW', 'bow', bow_extractor),
        ('Tfidf', 'tfidf', tfidf_extractor),
    )

    fitted = []
    for title, tag, extractor in pipelines:
        # fit the vectorizer on the training data only, then reuse it for the test data
        vectorizer, train_features = extractor(train_X)
        test_features = vectorizer.transform(test_X)

        # train the model and evaluate it
        classifier = MultinomialNB()
        print(f"Naive Bayes classifier with {title}:")
        evaluate_model(classifier=classifier,
                       train_features=train_features,
                       train_labels=train_Y,
                       test_features=test_features,
                       test_labels=test_Y)
        fitted.append((tag, vectorizer, classifier))

    # enter some new data to get a feel for the accuracy of the program
    # (named new_samples so the builtin input() is not shadowed)
    new_samples = final_test()

    for tag, vectorizer, classifier in fitted:
        new_features = vectorizer.transform(new_samples)
        print(f'{tag} input predictions: ', classifier.predict(new_features))

main()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/07/3p41fxw138b_lz7hdy4spqjh0000gn/T/jieba.cache
Loading model cost 1.015 seconds.
Prefix dict has been built successfully.


Naive Bayes classifier with BOW:
accuracy: 0.995
precision: 0.995
recall: 0.995
F1: 0.995
Naive Bayes classifier with Tfidf:
accuracy: 0.99
precision: 0.9901
recall: 0.99
F1: 0.99
test data shape: (12924, 1)

randomly selected 10 samples:
sample 1:
您好很高兴也很感谢您能打开并阅读这封信也许从此我们会成为朋友至少是合作朋友也许会是有缘分的聊友我下面所讲述的事情会被很多人误解为不正当行为但也许您会有您独到的看法我的准则其实很简单不是欺骗只是想在友好互助的前提下寻求有缘的朋友首先这不是一个天上掉馅饼的好事付出将是辛苦的特别是初期收获当然也是同比例的其次这不带有任何虚假或者欺骗性其中的原委步骤环节都给每个人讲述得很清楚还有这也不是一个什么人都能做的一件事情最少您比较忙碌时间无暇顾及时不能从事不过即便您我无缘合作这件事也许对您的思想还是会有一定的启发或者帮助最后希望您能有分钟的时间初步了解和简单分析这件事加入这项网络活动您可以得到一个您自己的网络主页如果您暂时没有网页设计修改技术您可以原封不动这个主页如果您有这方面的技能那就可以按照您自己想像去修改这个主页还有您在网络展现您的思想才华的方式主要通过您的主页内容以及您能让多少人认知并参加这项网络活动当您做到简单的这两项之后这项活动便可以给您带来一定的物质财富这项财富收入也许没您现在的好但很可能是要远远好于您现在的情况如果您不是企业主或者商人那您以后会因为这项活动给你带来的惊喜而遗憾为何不早一点了解到这项信息当然您只看到这些的时候还是很不清楚这到底是怎么回事如果您还有继续了解的兴趣请到我的这个个人主页上去看看
--------------------------------------------------
sample 2:
实用操作即时全国职业经理人资格证书课程班年月日主办香港光华管理学院承办深圳市一二三管理咨询有限公司你的公司有这样的困惑吗你的管理层高效有力吗企业的竞争力是由占企业的管理人员产生的公司名誉主席巴尼维克甚至说成功是的战略加的执行没有一支高绩效的职业经理管理队伍缺乏