In [59]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score, 
                             precision_score,
                             recall_score, 
                             f1_score)

from stopwordsiso import stopwords
import jieba

In [68]:
# Load the pre-split train / validation / test CSV files into DataFrames.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/informational_train.csv",
        "../data/informational_val.csv",
        "../data/informational_test.csv",
    )
)

# Bare last expression so the notebook renders the training frame.
df_train

Unnamed: 0,text,label
0,他克莫司软膏（普特彼） 0.1%*10g 数量：1盒,1
1,家附近医院心内科先做好,0
2,出院小结能否拍照给我？,0
3,呃，有可能的，这个小朋友就这几个最常见的突然会加重的因素，脸上加重的话，这个食物如果身上还好...,1
4,我倾向良性概率大,1
...,...,...
11043,是的，需要。,0
11044,性激素六项和游离睾酮,0
11045,需要病人的具体检查号码，或者纸质报告的照片,0
11046,就是以前感染过呀,0


In [69]:
# Multinomial Naive Bayes classifier, created with its default hyperparameters.
mnb = MultinomialNB()

In [70]:
# For each split, pull out the "text" column as the model input and the
# "label" column as the target.
X_train, y_train = df_train["text"], df_train["label"]
X_val, y_val = df_val["text"], df_val["label"]
X_test, y_test = df_test["text"], df_test["label"]

In [71]:
# Stop words for language code "zh" from stopwordsiso, converted to a list
# so they can be passed to the vectorizer's `stop_words` parameter.
stopwords_list = list(stopwords("zh"))

def tokenize_zh(text):
    """Segment a document into its words with jieba and return them as a list."""
    return jieba.lcut(text)

# TF-IDF features over jieba tokens.
# - token_pattern=None: a custom tokenizer is supplied, so the default regex
#   token pattern is never used; passing None avoids scikit-learn's
#   "token_pattern will not be used" UserWarning.
# - min_df=30: ignore terms that occur in fewer than 30 training documents.
# NOTE(review): stop words are compared against the tokens jieba produces, so
# a stop word that jieba would split into several tokens may not be filtered
# out — confirm against any stop-word consistency warning.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize_zh,
    token_pattern=None,
    stop_words=stopwords_list,
    min_df=30,
)

# Fit the vocabulary / IDF weights on the training text only, then apply the
# same fitted transform to the validation and test text.
# NOTE: these lines rebind X_train / X_val / X_test from raw text to dense
# feature arrays, so the cell that defines the raw text must be re-run before
# this cell can be executed again.
X_train = vectorizer.fit_transform(X_train).toarray()
X_val = vectorizer.transform(X_val).toarray()
X_test = vectorizer.transform(X_test).toarray()



In [72]:
# Sanity check: every split must have the same number of feature columns.
tuple(features.shape[1] for features in (X_train, X_val, X_test))

(546, 546, 546)

In [73]:
# Train the classifier on the training features and their labels.
mnb.fit(X_train, y_train)

In [74]:
# 10-fold cross-validated accuracy computed on the validation split.
# NOTE(review): cross_val_score fits a fresh clone of the estimator on each
# fold, so these scores come from models trained on validation folds rather
# than from the `mnb` already fitted on X_train — confirm this is intended
# (e.g. versus scoring the fitted model directly on X_val).
cv_scores = cross_val_score(mnb, X_val, y_val, cv = 10, scoring='accuracy')
cv_scores

array([0.73356401, 0.74740484, 0.68512111, 0.70934256, 0.76124567,
       0.70242215, 0.73263889, 0.70486111, 0.71527778, 0.72222222])

In [75]:
# Predict labels for the test split with the fitted classifier.
predictions = mnb.predict(X_test)

# Compute each metric as metric(y_true, y_pred), in display order.
accuracy, precision, recall, f1 = (
    metric(y_test, predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Bare tuple so the notebook displays all four scores together.
accuracy, precision, recall, f1

(0.7484407484407485,
 0.8357685563997662,
 0.7626666666666667,
 0.7975460122699386)