## 匯入套件

In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import linear_model

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

## 導入 BOW資料

In [2]:
BagOfWord = pd.read_csv(r"..\data1\學長data\標記答案_最終答案\wordbag.csv", header=0,index_col=None)

BOW_X=BagOfWord .iloc[:, :-1]
BOW_Y=BagOfWord['answer']

## 導入 TF-IDF資料

#### TF-IDF 1000字資料導入

In [3]:
TFIDF_1000 = pd.read_csv(r"..\data1\學長data\標記答案_最終答案\TF-IDF(1000).csv", header=0,index_col=None)

TFIDF_1000_X = TFIDF_1000.iloc[:, :-1] # Features
TFIDF_1000_y = TFIDF_1000['answer'] # Target variable

#### TF-IDF 500字資料導入

In [4]:
TFIDF_500 = pd.read_csv(r"..\data1\學長data\標記答案_最終答案\TF-IDF(500).csv", header=0, index_col=None)

TFIDF_500_X = TFIDF_500.iloc[:, :-1] # Features
TFIDF_500_y = TFIDF_500['answer'] # Target variable

## Bag of words KNN

In [5]:
BOW_NB_acc=cross_val_score(NB,BOW_X,BOW_Y,cv=KF,scoring='accuracy')
print(f"\nAccuracy: {BOW_NB_acc.mean():.2%}")

BOW_NB_precision = cross_val_score(NB,BOW_X,BOW_Y,cv=KF,scoring='precision')
print(f"\nPrecision: {BOW_NB_precision.mean():.2%}")

BOW_NB_recall = cross_val_score(NB,BOW_X,BOW_Y,cv=KF,scoring='recall')
print(f"\nRecall: {BOW_NB_recall.mean():.2%}")

BOW_NB_f1_W = cross_val_score(NB,BOW_X,BOW_Y,cv=KF,scoring='f1_weighted')
print(f"\nF1 (weighted): {BOW_NB_f1_W.mean():.2%}")

BOW_NB_f1_Mi = cross_val_score(NB,BOW_X,BOW_Y,cv=KF,scoring='f1_micro')
print(f"\nF1 (micro): {BOW_NB_f1_Mi.mean():.2%}")

BOW_NB_f1_Ma = cross_val_score(NB,BOW_X,BOW_Y,cv=KF,scoring='f1_macro')
print(f"\nF1 (macro): {BOW_NB_f1_Ma.mean():.2%}")

BOW_NB_auc = cross_val_score(NB,BOW_X,BOW_Y,cv=KF,scoring='roc_auc')
print(f"\nAUC: {BOW_NB_auc.mean():.2%}")


Accuracy: 69.10%

Precision: 80.12%

Recall: 53.61%

F1 (weighted): 67.11%

F1 (micro): 69.13%

F1 (macro): 66.18%

AUC: 73.46%


## TF-IDF 1000字 KNN

In [6]:
TFIDF_1000_KNN_acc=cross_val_score(knn,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='accuracy')
print(f"\nAccuracy: {TFIDF_1000_KNN_acc.mean():.2%}")

TFIDF_1000_KNN_precision = cross_val_score(knn,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='precision')
print(f"\nPrecision: {TFIDF_1000_KNN_precision.mean():.2%}")

TFIDF_1000_KNN_recall = cross_val_score(knn,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='recall')
print(f"\nRecall: {TFIDF_1000_KNN_recall.mean():.2%}")

TFIDF_1000_KNN_f1_W = cross_val_score(knn,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='f1_weighted')
print(f"\nF1 (weighted): {TFIDF_1000_KNN_f1_W.mean():.2%}")

TFIDF_1000_KNN_f1_Mi = cross_val_score(knn,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='f1_micro')
print(f"\nF1 (micro): {TFIDF_1000_KNN_f1_Mi.mean():.2%}")

TFIDF_1000_KNN_f1_Ma = cross_val_score(knn,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='f1_macro')
print(f"\nF1 (macro): {TFIDF_1000_KNN_f1_Ma.mean():.2%}")

TFIDF_1000_KNN_auc = cross_val_score(knn,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='roc_auc')
print(f"\nAUC: {TFIDF_1000_KNN_auc.mean():.2%}")


Accuracy: 67.28%

Precision: 72.15%

Recall: 64.57%

F1 (weighted): 68.15%

F1 (micro): 67.37%

F1 (macro): 68.73%

AUC: 73.90%


## TF-IDF 500字 KNN

In [7]:
TFIDF_500_KNN_acc=cross_val_score(knn,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='accuracy')
print(f"\nAccuracy: {TFIDF_500_KNN_acc.mean():.2%}")

TFIDF_500_KNN_precision = cross_val_score(knn,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='precision')
print(f"\nPrecision: {TFIDF_500_KNN_precision.mean():.2%}")

TFIDF_500_KNN_recall = cross_val_score(knn,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='recall')
print(f"\nRecall: {TFIDF_500_KNN_recall.mean():.2%}")

TFIDF_500_KNN_f1_W = cross_val_score(knn,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='f1_weighted')
print(f"\nF1 (weighted): {TFIDF_500_KNN_f1_W.mean():.2%}")

TFIDF_500_KNN_f1_Mi = cross_val_score(knn,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='f1_micro')
print(f"\nF1 (micro): {TFIDF_500_KNN_f1_Mi.mean():.2%}")

TFIDF_500_KNN_f1_Ma = cross_val_score(knn,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='f1_macro')
print(f"\nF1 (macro): {TFIDF_500_KNN_f1_Ma.mean():.2%}")

TFIDF_500_KNN_auc = cross_val_score(knn,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='roc_auc')
print(f"\nAUC: {TFIDF_500_KNN_auc.mean():.2%}")


Accuracy: 68.36%

Precision: 74.51%

Recall: 59.37%

F1 (weighted): 68.86%

F1 (micro): 69.27%

F1 (macro): 67.90%

AUC: 74.02%
