## 匯入套件

In [2]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import linear_model

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score

## 導入 BOW資料

In [18]:
BagOfWord = pd.read_csv(r"..\data1\學長data\標記答案_最終答案\wordbag.csv", header=0,index_col=None)

BOW_X=BagOfWord .iloc[:, :-1]
BOW_Y=BagOfWord['answer']

## 導入 TF-IDF資料

#### TF-IDF 1000字資料導入

In [27]:
TFIDF_1000 = pd.read_csv(r"..\data1\學長data\標記答案_最終答案\TF-IDF(1000).csv", header=0,index_col=None)

TFIDF_1000_X = TFIDF_1000.iloc[:, :-1] # Features
TFIDF_1000_y = TFIDF_1000['answer'] # Target variable

#### TF-IDF 500字資料導入

In [34]:
TFIDF_500 = pd.read_csv(r"..\data1\學長data\標記答案_最終答案\TF-IDF(500).csv", header=0, index_col=None)

TFIDF_500_X = TFIDF_500.iloc[:, :-1] # Features
TFIDF_500_y = TFIDF_500['answer'] # Target variable

## Bag of words KNN

In [39]:
DT = DecisionTreeClassifier()
KF = KFold(n_splits=10, shuffle=True) 

BOW_DT_acc = cross_val_score(DT,BOW_X,BOW_Y,cv=KF,scoring='accuracy')
print(f"\nAccuracy: {BOW_DT_acc.mean():.2%}")

BOW_DT_precision = cross_val_score(DT,BOW_X,BOW_Y,cv=KF,scoring='precision')
print(f"\nPrecision: {BOW_DT_precision.mean():.2%}")

BOW_DT_recall = cross_val_score(DT,BOW_X,BOW_Y,cv=KF,scoring='recall')
print(f"\nRecall: {BOW_DT_recall.mean():.2%}")

BOW_DT_f1_W = cross_val_score(DT,BOW_X,BOW_Y,cv=KF,scoring='f1_weighted')
print(f"\nF1 (weighted): {BOW_DT_f1_W.mean():.2%}")

BOW_DT_f1_Mi = cross_val_score(DT,BOW_X,BOW_Y,cv=KF,scoring='f1_micro')
print(f"\nF1 (micro): {BOW_DT_f1_Mi.mean():.2%}")

BOW_DT_f1_Ma = cross_val_score(DT,BOW_X,BOW_Y,cv=KF,scoring='f1_macro')
print(f"\nF1 (macro): {BOW_DT_f1_Ma.mean():.2%}")

BOW_DT_auc = cross_val_score(DT,BOW_X,BOW_Y,cv=KF,scoring='roc_auc')
print(f"\nAUC: {BOW_DT_auc.mean():.2%}")


Accuracy: 73.38%

Precision: 78.07%

Recall: 66.39%

F1 (weighted): 73.79%

F1 (micro): 74.06%

F1 (macro): 73.23%

AUC: 76.54%


## TF-IDF 1000字 決策樹

In [35]:
TFIDF_1000_DT_acc = cross_val_score(DT,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='accuracy')
print(f"\nAccuracy: {TFIDF_1000_DT_acc.mean():.2%}")

TFIDF_1000_precision = cross_val_score(DT,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='precision')
print(f"\nPrecion: {TFIDF_1000_precision.mean():.2%}")

TFIDF_1000_DT_recall = cross_val_score(DT,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='recall')
print(f"\nRecall: {TFIDF_1000_DT_recall.mean():.2%}")

TFIDF_1000_DT_f1_W = cross_val_score(DT,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='f1_weighted')
print(f"\nF1 (weighted): {TFIDF_1000_DT_f1_W.mean():.2%}")

TFIDF_1000_DT_f1_Mi = cross_val_score(DT,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='f1_micro')
print(f"\nF1 (micro): {TFIDF_1000_DT_f1_Mi.mean():.2%}")

TFIDF_1000_DT_f1_Ma = cross_val_score(DT,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='f1_macro')
print(f"\nF1 (macro): {TFIDF_1000_DT_f1_Ma.mean():.2%}")

TFIDF_1000_DT_auc = cross_val_score(DT,TFIDF_1000_X,TFIDF_1000_y,cv=KF,scoring='roc_auc')
print(f"\nAUC: {TFIDF_1000_DT_auc.mean():.2%}")


Accuracy: 71.87%

Precion: 74.51%

Recall: 68.21%

F1 (weighted): 72.47%

F1 (micro): 72.71%

F1 (macro): 72.36%

AUC: 75.66%


## TF-IDF 500字 決策樹

In [36]:
TFIDF_500_DT_acc = cross_val_score(DT,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='accuracy')
print(f"\nAccuracy: {TFIDF_500_DT_acc.mean():.2%}")

TFIDF_500_precision = cross_val_score(DT,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='precision')
print(f"\nPrecion: {TFIDF_500_precision.mean():.2%}")

TFIDF_500_DT_recall = cross_val_score(DT,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='recall')
print(f"\nRecall: {TFIDF_500_DT_recall.mean():.2%}")

TFIDF_500_DT_f1_W = cross_val_score(DT,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='f1_weighted')
print(f"\nF1 (weighted): {TFIDF_500_DT_f1_W.mean():.2%}")

TFIDF_500_DT_f1_Mi = cross_val_score(DT,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='f1_micro')
print(f"\nF1 (micro): {TFIDF_500_DT_f1_Mi.mean():.2%}")

TFIDF_500_DT_f1_Ma = cross_val_score(DT,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='f1_macro')
print(f"\nF1 (macro): {TFIDF_500_DT_f1_Ma.mean():.2%}")

TFIDF_500_DT_auc = cross_val_score(DT,TFIDF_500_X,TFIDF_500_y,cv=KF,scoring='roc_auc')
print(f"\nAUC: {TFIDF_500_DT_auc.mean():.2%}")


Accuracy: 71.24%

Precion: 76.15%

Recall: 64.54%

F1 (weighted): 71.11%

F1 (micro): 71.87%

F1 (macro): 71.29%

AUC: 75.21%
