### 匯入套件

In [1]:
import pandas as pd
import warnings

In [2]:
from sklearn.tree import DecisionTreeClassifier    #決策樹
from sklearn.ensemble import RandomForestClassifier#隨機森林
from sklearn.naive_bayes import GaussianNB         #天真貝氏
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn import linear_model                   #邏輯回歸
from sklearn import svm                            #SVM

In [3]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [4]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn diagnostics (e.g. convergence or
# undefined-metric warnings) -- confirm that blanket suppression is intended.
warnings.simplefilter('ignore')

### 導入Bag of Words資料

In [5]:
# Load the bag-of-words table. The first CSV row supplies the column names
# and no column is promoted to the index.
BagOfWord = pd.read_csv(
    r"..\..\Feature_Array\Main_Features\Bag_of_Word.csv",
    header=0,
    index_col=None,
)

### 導入統計特徵資料(第一、第二、第三人稱代詞、留言長度)

In [6]:
# Load the statistical-features table. The first CSV row supplies the column
# names and no column is promoted to the index.
Statistical_Features = pd.read_csv(
    r"..\..\Feature_Array\Extra_Features\Statistical_Features.csv",
    header=0,
    index_col=None,
)

### 導入10類詞性資料

In [7]:
# Load the simplified part-of-speech table. The first CSV row supplies the
# column names and no column is promoted to the index.
POS_data = pd.read_csv(
    r"..\..\Feature_Array\Extra_Features\Simple_POS_data.csv",
    header=0,
    index_col=None,
)

### 導入情感特徵資料(正面情感字數、負面情感字數、情感分數、表情符號數、表情符號情感分數)

In [8]:
# Load the sentiment-features table. The first CSV row supplies the column
# names and no column is promoted to the index.
Sentiment = pd.read_csv(
    r"..\..\Feature_Array\Extra_Features\All_Sentiment.csv",
    header=0,
    index_col=None,
)

### 導入卡方霸凌詞資料

In [9]:
# Load the chi-square bully-word table. The first CSV row supplies the column
# names and no column is promoted to the index.
BullyWord = pd.read_csv(
    r"..\..\Feature_Array\Extra_Features\Chisquare_Bully_data.csv",
    header=0,
    index_col=None,
)

### X：Bag of Words、統計特徵(包含10類詞性)、情感特徵、卡方霸凌詞

In [10]:
# Feature matrix: the five tables placed side by side (axis=1).
# The column slices are positional: the bag-of-words table loses its last
# column; the statistical, sentiment and bully-word tables lose their first
# column; the POS table is taken whole.
# NOTE(review): presumably the dropped last column is 'answer' and the dropped
# first columns are row identifiers -- confirm against the CSVs, and check
# that POS_data really has no such leading column.
X = pd.concat(
    [
        BagOfWord.iloc[:, :-1],
        Statistical_Features.iloc[:, 1:],
        POS_data.iloc[:, :],
        Sentiment.iloc[:, 1:],
        BullyWord.iloc[:, 1:],
    ],
    axis=1,
)

# Target variable: the 'answer' column of the bag-of-words table.
Y = BagOfWord['answer']

### 宣告list 儲存評估指標結果

In [11]:
# One accumulator per evaluation metric. Each classifier section appends its
# mean cross-validated score, so index i in every list refers to the i-th
# classifier evaluated.
(
    list_accuracy,
    list_precision,
    list_recall,
    list_f1_Weight,
    list_f1_Micro,
    list_f1_Macro,
    list_AUC,
) = ([] for _ in range(7))

### 決策樹

In [12]:
# Decision tree with scikit-learn's default hyperparameters.
DT = DecisionTreeClassifier()
# 10-fold cross-validation with shuffling. The fixed seed makes every scoring
# call that receives KF see the same folds, so the metrics are comparable with
# each other and the partition is the same on every run.
KF = KFold(n_splits=10, shuffle=True, random_state=42)

In [13]:
# Evaluate the decision tree. A single cross_validate pass yields all seven
# metrics from the same folds and the same fitted models, instead of
# refitting the classifier once per metric.
DT_scores = cross_validate(
    DT, X, Y, cv=KF,
    scoring=['accuracy', 'precision', 'recall',
             'f1_weighted', 'f1_micro', 'f1_macro', 'roc_auc'],
)

# Per-fold score arrays keep their original names; the mean over the folds,
# rounded to four decimals, goes into the shared result lists.
DT_accuracy = DT_scores['test_accuracy']
list_accuracy.append(round(DT_accuracy.mean(), 4))

DT_precision = DT_scores['test_precision']
list_precision.append(round(DT_precision.mean(), 4))

DT_recall = DT_scores['test_recall']
list_recall.append(round(DT_recall.mean(), 4))

DT_f1_Weight = DT_scores['test_f1_weighted']
list_f1_Weight.append(round(DT_f1_Weight.mean(), 4))

DT_f1_Micro = DT_scores['test_f1_micro']
list_f1_Micro.append(round(DT_f1_Micro.mean(), 4))

DT_f1_Macro = DT_scores['test_f1_macro']
list_f1_Macro.append(round(DT_f1_Macro.mean(), 4))

DT_auc = DT_scores['test_roc_auc']
list_AUC.append(round(DT_auc.mean(), 4))

### 隨機森林

In [14]:
# Random forest with scikit-learn's default hyperparameters.
RF = RandomForestClassifier()
# 10-fold cross-validation with shuffling. The fixed seed makes every scoring
# call that receives KF see the same folds, so the metrics are comparable with
# each other and the partition is the same on every run.
KF = KFold(n_splits=10, shuffle=True, random_state=42)

In [15]:
# Evaluate the random forest. A single cross_validate pass yields all seven
# metrics from the same folds and the same fitted models, instead of
# refitting the classifier once per metric.
RF_scores = cross_validate(
    RF, X, Y, cv=KF,
    scoring=['accuracy', 'precision', 'recall',
             'f1_weighted', 'f1_micro', 'f1_macro', 'roc_auc'],
)

# Per-fold score arrays keep their original names; the mean over the folds,
# rounded to four decimals, goes into the shared result lists.
RF_accuracy = RF_scores['test_accuracy']
list_accuracy.append(round(RF_accuracy.mean(), 4))

RF_precision = RF_scores['test_precision']
list_precision.append(round(RF_precision.mean(), 4))

RF_recall = RF_scores['test_recall']
list_recall.append(round(RF_recall.mean(), 4))

RF_f1_Weight = RF_scores['test_f1_weighted']
list_f1_Weight.append(round(RF_f1_Weight.mean(), 4))

RF_f1_Micro = RF_scores['test_f1_micro']
list_f1_Micro.append(round(RF_f1_Micro.mean(), 4))

RF_f1_Macro = RF_scores['test_f1_macro']
list_f1_Macro.append(round(RF_f1_Macro.mean(), 4))

RF_auc = RF_scores['test_roc_auc']
list_AUC.append(round(RF_auc.mean(), 4))

### 天真貝氏

In [16]:
# Gaussian naive Bayes with scikit-learn's default settings.
NB = GaussianNB()
# 10-fold cross-validation with shuffling. The fixed seed makes every scoring
# call that receives KF see the same folds, so the metrics are comparable with
# each other and the partition is the same on every run.
KF = KFold(n_splits=10, shuffle=True, random_state=42)

In [17]:
# Evaluate Gaussian naive Bayes. A single cross_validate pass yields all seven
# metrics from the same folds and the same fitted models, instead of
# refitting the classifier once per metric.
NB_scores = cross_validate(
    NB, X, Y, cv=KF,
    scoring=['accuracy', 'precision', 'recall',
             'f1_weighted', 'f1_micro', 'f1_macro', 'roc_auc'],
)

# Per-fold score arrays keep their original names; the mean over the folds,
# rounded to four decimals, goes into the shared result lists.
NB_accuracy = NB_scores['test_accuracy']
list_accuracy.append(round(NB_accuracy.mean(), 4))

NB_precision = NB_scores['test_precision']
list_precision.append(round(NB_precision.mean(), 4))

NB_recall = NB_scores['test_recall']
list_recall.append(round(NB_recall.mean(), 4))

NB_f1_Weight = NB_scores['test_f1_weighted']
list_f1_Weight.append(round(NB_f1_Weight.mean(), 4))

NB_f1_Micro = NB_scores['test_f1_micro']
list_f1_Micro.append(round(NB_f1_Micro.mean(), 4))

NB_f1_Macro = NB_scores['test_f1_macro']
list_f1_Macro.append(round(NB_f1_Macro.mean(), 4))

NB_auc = NB_scores['test_roc_auc']
list_AUC.append(round(NB_auc.mean(), 4))

### 線性邏輯回歸

In [18]:
# Logistic regression with scikit-learn's default settings.
LR = linear_model.LogisticRegression()
# 10-fold cross-validation with shuffling. The fixed seed makes every scoring
# call that receives KF see the same folds, so the metrics are comparable with
# each other and the partition is the same on every run.
KF = KFold(n_splits=10, shuffle=True, random_state=42)

In [19]:
# Evaluate logistic regression. A single cross_validate pass yields all seven
# metrics from the same folds and the same fitted models, instead of
# refitting the classifier once per metric.
LR_scores = cross_validate(
    LR, X, Y, cv=KF,
    scoring=['accuracy', 'precision', 'recall',
             'f1_weighted', 'f1_micro', 'f1_macro', 'roc_auc'],
)

# Per-fold score arrays keep their original names; the mean over the folds,
# rounded to four decimals, goes into the shared result lists.
LR_accuracy = LR_scores['test_accuracy']
list_accuracy.append(round(LR_accuracy.mean(), 4))

LR_precision = LR_scores['test_precision']
list_precision.append(round(LR_precision.mean(), 4))

LR_recall = LR_scores['test_recall']
list_recall.append(round(LR_recall.mean(), 4))

LR_f1_Weight = LR_scores['test_f1_weighted']
list_f1_Weight.append(round(LR_f1_Weight.mean(), 4))

LR_f1_Micro = LR_scores['test_f1_micro']
list_f1_Micro.append(round(LR_f1_Micro.mean(), 4))

LR_f1_Macro = LR_scores['test_f1_macro']
list_f1_Macro.append(round(LR_f1_Macro.mean(), 4))

LR_auc = LR_scores['test_roc_auc']
list_AUC.append(round(LR_auc.mean(), 4))

### KNN

In [20]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
KNN = KNeighborsClassifier()
# 10-fold cross-validation with shuffling. The fixed seed makes every scoring
# call that receives KF see the same folds, so the metrics are comparable with
# each other and the partition is the same on every run.
KF = KFold(n_splits=10, shuffle=True, random_state=42)

In [21]:
# Evaluate k-nearest neighbours. A single cross_validate pass yields all seven
# metrics from the same folds and the same fitted models, instead of
# refitting the classifier once per metric.
KNN_scores = cross_validate(
    KNN, X, Y, cv=KF,
    scoring=['accuracy', 'precision', 'recall',
             'f1_weighted', 'f1_micro', 'f1_macro', 'roc_auc'],
)

# Per-fold score arrays keep their original names; the mean over the folds,
# rounded to four decimals, goes into the shared result lists.
KNN_accuracy = KNN_scores['test_accuracy']
list_accuracy.append(round(KNN_accuracy.mean(), 4))

KNN_precision = KNN_scores['test_precision']
list_precision.append(round(KNN_precision.mean(), 4))

KNN_recall = KNN_scores['test_recall']
list_recall.append(round(KNN_recall.mean(), 4))

KNN_f1_Weight = KNN_scores['test_f1_weighted']
list_f1_Weight.append(round(KNN_f1_Weight.mean(), 4))

KNN_f1_Micro = KNN_scores['test_f1_micro']
list_f1_Micro.append(round(KNN_f1_Micro.mean(), 4))

KNN_f1_Macro = KNN_scores['test_f1_macro']
list_f1_Macro.append(round(KNN_f1_Macro.mean(), 4))

KNN_auc = KNN_scores['test_roc_auc']
list_AUC.append(round(KNN_auc.mean(), 4))

### SVM

In [22]:
# Support vector classifier with scikit-learn's default settings.
SVM = svm.SVC()
# 10-fold cross-validation with shuffling. The fixed seed makes every scoring
# call that receives KF see the same folds, so the metrics are comparable with
# each other and the partition is the same on every run.
KF = KFold(n_splits=10, shuffle=True, random_state=42)

In [23]:
# Evaluate the support vector classifier. A single cross_validate pass yields
# all seven metrics from the same folds and the same fitted models, instead of
# refitting the classifier once per metric.
SVM_scores = cross_validate(
    SVM, X, Y, cv=KF,
    scoring=['accuracy', 'precision', 'recall',
             'f1_weighted', 'f1_micro', 'f1_macro', 'roc_auc'],
)

# Per-fold score arrays keep their original names; the mean over the folds,
# rounded to four decimals, goes into the shared result lists.
SVM_accuracy = SVM_scores['test_accuracy']
list_accuracy.append(round(SVM_accuracy.mean(), 4))

SVM_precision = SVM_scores['test_precision']
list_precision.append(round(SVM_precision.mean(), 4))

SVM_recall = SVM_scores['test_recall']
list_recall.append(round(SVM_recall.mean(), 4))

SVM_f1_Weight = SVM_scores['test_f1_weighted']
list_f1_Weight.append(round(SVM_f1_Weight.mean(), 4))

SVM_f1_Micro = SVM_scores['test_f1_micro']
list_f1_Micro.append(round(SVM_f1_Micro.mean(), 4))

SVM_f1_Macro = SVM_scores['test_f1_macro']
list_f1_Macro.append(round(SVM_f1_Macro.mean(), 4))

SVM_auc = SVM_scores['test_roc_auc']
list_AUC.append(round(SVM_auc.mean(), 4))

In [24]:
# Classifier labels, listed in the same order in which the sections above
# appended their scores to the metric lists.
result_title = [
    'Decision Tree',
    'Random Forest',
    'Gaussian Naive Bayes',
    'Logistic regression',
    'KNN',
    'SVM',
]

# Column name -> values: the first entry holds the classifier labels, the
# remaining entries hold one list of mean scores per metric.
result_dict = {
    '特徵組合四_10_POS_卡方': result_title,
    'Accuracy': list_accuracy,
    'Precision': list_precision,
    'Recall': list_recall,
    'F1_Weight': list_f1_Weight,
    'F1_Micro': list_f1_Micro,
    'F1_Macro': list_f1_Macro,
    'AUC': list_AUC,
}

In [25]:
# Build the table from result_dict, then transpose it so that each dict key
# becomes a row and each classifier a column.
df = pd.DataFrame(result_dict).transpose()

In [26]:
# Save the table. index=True keeps the row labels, header=False omits the
# column labels, and 'utf_8_sig' writes UTF-8 with a byte-order mark.
df.to_csv(
    r"..\..\結果\Word_Array+Feature_Crosses_4\特徵組合四_10_POS_卡方.csv",
    index=True,
    header=False,
    encoding='utf_8_sig',
)