In [2]:
import os
import pandas as pd
from tqdm.notebook import tqdm

## Baseline classification model (tfidf)

In [10]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

In [14]:
def baseline_model(X_TR, X_TE, Y_TR, Y_TE):
    
    X_TR = tfidf_vectorizer.fit_transform(X_TR)
    X_TE = tfidf_vectorizer.transform(X_TE)
    
    Models = ['Bernoulli NB','Multinomial NB','Svm (linear)','Logistic Regression',
              'Random Forest','kNN','Decision Tree','XG Boost']
    function = [BernoulliNB(),MultinomialNB(),svm.SVC(kernel="linear"),LogisticRegression(),
              RandomForestClassifier(),KNeighborsClassifier(),DecisionTreeClassifier(),
                XGBClassifier()]
    perform_f1 = []
    perform_acc = []
    
    
    for i in tqdm(range(len(function))):
        model = function[i]
        #performance = cross_val_score(model, tfidf_vectorizer.fit_transform(x), y, cv=10, scoring'accuracy')
        
        func = str(function[i])
        print("==== ", func[0:func.index('(')], " ====")

        model.fit(X_TR, Y_TR)
        model.score(X_TE, Y_TE)
        e = Y_TE
        p = model.predict(X_TE)
        print(metrics.classification_report(e,p))
        perform_f1.append(metrics.f1_score(e,p,average='macro'))
        perform_acc.append(metrics.accuracy_score(e,p))
        
    result_f1_table = pd.DataFrame({"Models":Models,"Result f1 scores":perform_f1})
    result_acc_table = pd.DataFrame({"Models":Models,"Result acc scores":perform_acc})
    return result_f1_table, result_acc_table

## Load files

In [5]:
# IMDB
path1 = 'C:/Users/doudi/Downloads/'
os.chdir(path1)

file1 = 'Imdb_Seg_no_stopword.csv'
imdb = pd.read_csv(file1)
imdb_train = imdb.iloc[0:25000,:]
imdb_test = imdb.iloc[25000:,:]

# PTT
file2 = 'PTT_movie_seg.csv'
PTT = pd.read_csv(file2)
PTT_train = PTT.iloc[0:2264,:]
PTT_test = PTT.iloc[2264:,:]

# RE
file3 = 'Reader_Emotion_Seg_no_stopword.csv'
RE = pd.read_csv(file3)
RE['concate'] = RE['title'] + RE['content']
RE_train = RE.iloc[0:11671,:]
RE_test = RE.iloc[11671:,:]

## Run models

In [13]:
tfidf_vectorizer.fit_transform(imdb_train['content']).shape

(25000, 74849)

In [15]:
IMDB_output = baseline_model(imdb_train['content'], imdb_test['content'], imdb_train['tag'], imdb_test['tag'])

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

====  BernoulliNB  ====
              precision    recall  f1-score   support

           0       0.79      0.88      0.84     12500
           1       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

====  MultinomialNB  ====
              precision    recall  f1-score   support

           0       0.79      0.89      0.84     12500
           1       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

====  SVC  ====
              precision    recall  f1-score   support

           0       0.88      0.89      0.88     12500
           1       0.89      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted



              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

====  RandomForestClassifier  ====




              precision    recall  f1-score   support

           0       0.70      0.82      0.75     12500
           1       0.78      0.64      0.70     12500

    accuracy                           0.73     25000
   macro avg       0.74      0.73      0.73     25000
weighted avg       0.74      0.73      0.73     25000

====  KNeighborsClassifier  ====
              precision    recall  f1-score   support

           0       0.64      0.72      0.68     12500
           1       0.68      0.60      0.64     12500

    accuracy                           0.66     25000
   macro avg       0.66      0.66      0.66     25000
weighted avg       0.66      0.66      0.66     25000

====  DecisionTreeClassifier  ====
              precision    recall  f1-score   support

           0       0.70      0.71      0.71     12500
           1       0.71      0.70      0.70     12500

    accuracy                           0.70     25000
   macro avg       0.70      0.70      0.70     25000
weight

In [16]:
PTT_output = baseline_model(PTT_train['content'], PTT_test['content'], 
                            PTT_train['label'], PTT_test['label'])

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

====  BernoulliNB  ====
              precision    recall  f1-score   support

           0       0.87      0.58      0.70      1132
           1       0.69      0.91      0.78      1132

    accuracy                           0.75      2264
   macro avg       0.78      0.75      0.74      2264
weighted avg       0.78      0.75      0.74      2264

====  MultinomialNB  ====
              precision    recall  f1-score   support

           0       0.87      0.70      0.78      1132
           1       0.75      0.90      0.82      1132

    accuracy                           0.80      2264
   macro avg       0.81      0.80      0.80      2264
weighted avg       0.81      0.80      0.80      2264

====  SVC  ====
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      1132
           1       0.87      0.88      0.88      1132

    accuracy                           0.88      2264
   macro avg       0.88      0.88      0.88      2264
weighted



              precision    recall  f1-score   support

           0       0.74      0.75      0.74      1132
           1       0.75      0.73      0.74      1132

    accuracy                           0.74      2264
   macro avg       0.74      0.74      0.74      2264
weighted avg       0.74      0.74      0.74      2264

====  KNeighborsClassifier  ====
              precision    recall  f1-score   support

           0       0.77      0.60      0.67      1132
           1       0.67      0.82      0.74      1132

    accuracy                           0.71      2264
   macro avg       0.72      0.71      0.71      2264
weighted avg       0.72      0.71      0.71      2264

====  DecisionTreeClassifier  ====
              precision    recall  f1-score   support

           0       0.70      0.71      0.70      1132
           1       0.70      0.69      0.70      1132

    accuracy                           0.70      2264
   macro avg       0.70      0.70      0.70      2264
weight

In [19]:
RE_output = baseline_model(RE_train['concate'].values.astype('U'), RE_test['concate'].values.astype('U'), RE_train['tag_Num'], RE_test['tag_Num'])

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

====  BernoulliNB  ====


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.45      0.29      0.35      4326
           1       0.00      0.00      0.00       261
           2       1.00      0.07      0.13      1473
           3       0.29      0.72      0.42      7344
           4       0.99      0.07      0.13      1526
           5       0.99      0.19      0.31      1573
           6       0.70      0.55      0.61     18266
           7       1.00      0.00      0.01       835

    accuracy                           0.48     35604
   macro avg       0.68      0.24      0.24     35604
weighted avg       0.63      0.48      0.47     35604

====  MultinomialNB  ====
              precision    recall  f1-score   support

           0       0.30      0.62      0.41      4326
           1       1.00      0.01      0.02       261
           2       0.56      0.98      0.71      1473
           3       0.46      0.45      0.45      7344
           4       0.72      0.99      0.84      152



              precision    recall  f1-score   support

           0       0.28      0.53      0.37      4326
           1       1.00      0.01      0.02       261
           2       0.38      0.98      0.54      1473
           3       0.46      0.35      0.39      7344
           4       0.45      0.99      0.61      1526
           5       0.49      0.99      0.66      1573
           6       0.77      0.46      0.58     18266
           7       0.97      0.73      0.83       835

    accuracy                           0.51     35604
   macro avg       0.60      0.63      0.50     35604
weighted avg       0.61      0.51      0.52     35604

====  RandomForestClassifier  ====




              precision    recall  f1-score   support

           0       0.28      0.24      0.26      4326
           1       0.58      0.96      0.73       261
           2       0.28      0.98      0.44      1473
           3       0.45      0.19      0.27      7344
           4       0.32      0.97      0.48      1526
           5       0.39      0.96      0.56      1573
           6       0.70      0.53      0.61     18266
           7       0.88      0.93      0.91       835

    accuracy                           0.50     35604
   macro avg       0.49      0.72      0.53     35604
weighted avg       0.56      0.50      0.49     35604

====  KNeighborsClassifier  ====
              precision    recall  f1-score   support

           0       0.27      0.42      0.33      4326
           1       0.19      0.50      0.28       261
           2       0.18      0.66      0.28      1473
           3       0.38      0.29      0.32      7344
           4       0.23      0.49      0.31  

In [24]:
RE_output

(                Models  Result f1 scores
 0         Bernoulli NB          0.244797
 1       Multinomial NB          0.578334
 2         Svm (linear)          0.577029
 3  Logistic Regression          0.500006
 4        Random Forest          0.530450
 5                  kNN          0.333972
 6        Decision Tree          0.457719
 7             XG Boost          0.263893,
                 Models  Result acc scores
 0         Bernoulli NB           0.477755
 1       Multinomial NB           0.564515
 2         Svm (linear)           0.494916
 3  Logistic Regression           0.514886
 4        Random Forest           0.496264
 5                  kNN           0.371672
 6        Decision Tree           0.410684
 7             XG Boost           0.418296)