In [1]:
##### Sentiment Analysis

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Read the movie reviews dataset from CSV, then print a summary of its
# columns, non-null counts and dtypes.
data = pd.read_csv(filepath_or_buffer='DMSC.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 10 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   ID             1048575 non-null  int64 
 1   Movie_Name_EN  1048575 non-null  object
 2   Movie_Name_CN  1048575 non-null  object
 3   Crawl_Date     1048575 non-null  object
 4   Number         1048575 non-null  int64 
 5   Username       1048505 non-null  object
 6   Date           1048575 non-null  object
 7   Star           1048575 non-null  int64 
 8   Comment        1048575 non-null  object
 9   Like           1048575 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 80.0+ MB


In [4]:
def make_label(star):
    """Map a star rating to a binary sentiment label.

    Returns 1 when ``star`` is strictly greater than 3, otherwise 0.
    """
    return 1 if star > 3 else 0
    
# Derive the 0/1 sentiment target from each review's star rating.
data['Sentiment'] = data['Star'].apply(make_label)

In [5]:
import jieba

def chinese_word_cut(mytext):
    """Segment ``mytext`` with ``jieba.cut`` and return the pieces joined by single spaces."""
    segments = jieba.cut(mytext)
    return " ".join(segments)

# Store the space-separated segmentation of every comment in a new column.
data['Cut_Comment'] = data['Comment'].apply(chinese_word_cut)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/m3/fy9sk0m56rz89mjqdd9_9chh0000gn/T/jieba.cache
Loading model cost 0.826 seconds.
Prefix dict has been built successfully.


In [6]:
def get_custom_stopwords(stop_words_file, encoding='utf-8'):
    """Read a stop-word list from a text file holding one word per line.

    Parameters
    ----------
    stop_words_file : str or path-like
        Path of the stop-word file.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's default encoding; use e.g.
        'gbk' or 'utf-8-sig' for files saved that way.

    Returns
    -------
    list of str
        The stop words in file order, with surrounding whitespace removed and
        blank lines skipped.
    """
    with open(stop_words_file, encoding=encoding) as f:
        # Strip each line so stray spaces do not produce entries that can never
        # match a token, and drop the empty entries left by blank lines or a
        # trailing newline.
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]

# Location of the Chinese stop-word file, and the list of words read from it.
stop_words_file = 'stop_words_chinese.txt'
stopwords = get_custom_stopwords(stop_words_file=stop_words_file)

In [8]:

# Bag-of-words vectorizer applied to the space-separated segmented comments.
#   max_df=0.8  -> drop terms that occur in more than 80% of the documents
#   min_df=3    -> drop terms that occur in fewer than 3 documents
#   token_pattern keeps tokens of at least two word characters whose first
#   character is not a digit.
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and accept only 'english', a list, or None (a frozenset is rejected).
vect = CountVectorizer(
    max_df=0.8,
    min_df=3,
    token_pattern=r'(?u)\b[^\d\W]\w+\b',
    stop_words=list(stopwords),
)

In [9]:
# Model inputs: the segmented comment text as features, the sentiment label as target.
X = data.Cut_Comment
y = data['Sentiment']

def evaluate_model(X,Y,model, model_name, params):
    """Tune ``model`` with a grid search and score the best estimator on held-out data.

    The documents in ``X`` are split into train and test portions and turned
    into token counts with the notebook-level ``vect`` vectorizer (fitted on
    the training portion only). ``model`` is then tuned over ``params`` with
    10-fold cross-validation and the refitted best estimator is scored on the
    test portion.

    Parameters
    ----------
    X : sequence of documents
        Text inputs accepted by ``vect``.
    Y : array-like
        Target labels aligned with ``X``.
    model : estimator
        Unfitted scikit-learn estimator to tune.
    model_name : str
        Label stored under the 'Classifier' key of the result.
    params : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        Keys: 'Classifier', 'params' (the best fitted estimator),
        'Test Score', 'CV Score' (mean cross-validated score of the best
        candidate), 'Macro F1 Score' and 'Micro F1 Score'.
    """
    # Seed the global NumPy RNG so the unseeded train/test split is repeatable.
    np.random.seed(1)

    # Split using the arguments that were passed in (not the notebook globals).
    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    # Fit the vocabulary on the training documents only, then reuse it for the test set.
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)

    # Hyper-parameter tuning: pick the best candidate from ``params`` by 10-fold CV.
    clf = GridSearchCV(model, params, cv=10)
    clf.fit(X_train_vect, y_train)

    # Evaluate the best estimator on the test data; predict once and reuse
    # the predictions for both F1 variants.
    Score = clf.score(X_test_vect, y_test)
    y_pred = clf.predict(X_test_vect)
    Macro_F1_Score = f1_score(y_test, y_pred, average='macro')
    Micro_F1_Score = f1_score(y_test, y_pred, average='micro')

    # Mean cross-validated score of the winning parameter combination.
    CV_Score = clf.cv_results_['mean_test_score'][clf.best_index_]

    return {'Classifier': model_name, 'params': clf.best_estimator_, 'Test Score': Score,
            'CV Score': CV_Score, 'Macro F1 Score': Macro_F1_Score, 'Micro F1 Score': Micro_F1_Score}

In [10]:
def init_classifiers():
    """Build the ``(estimator, name, parameter grid)`` triples to evaluate.

    Each fresh estimator is paired with the entry of ``model_names`` at the
    same position and with its module-level ``param_grid_*`` variable; both
    are looked up when the function is called.
    """
    estimators_and_grids = [
        (SVC(), param_grid_svc),
        (LogisticRegression(), param_grid_logistic),
        (KNeighborsClassifier(), param_grid_knn),
        (MultinomialNB(), param_grid_nb),
        (DecisionTreeClassifier(), param_grid_tree),
        (RandomForestClassifier(), param_grid_rf),
        (AdaBoostClassifier(), param_grid_adaboost),
        (MLPClassifier(), param_grid_MLP),
    ]
    return [
        (estimator, model_names[position], grid)
        for position, (estimator, grid) in enumerate(estimators_and_grids)
    ]

# 'model_names' contains the display names of the classifiers, in the same
# order in which init_classifiers() indexes them.
model_names = ['SVM','LR','KNN','NB','DecisionTree','RF','AdaBoost','MLP']

# Hyper-parameter grids searched by GridSearchCV, one per classifier.
# A list of dicts means each dict is explored as a separate sub-grid.
param_grid_svc = [{'C':[0.1,1],'kernel':['rbf','linear','poly','sigmoid'],'random_state':[1]}]

# The solver is pinned to 'liblinear' because it supports both the 'l1' and
# 'l2' penalties; the default solver of recent scikit-learn releases ('lbfgs')
# rejects 'l1', which would make every 'l1' candidate fail to fit.
param_grid_logistic = [{'C':[0.1,1], 'penalty':['l1','l2'], 'solver':['liblinear'],
                        'random_state':[1]}]

param_grid_knn = [{'n_neighbors':list(range(1,31))}]

# Single empty grid: only the estimator's default settings are evaluated.
param_grid_nb = [{}]

param_grid_tree = [{'random_state':[1]},
                   {'criterion':['gini'], 'max_depth':list(range(10,100)),
                    'min_samples_split':[3,5],'random_state':[1]}]

param_grid_rf = [{'random_state':[1]},
                 {'n_estimators':[50,70,100,150],'max_features':[0.2, 0.3],
                  'max_depth':list(range(10,100)),'bootstrap':[True],'random_state':[1]}]

param_grid_adaboost = [{'random_state':[1]},
                       {'n_estimators':[50,70,100,150],'learning_rate':[0.1,1],
                        'random_state':[1]}]

# NOTE(review): range(0, 1) contains only 0, so a single alpha value (0) is
# searched -- confirm whether a wider range was intended.
param_grid_MLP = [{'hidden_layer_sizes':[100,200],'solver':['lbfgs', 'sgd', 'adam'],'random_state':[1],
                   'activation':['identity', 'logistic', 'tanh', 'relu'],
                   'learning_rate':['constant', 'invscaling', 'adaptive'],'alpha':list(range(0,1))}]



In [None]:
# Evaluate every configured classifier and collect one result dict per model.
res_list = []
classifiers = init_classifiers()
for model, model_name, params in classifiers:
    results = evaluate_model(X, y, model, model_name, params)
    res_list.append(results)

# One row per classifier, ordered alphabetically by classifier name.
df_model_comparison = (
    pd.DataFrame(res_list)
    .sort_values(['Classifier'])
    .reset_index(drop=True)
)
df_model_comparison