# Set Up

导入需要的模块

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import WordPunctTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import stats

读取训练集数据和测试集数据并定义分词器

In [2]:
TRAIN_FILE = '../data/processed_train.csv'
TEST_FILE = '../data/processed_test.csv'

train_data = pd.read_csv(TRAIN_FILE)
test_data = pd.read_csv(TEST_FILE)
word_tokenizer = WordPunctTokenizer()

def bigram_tokenize(text):
    bigrams = []
    unigrams = word_tokenizer.tokenize(text)
    for i in range(len(unigrams) - 1):
        bigrams.append('-'.join([unigrams[i], unigrams[i+1]]))
    return bigrams

简单查看分词情况

In [3]:
# count totally how many unique word in the whole train dataset and test dataset
print('TRAIN')
stats.displayStatsInfo(TRAIN_FILE)

print('\nTEST')
stats.displayStatsInfo(TEST_FILE)

TRAIN

Analysis Statistics
Tweets => Total: 10248, Positive: 1704, Neutral: 2213, Negative: 6331
User Mentions => Total: 11620, Avg: 1.1339, Max: 6
URLs => Total: 865, Avg: 0.0844, Max: 3
Emojis => Total: 262, Positive: 181, Negative: 81, Avg: 0.0256, Max: 3
Unigrams => Total: 10248, Unique: 10626, Avg: 20.7293, Max: 47, Min: 2
Bigrams => Total: 202186, Unique: 75383, Avg: 19.7293

TEST

Analysis Statistics
Tweets => Total: 4392, Positive: 659, Neutral: 886, Negative: 2847
User Mentions => Total: 4885, Avg: 1.1122, Max: 5
URLs => Total: 346, Avg: 0.0788, Max: 2
Emojis => Total: 89, Positive: 68, Negative: 21, Avg: 0.0203, Max: 2
Unigrams => Total: 4392, Unique: 7011, Avg: 21.1473, Max: 46, Min: 2
Bigrams => Total: 88487, Unique: 42577, Avg: 20.1473


定义训练集和测试集的标签/目标

In [4]:
train_targets = train_data.sentiment
test_targets = test_data.sentiment

# Naive Bayes

文本特征提取：词频向量化 + TF-IDF 处理

In [5]:
countVzer = CountVectorizer(min_df=7, max_df=0.5, tokenizer=word_tokenizer.tokenize)
train_counts = countVzer.fit_transform(train_data.text)
test_counts = countVzer.transform(test_data.text)
tfidfTfmer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
train_tfidfs = tfidfTfmer.fit_transform(train_counts)
test_tfidfs = tfidfTfmer.fit_transform(test_counts)
print('feature dimension: {}'.format(train_counts.shape[1]))

feature dimension: 1974


In [6]:
clf_NB = MultinomialNB(alpha=1.0e-10)
clf_NB.fit(train_counts, train_targets)
test_predicts = clf_NB.predict(test_counts)
print('Navie Bayes without Laplace Smoothing')
print(classification_report(test_targets, test_predicts))
print('Accuracy: {}'.format(accuracy_score(test_targets, test_predicts)))

Navie Bayes without Laplace Smoothing
              precision    recall  f1-score   support

    negative       0.82      0.87      0.84      2847
     neutral       0.54      0.49      0.51       886
    positive       0.69      0.62      0.65       659

   micro avg       0.75      0.75      0.75      4392
   macro avg       0.68      0.66      0.67      4392
weighted avg       0.75      0.75      0.75      4392

Accuracy: 0.7534153005464481


In [7]:
clf_NB = MultinomialNB(alpha=1.0)
clf_NB.fit(train_counts, train_targets)
test_predicts = clf_NB.predict(test_counts)
print('Navie Bayes with Laplace Smoothing')
print(classification_report(test_targets, test_predicts))
print('Accuracy: {}'.format(accuracy_score(test_targets, test_predicts)))

Navie Bayes with Laplace Smoothing
              precision    recall  f1-score   support

    negative       0.84      0.85      0.85      2847
     neutral       0.56      0.51      0.53       886
    positive       0.66      0.70      0.68       659

   micro avg       0.76      0.76      0.76      4392
   macro avg       0.69      0.69      0.69      4392
weighted avg       0.76      0.76      0.76      4392

Accuracy: 0.7607012750455373


In [8]:
clf_NB = MultinomialNB(alpha=1.0e-10)
clf_NB.fit(train_tfidfs, train_targets)
test_predicts = clf_NB.predict(test_tfidfs)
print('Navie Bayes without Laplace Smoothing (apply TF-IDF)')
print(classification_report(test_targets, test_predicts))
print('Accuracy: {}'.format(accuracy_score(test_targets, test_predicts)))

Navie Bayes without Laplace Smoothing (apply TF-IDF)
              precision    recall  f1-score   support

    negative       0.76      0.96      0.85      2847
     neutral       0.64      0.28      0.39       886
    positive       0.81      0.47      0.60       659

   micro avg       0.75      0.75      0.75      4392
   macro avg       0.74      0.57      0.61      4392
weighted avg       0.74      0.75      0.72      4392

Accuracy: 0.7513661202185792


In [9]:
clf_NB = MultinomialNB(alpha=1.0)
clf_NB.fit(train_tfidfs, train_targets)
test_predicts = clf_NB.predict(test_tfidfs)
print('Navie Bayes with Laplace Smoothing (apply TF-IDF)')
print(classification_report(test_targets, test_predicts))
print('Accuracy: {}'.format(accuracy_score(test_targets, test_predicts)))

Navie Bayes with Laplace Smoothing (apply TF-IDF)
              precision    recall  f1-score   support

    negative       0.74      0.98      0.84      2847
     neutral       0.72      0.21      0.33       886
    positive       0.84      0.44      0.58       659

   micro avg       0.74      0.74      0.74      4392
   macro avg       0.77      0.54      0.58      4392
weighted avg       0.75      0.74      0.70      4392

Accuracy: 0.7443078324225865


# SVM

In [10]:
countVzer = CountVectorizer(tokenizer=word_tokenizer.tokenize)
train_counts = countVzer.fit_transform(train_data.text)
test_counts = countVzer.transform(test_data.text)
tfidfTfmer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
train_tfidfs = tfidfTfmer.fit_transform(train_counts)
test_tfidfs = tfidfTfmer.fit_transform(test_counts)
print('feature dimension: {}'.format(train_counts.shape[1]))

feature dimension: 10626


In [11]:
clf_SVM = svm.LinearSVC(C=0.05)
clf_SVM.fit(train_counts, train_targets)
test_predicts = clf_SVM.predict(test_counts)
print('SVM with linear kernel')
print(classification_report(test_targets, test_predicts))
print('Accuracy: {}'.format(accuracy_score(test_targets, test_predicts)))

SVM with linear kernel
              precision    recall  f1-score   support

    negative       0.85      0.90      0.87      2847
     neutral       0.62      0.57      0.60       886
    positive       0.77      0.66      0.71       659

   micro avg       0.80      0.80      0.80      4392
   macro avg       0.75      0.71      0.73      4392
weighted avg       0.79      0.80      0.79      4392

Accuracy: 0.7971311475409836


In [12]:
clf_SVM = svm.LinearSVC(C=0.3)
clf_SVM.fit(train_tfidfs, train_targets)
test_predicts = clf_SVM.predict(test_tfidfs)
print('SVM with linear kernel (apply TF-IDF)')
print(classification_report(test_targets, test_predicts))
print('Accuracy: {}'.format(accuracy_score(test_targets, test_predicts)))

SVM with linear kernel (apply TF-IDF)
              precision    recall  f1-score   support

    negative       0.84      0.93      0.88      2847
     neutral       0.66      0.50      0.57       886
    positive       0.79      0.66      0.72       659

   micro avg       0.80      0.80      0.80      4392
   macro avg       0.76      0.70      0.72      4392
weighted avg       0.79      0.80      0.79      4392

Accuracy: 0.802367941712204


# KNN

In [13]:
countVzer = CountVectorizer(min_df=0, max_df=0.9, tokenizer=word_tokenizer.tokenize)
train_counts = countVzer.fit_transform(train_data.text)
test_counts = countVzer.transform(test_data.text)
tfidfTfmer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
train_tfidfs = tfidfTfmer.fit_transform(train_counts)
test_tfidfs = tfidfTfmer.fit_transform(test_counts)
print('feature dimension: {}'.format(train_counts.shape[1]))

feature dimension: 10625


In [14]:
from sklearn import neighbors

clf_KNN = neighbors.KNeighborsClassifier(n_neighbors=5)
clf_KNN.fit(train_counts, train_targets)
test_predicts = clf_KNN.predict(test_counts)
print('KNN-5')
print(classification_report(test_targets, test_predicts))
print('Accuracy: {}'.format(accuracy_score(test_targets, test_predicts)))

KNN-5
              precision    recall  f1-score   support

    negative       0.83      0.44      0.57      2847
     neutral       0.30      0.63      0.41       886
    positive       0.40      0.63      0.49       659

   micro avg       0.50      0.50      0.50      4392
   macro avg       0.51      0.56      0.49      4392
weighted avg       0.66      0.50      0.53      4392

Accuracy: 0.5034153005464481


In [15]:
clf_KNN = neighbors.KNeighborsClassifier(n_neighbors=5)
clf_KNN.fit(train_tfidfs, train_targets)
test_predicts = clf_KNN.predict(test_tfidfs)
print('KNN-5')
print(classification_report(test_targets, test_predicts))
print('Accuracy: {}'.format(accuracy_score(test_targets, test_predicts)))

KNN-5
              precision    recall  f1-score   support

    negative       0.79      0.83      0.81      2847
     neutral       0.45      0.38      0.41       886
    positive       0.60      0.61      0.60       659

   micro avg       0.70      0.70      0.70      4392
   macro avg       0.61      0.60      0.61      4392
weighted avg       0.69      0.70      0.70      4392

Accuracy: 0.7040072859744991
