In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
import re
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.neural_network import MLPClassifier as cnn

In [2]:
data_path1 = '../dataset/Tweets-airline-sentiment.csv'
#data_path2 = '../dataset/labeledTrainData_head12000.tsv'

In [3]:
data = pd.read_csv(data_path1)

In [4]:
#data.head()

In [5]:
text = data['text']

In [6]:
label = data['airline_sentiment']

In [7]:
label_tags = label.unique()
print(label_tags)

['neutral' 'positive' 'negative']


In [8]:
#replace text label with one-hot-labels
new_label = []

In [9]:
for l in label:
    if l == label_tags[0]:
        new_label.append(0)
    elif l == label_tags[1]:
        new_label.append(1)
    else:
        new_label.append(2)

In [10]:
new_label = np.asarray(new_label)

In [12]:
#get rid of '@airline_company_name
new_text = []

In [13]:
for t in text:
    new_text.append(re.sub('^@\w+ *','', t))

In [14]:
new_text = np.asarray(new_text)

In [15]:
new_text.shape, new_label.shape

((14640,), (14640,))

# model

In [26]:
NB = MultinomialNB()
pc = Perceptron()
svm = LinearSVC()
lr = LogisticRegression()
random_forest  = rf()
KNN = knn(n_neighbors=3)
CNN = cnn()

# Unigram

In [17]:
UniVec = CountVectorizer(max_features = 500, ngram_range = (1,1))
uni = UniVec.fit_transform(new_text)

In [18]:
skf = StratifiedKFold(n_splits=5)


In [24]:
for clf in [NB, pc, svm, lr]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.7301962457671689




0.7111350376695293
0.7618886777939342
0.7667370754555864


In [27]:
for clf in [KNN, CNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.49044845012281224
0.7411243931760308
0.7334056494059733


# Bigram

In [28]:
BiVec = CountVectorizer(max_features = 500, ngram_range = (2,2))
Bi = BiVec.fit_transform(new_text)
skf = StratifiedKFold(n_splits=5)

In [29]:
for clf in [NB, pc, svm, lr, KNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.7301962457671689
0.7111350376695293




0.7618886777939342
0.7667370754555864
0.49044845012281224
0.7298525695649101


In [30]:
for clf in [CNN]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.743923727176688


# Uni&Bigram

In [31]:
MixVec = CountVectorizer(max_features = 500, ngram_range = (1,2))
Mix = BiVec.fit_transform(new_text)
skf = StratifiedKFold(n_splits=5)

In [32]:
for clf in [NB, pc, svm, lr, KNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.7301962457671689
0.7111350376695293




0.7618886777939342
0.7667370754555864
0.49044845012281224
0.7326511330773402


In [33]:
for clf in [CNN]:
    acc = []
    for train_index, test_index in skf.split(uni, new_label):
        x_train,x_test = uni[train_index], uni[test_index]
        y_train, y_test = new_label[train_index], new_label[test_index]
        clf.fit(x_train, y_train)
        acc.append(clf.score(x_test, y_test))
    acc = np.asarray(acc)
    print(acc.mean())

0.7448794744777792
