In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize as wt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
from nltk import pos_tag
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re

In [17]:
# Load the training and testing CSV files from the working directory.
raw_train, raw_test = (
    pd.read_csv(csv_path)
    for csv_path in ('twitter_training.csv', 'twitter_testing.csv')
)

# Extract the text and sentiment columns of the training frame as arrays.
doc_train, sent_train = (
    np.array(raw_train[column]) for column in ('text', 'airline_sentiment')
)

# Sentiment names; a name's position in this list is its numeric class id
# when predictions are translated back to strings.
sentiment_list = ['negative', 'neutral', 'positive']

## Cleaning Data

In [3]:
# Pair every tokenised training document with its sentiment: [tokens, sentiment].
train_doc = [
    [wt(text), sentiment] for text, sentiment in zip(doc_train, sent_train)
]

# Tokens to drop during cleaning: English stop words followed by each
# individual punctuation character.
stops = [*stopwords.words('english'), *string.punctuation]

In [7]:
# Single WordNet lemmatizer instance, created once and used by clean_review.
lemmatizer = WordNetLemmatizer()
def get_simple_pos(tag):
    """Map a POS tag string to the WordNet POS constant passed to the lemmatizer.

    Tags starting with 'J', 'V', 'N' or 'R' map to the WordNet adjective,
    verb, noun and adverb constants respectively; any other tag falls back
    to the noun constant.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, simple_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return simple_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def clean_review(words):
    """Turn one tokenised document into a list of lowercase lemmas.

    Tokens whose lowercase form is in the module-level ``stops`` list are
    dropped. Every remaining token is lemmatised using the POS tag it
    received in the context of the whole document.

    Parameters
    ----------
    words : list of str
        Tokens of a single document, in original order and casing.

    Returns
    -------
    list of str
        Lowercased lemmas of the tokens that survived stop-word filtering,
        in their original order.
    """
    words = list(words)
    if not words:
        # Nothing to tag or lemmatise.
        return []

    # Tag the full token sequence in a single call. Tagging tokens one at a
    # time (pos_tag([w])) hides the surrounding words from the tagger and
    # costs one tagger call per token instead of one per document.
    tagged_words = pos_tag(words)

    output_words = []
    for word, tag in tagged_words:
        lowered = word.lower()
        if lowered in stops:
            continue
        # Lemmatise the lowercased form so capitalised tokens (for example
        # sentence-initial words) are looked up the same way as their
        # lowercase counterparts.
        clean_word = lemmatizer.lemmatize(lowered, get_simple_pos(tag))
        output_words.append(clean_word.lower())
    return output_words

def label_y(data):
    """Encode a sentiment name as an integer class id.

    'negative' -> 0, 'neutral' -> 1, and every other value -> 2.
    """
    for class_id, name in enumerate(('negative', 'neutral')):
        if data == name:
            return class_id
    # Anything that is neither 'negative' nor 'neutral' is encoded as 2.
    return 2

In [5]:
# Clean every training document while keeping its sentiment alongside it.
lem_train = [(clean_review(tokens), sentiment) for tokens, sentiment in train_doc]

In [8]:
# Re-join the cleaned tokens into one string per document for the vectorizer.
x_train = [" ".join(tokens) for tokens, _ in lem_train]
# Sentiment names in document order, then their integer encoding.
train_lab = [sentiment for _, sentiment in lem_train]
y_train = list(map(label_y, train_lab))

## Preparing Test Data

In [9]:
# Tokenise and clean the test documents with the same steps used for training.
doc_test = np.array(raw_test['text'])
# Iterate over doc_test itself. Indexing doc_train here would tokenise the
# training documents instead of the test documents (and raise IndexError
# whenever the test set is longer than the training set).
test_doc = [wt(text) for text in doc_test]
lem_test = [clean_review(doc) for doc in test_doc]

In [10]:
# Re-join the cleaned test tokens into one string per document.
x_test = [' '.join(tokens) for tokens in lem_test]

In [11]:
# Use a TF-IDF vectorizer to convert the cleaned text into features for sklearn classifiers.
# The vocabulary and IDF weights are fitted on the training text only and
# then reused, unchanged, to transform the test text.

my_vec = TfidfVectorizer(max_features=2000, min_df = 0.001, max_df = 0.8)
x_train_features = my_vec.fit_transform(x_train)
x_test_features = my_vec.transform(x_test)

# Densify with toarray() (plain ndarray) rather than todense() (np.matrix):
# np.matrix is deprecated and recent scikit-learn releases refuse it as
# estimator input.
train_x = x_train_features.toarray()
test_x = x_test_features.toarray()

In [None]:
#clf1 = RandomForestClassifier(n_estimators=2000, n_jobs=-1)
#clf1.fit(x_train_features, y_train)
#bu = clf1.predict(x_test_features)

#np.savetxt('predictions.csv', bu, delimiter = ',', fmt = '%s')

In [20]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
clf2 = MultinomialNB()
clf2.fit(train_x, y_train)

# Predict integer class ids for the test features, then translate each id
# back to its sentiment name via its position in sentiment_list.
number_pred = list(clf2.predict(test_x))
label_pred = [sentiment_list[class_id] for class_id in number_pred]
#accuracy = accuracy_score(clf2.predict(train_x), y_train)

# Write the predicted sentiment names to disk, one per line.
np.savetxt('predicted.csv', label_pred, delimiter = ',', fmt = '%s')
