In [55]:
import numpy as np
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import WordNetLemmatizer
from nltk import pos_tag
import nltk

In [49]:
training_data = pd.read_csv("train.csv")
testing_data = pd.read_csv("test.csv")
x_train = list(training_data['text'])
x_test = list(testing_data['text'])
y_train = list(training_data['airline_sentiment'])

In [50]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
lemmatizer = WordNetLemmatizer()

In [57]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_doc(doc):
    words = word_tokenize(doc)
    cleaned_words = []
    for word in words:
        if word.lower() not in stops and len(word)>2:
            pos = pos_tag([word])
            clean_word = lemmatizer.lemmatize(word, pos = get_simple_pos(pos[0][1]))
            cleaned_words.append(clean_word)
    return cleaned_words

In [58]:
train_docs = [clean_doc(doc) for doc in x_train]
test_docs = [clean_doc(doc) for doc in x_test]

In [59]:
all_words = []
for doc in train_docs:
    for word in doc:
        if(word.lower() not in stops):
            all_words.append(word)

dic = nltk.FreqDist(all_words)
common = dic.most_common(5000)
features = [i[0] for i in common]
features

['united',
 'flight',
 'AmericanAir',
 'USAirways',
 'SouthwestAir',
 'JetBlue',
 "n't",
 'get',
 'http',
 'hour',
 'Cancelled',
 '...',
 'service',
 'time',
 'customer',
 'help',
 'call',
 'bag',
 'Flight',
 'wait',
 'plane',
 'amp',
 'hold',
 'fly',
 'make',
 'need',
 'would',
 'Thanks',
 'one',
 'thanks',
 'day',
 'back',
 'try',
 'gate',
 'Flightled',
 'take',
 'airline',
 'say',
 'still',
 'VirginAmerica',
 'delayed',
 'seat',
 "'ve",
 'delay',
 'change',
 'like',
 'Late',
 'phone',
 'agent',
 'bad',
 'go',
 'guy',
 'today',
 'know',
 'ticket',
 'miss',
 'work',
 'book',
 'minute',
 'please',
 'give',
 'United',
 'airport',
 'Thank',
 'way',
 'could',
 'min',
 'lose',
 'check',
 "'re",
 'even',
 'home',
 'want',
 'travel',
 'tomorrow',
 'well',
 'thank',
 'weather',
 'see',
 'use',
 'last',
 'people',
 'email',
 'issue',
 'great',
 'told',
 "'ll",
 'sit',
 'never',
 'look',
 'due',
 'another',
 'luggage',
 'really',
 'number',
 'good',
 'trip',
 'Flighted',
 'hr',
 'let',
 'much',

In [62]:
def create_dict(doc):
    freq = nltk.FreqDist(doc)
    curr_features = {}
    for word in features:
        if word in freq:
            curr_features[word] = freq[word]
        else:
            curr_features[word] = 0
    return curr_features

In [64]:
training = [(create_dict(train_docs[i]),y_train[i]) for i in range(len(train_docs))]

In [68]:
testing = [create_dict(test_docs[i]) for i in range(len(test_docs))]

In [65]:
from nltk import NaiveBayesClassifier
clf = NaiveBayesClassifier.train(training)

In [70]:
y_pred = clf.classify_many(testing)
print(y_pred)

['negative', 'negative', 'negative', 'neutral', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'neutral', 'negative', 'negative', 'negative', 'positive', 'neutral', 'negative', 'negative', 'positive', 'neutral', 'negative', 'negative', 'negative', 'neutral', 'positive', 'negative', 'neutral', 'negative', 'neutral', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'neutral', 'negative', 'neutral', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'positive', 'negative', 'neutral', 'negative', 'negative', 'negative', 'neutral', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'neutral', 'negative', 'negative', 'negative', 'negative', 'ne

In [71]:
pd.DataFrame(y_pred).to_csv("submission file.csv")