In [30]:
import numpy as np
import pandas as pd
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
import time
from sklearn.svm import SVC
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv("training_twitter_x_y_train.csv")
df_t = pd.read_csv("test_twitter_x_test.csv")
len(df['tweet_id']),len(df_t['tweet_id'])

(10980, 3660)

In [4]:
categories = list(set(df['airline_sentiment']))
categories

['positive', 'neutral', 'negative']

In [5]:
df['text'][0]

'@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled'

In [6]:
len(df[df['airline_sentiment']=='negative']['text'])

6851

In [7]:
documents = []
x_test_doc =[]
for category in categories:
    for text in df[df['airline_sentiment']==category]['text'] :
        documents.append((word_tokenize(text),category))
for i in df_t['text']:
    x_test_doc.append(word_tokenize(i))
print(documents[0:3])

[(['@', 'SouthwestAir', 'seeing', 'your', 'workers', 'time', 'in', 'and', 'time', 'out', 'going', 'above', 'and', 'beyond', 'is', 'why', 'I', 'love', 'flying', 'with', 'you', 'guys', '.', 'Thank', 'you', '!'], 'positive'), (['@', 'united', 'Flew', 'ORD', 'to', 'Miami', 'and', 'back', 'and', 'had', 'great', 'crew', ',', 'service', 'on', 'both', 'legs', '.', 'THANKS'], 'positive'), (['@', 'JetBlue', 'That', "'d", 'be', 'nice', '!', 'Hoping', 'to', 'rack', 'up', 'enough', 'miles', 'to', 'take', 'a', 'trip', 'to', 'Seattle', 'and', 'enjoy', 'a', 'perfect', 'latte', 'in', 'the', 'city', 'of', 'coffee', '.'], 'positive')]


In [8]:
print(x_test_doc[0:5])

[['@', 'AmericanAir', 'In', 'car', 'gng', 'to', 'DFW', '.', 'Pulled', 'over', '1hr', 'ago', '-', 'very', 'icy', 'roads', '.', 'On-hold', 'with', 'AA', 'since', '1hr', '.', 'Ca', "n't", 'reach', 'arpt', 'for', 'AA2450', '.', 'Wat', '2', 'do', '?'], ['@', 'AmericanAir', 'after', 'all', ',', 'the', 'plane', 'didn', '’', 't', 'land', 'in', 'identical', 'or', 'worse', ')', 'conditions', 'at', 'GRK', 'according', 'to', 'METARs', '.'], ['@', 'SouthwestAir', 'ca', "n't", 'believe', 'how', 'many', 'paying', 'customers', 'you', 'left', 'high', 'and', 'dry', 'with', 'no', 'reason', 'for', 'flight', 'Cancelled', 'Flightlations', 'Monday', 'out', 'of', 'BDL', '!', 'Wow', '.'], ['@', 'USAirways', 'I', 'can', 'legitimately', 'say', 'that', 'I', 'would', 'have', 'rather', 'driven', 'cross', 'country', 'than', 'flown', 'on', 'US', 'Airways', '.'], ['@', 'AmericanAir', 'still', 'no', 'response', 'from', 'AA', '.', 'great', 'job', 'guys', '!']]


In [9]:
random.shuffle(documents)
random.shuffle(x_test_doc)
print(documents[0:3])

[(['@', 'USAirways', 'after', 'sitting', 'on', 'the', 'runway', 'for', '3', 'hours', 'i', 'had', 'to', 'leave', 'the', 'airport', 'w/o', 'my', 'luggage', '.', 'Is', 'this', 'ur', '1st', 'day', '?', '?'], 'negative'), (['@', 'JetBlue', '...', 'second', 'incident', 'of', 'lost', 'baggage', '.', 'I', 'sent', 'you', 'a', 'DM', '.', 'Thoughts', '?'], 'negative'), (['@', 'SouthwestAir', 'like', 'kelsey', 'said', ',', 'really', 'bad', 'spot', 'for', 'locals', '.', 'Do', 'you', 'know', 'how', 'long', 'it', 'takes', 'to', 'find', 'parking', 'then', 'walking', 'over', 'to', 'the', 'bellagio', '?'], 'negative')]


In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [12]:
stops = stopwords.words('english') + list(string.punctuation)

In [13]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])                                 
            clean_word = lemmatizer.lemmatize(w, get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [14]:
start = time.time()
documents = [(clean_review(document), category) for document, category in documents]
end = time.time()
print("Cleaning time: ", end - start)

Cleaning time:  62.002241134643555


In [15]:
x_test_doc = [clean_review(i) for i in x_test_doc]
x_test_doc[0]

['united',
 'ua1740',
 'inbound',
 'flight',
 'dep',
 'time',
 'get',
 'closer',
 'push',
 'back',
 '15',
 'min',
 "'ve",
 '2',
 'hr']

In [16]:
documents[0]

(['usairways',
  'sit',
  'runway',
  '3',
  'hour',
  'leave',
  'airport',
  'w/o',
  'luggage',
  'ur',
  '1st',
  'day'],
 'negative')

In [17]:
x_train = [" ".join(document) for document, category in documents]

In [18]:
x_test = [" ".join(document) for document in x_test_doc]

In [19]:
x_test

["united ua1740 inbound flight dep time get closer push back 15 min 've 2 hr",
 'usairways nanceebing 4 hour hold time moment ... counting disgrace',
 'southwestair 3854 atl rdu snow forecast raleigh even',
 'americanair order always 9hour flight delayed 4 hour bad ever forgot meal',
 'united loyal premier platinum member lack communication really bad also ’ see take yet',
 'usairways already four time',
 'southwestair cancelled flight flight bna today needtogethome',
 "americanair cathaypacific cathaypacificus ca n't even tell country 😥😥",
 "southwestair first time fly 've definitely",
 'united thanks',
 'americanair hi phone number ring uk regard claim late flight flight departure claim amount agree',
 "southwestair 've hold 2 hour reschedule cancelled flightled flight morning give need help",
 "jetblue flight1407 's long day arrived london 11 morning",
 'southwestair carol thrower passenger fort lauderdale airport',
 'usairways horrible cust service exp dia today wife wait 3 hr plan

In [20]:
y_train = [category for document,category in documents]
y_train

['negative',
 'negative',
 'negative',
 'neutral',
 'positive',
 'positive',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'positive',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'neutral',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'negative',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'negative',
 'negative',
 'neutral',
 'positive',
 'positive',
 'neutral',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'positive',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'nega

In [21]:
count_vec = CountVectorizer(max_features = 2000)
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
count_vec.get_feature_names()



['00',
 '000',
 '03',
 '05',
 '10',
 '100',
 '1000',
 '10pm',
 '11',
 '12',
 '13',
 '130',
 '14',
 '140',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1hr',
 '1k',
 '1st',
 '20',
 '200',
 '2014',
 '2015',
 '21',
 '22',
 '23',
 '24',
 '24hrs',
 '24th',
 '25',
 '26',
 '27',
 '28',
 '2day',
 '2days',
 '2hrs',
 '2nd',
 '2x',
 '30',
 '300',
 '3056',
 '30am',
 '30min',
 '30pm',
 '32',
 '35',
 '36',
 '39',
 '3am',
 '3hrs',
 '3rd',
 '40',
 '400',
 '42',
 '45',
 '47',
 '48',
 '4pm',
 '4th',
 '50',
 '500',
 '50pm',
 '51',
 '55',
 '58',
 '59',
 '5hrs',
 '5th',
 '60',
 '600',
 '6am',
 '70',
 '700',
 '719',
 '728',
 '737',
 '75',
 '777',
 '787',
 '7am',
 '80',
 '800',
 '8am',
 '8pm',
 '90',
 '99',
 'a320',
 'aa',
 'aadvantage',
 'abc',
 'ability',
 'able',
 'absolute',
 'absolutely',
 'absurd',
 'accept',
 'acceptable',
 'accepted',
 'access',
 'accommodate',
 'accommodation',
 'accord',
 'account',
 'acct',
 'accurate',
 'across',
 'act',
 'action',
 'actual',
 'actually',
 'ad',
 'add',
 'additio

In [23]:
x_test_features = count_vec.transform(x_test)
x_test_features

<3660x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 31657 stored elements in Compressed Sparse Row format>

In [75]:
svc = SVC(kernel='rbf',C=0.05,gamma=0.0001)

In [76]:
svc.fit(x_train_features,y_train)

SVC(C=0.05, gamma=0.0001)

In [77]:
y_pred = svc.predict(x_test_features)

In [78]:
y_pred

array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype='<U8')

In [79]:
np.savetxt('new8_svm_twitter_answers.csv',y_pred, delimiter=',',fmt='%s')

In [123]:
gnb = naive_bayes.GaussianNB() # GAUSSIAN NAIVE BAYES CLASSIFIER
gnb.fit(x_train_features.toarray(), y_train)

GaussianNB()

In [125]:
y_pred_naive = gnb.predict(x_test_features.toarray())

In [126]:
np.savetxt('naive_final_twitter_answers.csv',y_pred_naive, delimiter=',',fmt='%s')

In [128]:
lr = LogisticRegression()
lr.fit(x_train_features,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [129]:
y_pred_log = lr.predict(x_test_features)

In [130]:
np.savetxt('log_final_twitter_answers.csv',y_pred_log, delimiter=',',fmt='%s')

In [131]:
rf = RandomForestClassifier()
rf.fit(x_train_features,y_train)

RandomForestClassifier()

In [1]:
y_pred_forest = rf.predict(x_test_features)

NameError: name 'rf' is not defined

In [133]:
np.savetxt('forest_final_twitter_answers.csv',y_pred_forest, delimiter=',',fmt='%s')