In [1]:
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string as s
import numpy as np
import pandas as pd

In [2]:
# loading training and testing data
train = pd.read_csv("twitter_x_y_train.csv")
test = pd.read_csv("twitter_x_test.csv")

In [3]:
# printing first five training data
train.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
train.describe(include='all')

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
count,10980.0,10980,10980,31,10980,24,10980.0,10980,776,10980,7430,7403
unique,,3,6,3,6438,11,,10851,632,10758,2658,78
top,,negative,United,negative,JetBlueNews,Customer Service Issue,,@united thanks,"[0.0, 0.0]",2015-02-24 11:43:05 -0800,"New York, NY",Eastern Time (US & Canada)
freq,,6851,2928,24,43,9,,6,131,3,125,2819
mean,5.692169e+17,,,,,,0.080965,,,,,
std,779543800000000.0,,,,,,0.740303,,,,,
min,5.675883e+17,,,,,,0.0,,,,,
25%,5.685584e+17,,,,,,0.0,,,,,
50%,5.694753e+17,,,,,,0.0,,,,,
75%,5.698902e+17,,,,,,0.0,,,,,


# For applying NLTK, we need only 'text' attibute as x-train dataset for training with 'airline_sentiment' attribute as y-train. Also 'text' attribute as x-test

In [5]:
x_train = train.pop("text")
y_train = train.pop("airline_sentiment")
x_test = test.pop("text")

In [6]:
# stopwords
stop = stopwords.words('english') + list(s.punctuation)

In [7]:
# lemmatization
lemmatizer = WordNetLemmatizer()

In [8]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('V'):
        return wordnet.VERB
    else:
        return wordnet.NOUN

In [9]:
def clean_words(sentence):
    output = []
    for w in word_tokenize(sentence):
        if w.lower() not in stop:
            pos_t = pos_tag(w)[0][1]
            clean_w = lemmatizer.lemmatize(w, pos=get_simple_pos(pos_t))
            output.append(clean_w.lower())
    return output

In [10]:
x_train = [clean_words(sent) for sent in x_train]
x_test = [clean_words(sent) for sent in x_test] 

In [11]:
x_train = [" ".join(w) for w in x_train]
x_test = [" ".join(w) for w in x_test]

In [12]:
# using Tfidf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 2000)
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [17]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train_vec, y_train)
y_pred = clf.predict(x_test_vec)
print(clf.score(x_train_vec, y_train))

0.7819672131147541


In [18]:
np.savetxt("Predictions_twitter_sentiments.csv", y_pred, fmt="%s")