In [597]:
# importing the needed packages/modules/libraries

import numpy as np
import pandas as pd
import string
import random
import math
import nltk
from nltk import pos_tag, NaiveBayesClassifier, FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [598]:
# loading the tweets.csv file into a dataframe

df=pd.read_csv("tweets.csv")

In [599]:
# analyzing the head of the dataframe

df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [600]:
# getting the texts and the categories

texts=df["text"].values
categories=df["airline_sentiment"].values

In [601]:
# tokenizing each text

texts=[word_tokenize(text) for text in texts]

In [602]:
# making a combined documents array with each entry as a tuple of the text and its category

documents=[(texts[i], categories[i]) for i in range(len(texts))]

In [603]:
# getting the english stop words, punctuation marks, and numbers

stop_words=stopwords.words("english")

punctuation_marks=string.punctuation

numbers=np.arange(0, 10)

In [604]:
# converting the punctuation marks from string to an array 

punctuation_marks=[punctuation_mark for punctuation_mark in punctuation_marks]

In [605]:
# adding all the punctuation marks and numbers in the stop words

stop_words=np.concatenate((stop_words, punctuation_marks, numbers), axis=0)

In [606]:
# making a word net lemmatizer object

lemmatizer=WordNetLemmatizer()

In [607]:
# defining the names corresponding to the values needed by the pos argument of the lemmatize function

ADJ, ADV, VERB, NOUN=('a', 's', 'v', 'n')

In [608]:
# takes the pos tag and returns the value as needed by the pos argument of the lemmatize function

def get_simple_pos_tag(tag) :
    if(tag.startswith('J')) :
        return ADJ
    elif(tag.startswith('V')) :
        return VERB
    elif(tag.startswith('R')) :
        return ADV
    else :
        return NOUN

In [609]:
# returns a clean review of the words by removing the stop words and lemmatizing the remaining(to get to the corresponding root word)

def clean_words(words) :
    cleaned_words=[]

    pos_tag_values=pos_tag(words)

    for i in range(len(words)) :
        word=words[i]

        if(word.lower() not in stop_words) :
            number_present=False

            for char in word :
                if(char.isnumeric()) :
                    number_present=True

                    break

            if(number_present) :
                continue
            
            root_word=lemmatizer.lemmatize(word, pos=get_simple_pos_tag(pos_tag_values[i][1]))

            cleaned_words.append(root_word)

    return cleaned_words

In [610]:
# getting the cleaned data in the similar format like documents

cleaned_data=[]

for (words, category) in documents :
    cleaned_data.append((clean_words(words), category))

In [None]:
# doing a random shuffle on the cleaned data(although it is cleaned already)

random.shuffle(cleaned_data)

In [None]:
# defining the 75% mark for making the train and test split

total=len(cleaned_data)

limit_75=math.floor(total*0.75)

In [None]:
# splitting the data into train and test

train_data=cleaned_data[: limit_75]
test_data=cleaned_data[limit_75:]

In [None]:
# getting all the words in the training data

all_words=[]

for document in train_data :
    all_words+=document[0]

In [None]:
# findind the freq of each word and then choosing the top 2000 words with the highest frequencies

words_freqs=FreqDist(all_words)

top_words=words_freqs.most_common(2000)

In [None]:
# making the features from the top words

features=[word[0] for word in top_words]

In [None]:
# returns a feature dictionary corresponding to the words which tells whether a particular feature is present in the given set of words or not

def get_feature_dictionary(words) :
    feature_dict={}

    words_set=set(words) 

    for word in features :
        feature_dict[word]=(word in words_set)

    return feature_dict

In [None]:
# converting the train and test data into the format as required by the nltk classifiers

train_data=[(get_feature_dictionary(document), category) for (document, category) in train_data]
test_data=[(get_feature_dictionary(document), category) for (document, category) in test_data]

In [None]:
# making an nltk naive bayes classifier and training it

classifier=NaiveBayesClassifier.train(train_data)

In [None]:
# testing the classifier's on the test data

nltk.classify.accuracy(classifier, test_data)

0.7674863387978142

In [None]:
# modifiying the cleaned data to convert it to the format as required by the vectorizer

data_modified=np.array([(" ".join(document), category)  for (document, category) in cleaned_data])

In [None]:
# splitting the data into train and test(making sure to use the same limit, and the corresponding data will be the same as it has not been shuffled again)

x_train=data_modified[:, 0][: limit_75]
x_test=data_modified[:, 0][limit_75: ]

y_train=data_modified[:, 1][: limit_75]
y_test=data_modified[:, 1][limit_75: ]

In [None]:
# making an object of the tf-idf vectorizer(since we want to give consideration to the document frequency too) and we don't consider those words which occur in less than 5 documents and more than 15% of the documents

tfidf_vec=TfidfVectorizer(max_features=2000, min_df=5, max_df=0.15)

In [None]:
# fitting the training data into the vectorizer and getting the transformed training data

x_train_modified=tfidf_vec.fit_transform(x_train)

In [None]:
# transforming the testing data

x_test_modified=tfidf_vec.transform(x_test)

In [None]:
# making a linear svc classifier object

classifier=SVC(kernel="linear")

In [None]:
# fitting the classifier with the training data and testing it on the testing data

classifier.fit(x_train_modified, y_train)

classifier.score(x_test_modified, y_test)

0.7844262295081967