## Importing the Required Libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk import pos_tag
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
import nltk
# Fetch the NLTK data packages this notebook relies on for tokenizing,
# stop-word filtering, POS tagging and WordNet lemmatization.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shubhamsharma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shubhamsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shubhamsharma/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shubhamsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/shubhamsharma/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Preparing Training Data

### Importing Training Data

In [2]:
# Load the labelled training tweets; the path is relative to the notebook's working directory.
df_train = pd.read_csv('training_twitter_x_y_train.csv')

In [3]:
# Preview the first two rows to see which columns are available.
df_train.head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)


In [4]:
# Keep only the tweet text (model input) and its sentiment label (target).
df_train = df_train[['text', 'airline_sentiment']]

In [5]:
# Convert the two-column frame to a NumPy array: column 0 holds the tweet
# text and column 1 the sentiment label. The last line spot-checks one label.
training_data = df_train.to_numpy()
training_data[2, 1]

'positive'

### Splitting the Tweet text into words using NLTK

In [6]:
# Pair each tweet's token list with its sentiment label.
tweets_train = [
    [word_tokenize(text), sentiment]
    for text, sentiment in training_data
]

### Cleaning the Words using WordNetLemmatizer available in NLTK

In [7]:
# Tokens to discard: NLTK's English stop words plus every character in
# string.punctuation.
punctuations = list(string.punctuation)
stops = set(stopwords.words('english')).union(punctuations)

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    """Map a POS tag string to one of the WordNet POS constants.

    Tags beginning with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [9]:
lemmatizer = WordNetLemmatizer()
def clean_tweets(words):
    """Filter and lemmatize the tokens of a single tweet.

    Parameters
    ----------
    words : iterable of str
        Tokens of one tweet, in their original order and casing.

    Returns
    -------
    list of str
        Lower-cased lemmas of the tokens that are purely alphabetic and
        whose lower-cased form is not in ``stops``, in input order.
    """
    output_words = []
    # Tag the whole token sequence in one call: the tagger can then use the
    # neighbouring tokens as context, and it runs once per tweet instead of
    # once per word.
    for w, tag in pos_tag(list(words)):
        if not w.isalpha():
            continue
        lowered = w.lower()
        if lowered in stops:
            continue
        # Lemmatize the lower-cased form: WordNet entries are lower-case, so
        # a capitalised token such as "Flights" would otherwise come back
        # unchanged instead of being reduced to "flight".
        clean_word = lemmatizer.lemmatize(lowered, pos=get_simple_pos(tag))
        output_words.append(clean_word.lower())
    return output_words

In [10]:
# Swap each tweet's raw tokens for their cleaned form, keeping the label
# paired with it.
tweets_train = [
    (clean_tweets(tokens), sentiment)
    for tokens, sentiment in tweets_train
]

In [11]:
# Re-join the cleaned tokens into one string per tweet and collect the
# labels in the same order.
tweets = [" ".join(tokens) for tokens, _ in tweets_train]
y_train = [sentiment for _, sentiment in tweets_train]

### Using Count Vectorizer to get the X Train

In [12]:
# Build the vectorizer on the cleaned training tweets, with the vocabulary
# capped at 2000 features, and produce the training feature matrix.
count_vec = CountVectorizer(max_features=2000) # n-grams were tried but lowered the accuracy
x_train_features = count_vec.fit_transform(tweets)

## Preparing Testing Data

In [13]:
# Load the test tweets; the path is relative to the notebook's working directory.
df_test = pd.read_csv('test_twitter_x_test.csv')

In [14]:
# Only the tweet text column is used from the test file.
testing_data = np.array(df_test['text'])

In [15]:
# Run the test tweets through the same tokenize -> clean -> join steps that
# were applied to the training tweets.
tweets_test = [
    " ".join(clean_tweets(word_tokenize(raw_tweet)))
    for raw_tweet in testing_data
]

In [16]:
# Use transform (not fit_transform) so the test matrix reuses the vocabulary
# fitted on the training tweets.
x_test_features = count_vec.transform(tweets_test)

## Performing Classification

### Support Vector Machine

In [17]:
# Fit a support vector classifier, with default constructor arguments, on
# the training count features.
svc = SVC()
svc.fit(x_train_features, y_train)

SVC()

In [18]:
# Predict a sentiment label for every test tweet with the fitted SVM.
y_pred_svm = svc.predict(x_test_features)

In [19]:
# Write the SVM predictions to CSV with neither index nor header row.
df = pd.DataFrame(y_pred_svm)
df.to_csv('Predictions_SVM.csv', index = False, header = False)

### Random Forest

In [20]:
# A random forest draws random bootstrap samples and feature subsets, so fix
# the seed; otherwise the saved predictions can differ on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train_features, y_train)

RandomForestClassifier()

In [21]:
# Predict a sentiment label for every test tweet with the fitted random forest.
y_pred_rf = rf.predict(x_test_features)

In [22]:
# Write the random forest predictions to CSV with neither index nor header row.
df = pd.DataFrame(y_pred_rf)
df.to_csv('Predictions_RF.csv', index = False, header = False)

### Multinomial Naive Bayes

In [23]:
# Fit a multinomial naive Bayes classifier on the training count features;
# alpha (the smoothing parameter) is set explicitly to 1.
mnv = MultinomialNB(alpha = 1)
mnv.fit(x_train_features, y_train)

MultinomialNB(alpha=1)

In [24]:
# Predict a sentiment label for every test tweet with the fitted naive Bayes model.
y_pred_mnv = mnv.predict(x_test_features)

In [25]:
# Write the naive Bayes predictions to CSV with neither index nor header row.
df = pd.DataFrame(y_pred_mnv)
df.to_csv('Predictions_MNB.csv', index = False, header = False)

### Decision Tree

In [26]:
# Fix the seed so the fitted tree, and therefore the saved predictions, are
# the same on every run; scikit-learn's tree builder uses randomness when
# choosing between candidate splits.
dt = tree.DecisionTreeClassifier(random_state=42)
dt.fit(x_train_features, y_train)

DecisionTreeClassifier()

In [27]:
# Predict a sentiment label for every test tweet with the fitted decision tree.
y_pred_dt = dt.predict(x_test_features)

In [28]:
# Write the decision tree predictions to CSV with neither index nor header row.
df = pd.DataFrame(y_pred_dt)
df.to_csv('Predictions_DT.csv', index = False, header = False)

The prediction outputs of all the above classifiers were tested on the Coding Ninjas website. Multinomial Naive Bayes performed the best among these classifiers.