In [1]:
# Libraries
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string

In [2]:
lemmatizer = WordNetLemmatizer()

In [3]:
# Reading the document with pandas reader
train = pd.read_csv('twitter analysis.csv')

We basically need to analyse the sentiments, so we do not require the other content except the tweets and their sentiments.

In [4]:
required_train_data = train[["airline_sentiment", "text"]]

### Gathering the required data in tuple form

In [5]:
data_format = []
tweets = required_train_data['text']
sentiments = required_train_data['airline_sentiment']
for i in range(len(required_train_data)):
    data_format.append((word_tokenize(tweets[i]), sentiments[i]))

### Removing the usernames from the tweet
We need to remove the usernames from the tweets, as the Usernames must not be a factor of a comment, but the tweet content should be. So, Most of the usernames are present in the starting, so I removed the starting trailing usernames, there are very few usernames which are in between the tweets, so they will be automatically filtered away.

In [6]:
for j in data_format:
    i = 0
    while(i < len(j[0])):
        if(j[0][0] == "@"):
            j[0].pop(0)
            j[0].pop(0)
        else:
            break

In [7]:
# Copy of the cleaned data(Cleaned Data with usernames)
partial_clean = data_format.copy()

### Stop Words

In [8]:
stop_words = stopwords.words('English')
punctuations = list(string.punctuation)
stops = stop_words + punctuations

### Major Data Cleaning
Cleaning the data includes:
1. Removind the stop words
2. Lemmatize the data with WordNetLemmatizer

In [9]:
# Get POS as the argument ready for WordNetLemmatizer function attribute lemmatize
def get_simple_pos(tag):
    if(tag.startswith('J')):
        return wordnet.ADJ
    elif(tag.startswith('V')):
        return wordnet.VERB
    elif(tag.startswith('N')):
        return wordnet.NOUN
    elif(tag.startswith('R')):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
# Cleaning functions, that checks the top words, lemmatize, and check if the word is alphabet or not. We do not include the
# digits and special characters.

def clean(words):
    output = [] # Storing the output cleaned words in the list
    for w in words: # Iterating thorugh each word
        if(w.lower() not in stops and w.isalpha() and w.lower() != "aa"): # Lowercase the word to check in stopwords and including only alphabets
            pos = pos_tag([w]) # Part of Speech
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][0])) # Lemmatizing the words, WordNetLemmatizer
            output.append(clean_word.lower()) # Storing the output
    return output # Returning cleaned data

In [11]:
cleaned_data = [(clean(text),sentiment) for text, sentiment in partial_clean] # Passing the data to above clean function

Great now, we have got our data cleaned up and ready to train. Let's split our data into training and testing so that we can test our data on the different prediction algorithms

In [12]:
sentiments = [sentiment for text, sentiment in cleaned_data]

In [13]:
tweets = [" ".join(tweet) for tweet, sentiment in cleaned_data]

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(tweets, sentiments)

### Training our data
As of now we have cleaned our data, split that into two parts (training and testing)

In [16]:
# Count vectorizer library
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
countVector = CountVectorizer(max_features = 2000, max_df = 0.8, ngram_range = (1, 2))
X_train = countVector.fit_transform(x_train)
X_test = countVector.transform(x_test)

In [22]:
# Using Random Forests Classifier
from sklearn.svm import SVC

In [29]:
clf = SVC(C = 3)
clf.fit(X_train, y_train)


SVC(C=3, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [30]:
clf.score(X_test, y_test)

0.7712204007285974

In [21]:
# import numpy as np
# np.savetxt('outputs.csv', result, fmt='%s')