In [1]:
import numpy as np
import pandas as pd

In [2]:
trainingData = pd.read_csv('training_twitter_x_y_train (1).csv')
testingData = pd.read_csv('test_twitter_x_test.csv')

In [3]:
trainingData.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


* The important of all the features of the data would be `airline_sentiment` and `text`

In [4]:
len(trainingData),len(testingData)

(10980, 3660)

#### Modification-1 
- As the test data is free of lables we keep a part of training data aside for testing

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
trainingDataX,trainTesterX,testingDataY,trainTesterY = train_test_split(trainingData['text'],trainingData['airline_sentiment'],
                                                                        random_state = 42,test_size = 0.1)

` Looking into the data of text and sentiments `

In [7]:
trainingData['text'][:10]

0    @SouthwestAir I am scheduled for the morning, ...
1    @SouthwestAir seeing your workers time in and ...
2    @united Flew ORD to Miami and back and  had gr...
3       @SouthwestAir @dultch97 that's horse radish 😤🐴
4    @united so our flight into ORD was delayed bec...
5    @united Why did you load us in this flying sar...
6    @JetBlue that is a stock response. Delays not ...
7    @JetBlue That'd be nice! Hoping to rack up eno...
8    @united frankly worse customer service ever. P...
9    @SouthwestAir yeah haha. Never been in one. It...
Name: text, dtype: object

In [8]:
trainingDataX = [i for i in trainingDataX]
testingDataY = [i for i in testingDataY]
trainTesterX = [i for i in trainTesterX]
trainTesterY = [i for i in trainTesterY]
testingDataX = testingData['text']

In [9]:
# Convert into the required NLTK format
trainDocument = []
for i in range(0,len(trainingDataX)) :
    trainDocument.append((trainingDataX[i],testingDataY[i]))
trainTester = []
for i in range(0,len(trainTesterX)) :
    trainTester.append((trainTesterX[i],trainTesterY[i]))
testDocument = []
for i in range(0,len(testingData)) :
    testDocument.append((testingDataX[i]))

#### Stop words

* Emoji stuff is managed by the `max_df` and `min_df`(document frequency) of `countVectorizer`
* Links are also managed by the same above

In [10]:
from nltk.corpus import stopwords
stopWords = stopwords.words('english')

In [11]:
#Add punctuations into the stop words
from string import punctuation
puncList = [char for char in punctuation]
stopWords += puncList

#### Sentence Cleaning
* We are using the lemmatization and removal of stop words
* `@` are only indicating the person so it has nothing to do with the sentimental view 

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [13]:
# Getting pos tag for lemmatization 
def getPos(category) :
    if category.startswith('N') :
        return 'n'
    elif category.startswith('J') :
        # Represents the adjective
        return 'a'
    elif category.startswith('R') :
        # Represents a adverb
        return 'r'
    elif category.startswith('V') :
        # Represnts a Verb
        return 'v'
    else : return 'n'

# Creating a funciton for word cleaning
def sentenceCleaning(doc) :
    text = doc[0]
    newWords = []
    lemmatizer = WordNetLemmatizer()
    for word in text.split() :
        if word.lower() not in stopWords and not word.startswith('@') :
            # Alse considering the links
            if not word.startswith('http') :
                posTag = pos_tag([word])
                word = lemmatizer.lemmatize(word,getPos(posTag[0][1]));
                newWords.append(word.lower())
    return newWords

In [14]:
trainingDocs = []
for i in range(0,len(trainDocument)) :
    modData = sentenceCleaning(trainDocument[i])
    trainingDocs.append((modData,trainDocument[i][1]))    

In [15]:
len(trainingDocs),len(trainingData)

(9882, 10980)

In [16]:
test = []
for i in range(0,len(trainTester)) :
    modData = sentenceCleaning(trainTester[i])
    test.append((modData,trainTester[i][1]))    

In [17]:
testingDocs = []
for i in range(0,len(testDocument)) :
    modData = sentenceCleaning([testDocument[i]])
    testingDocs.append((modData))

#### 2-Ways
* Convert the data into sklearn x and y format for easy acess to countVectorizer
* Go by the way of nltk.calssify
  - Here we can also use the nltk suit for sklearn calssifier to train

In [18]:
# Going by the first way we have

In [19]:
# Converting the data we have
X_train = [' '.join(doc[0]) for doc in trainingDocs]
y_train = testingDataY

In [20]:
X_trainTester = [' '.join(doc[0]) for doc in test]
y_trainTester = trainTesterY

In [21]:
X_test = [' '.join(doc) for doc in testingDocs]

#### CountVectorizer 
* Used to auto tokenize our data based on the number of `features` and also the values of `document frequencies`
* For the purpose of creating a sparse matrix of all the required info.
  - Tokenized
  - Featrue Selection
  - Ifnore words with least `TF*IDF`
  - Ignore words with high `df` and least `df`
  - Get the data in the format of tf

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
classifier = CountVectorizer(max_features=1000,
                             max_df=0.96,
                             min_df=0.001,
                             ngram_range=(1,3)
                         )

In [24]:
X_trainSparse = classifier.fit_transform(X_train)
X_testSparse = classifier.transform(X_test)

In [25]:
trainTesterSparse = classifier.transform(X_trainTester)

In [26]:
X_trainSparse.todense().shape

(9882, 1000)

In [27]:
X_train[:10]

['landed hour late flight ind den, 40+ min late flightr, bag here. cool.',
 'boarding process sucks. learn',
 '— exceptional service flight #403 ind phx!!',
 'frustrating days!no flight home, change airlines. thank phl usairway employee &amp; help get back iah.',
 'thanks. 25+ minute someone eventually help u take care it.',
 'would like 2 speak vp #custexp jimmy samartzis! sent survey/email awful flight exp',
 'awww man...but need buy ticket today home 1st march go this...or assist',
 'need help reservation',
 'understand wish would announce delay 2 hour earlier v sit 2 hr mco',
 'website let change online even though airline issue travel advisory, hold 50 minutes. help.']

#### Using Sklearn model - Random Forest
* Beauty is that we can use the sparse matrix directly into the models

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
classifier = RandomForestClassifier()

#### Grid Search CV 
* To get optimal values of the attributes we use the random values
  - In form of a grid


In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
grid = {'n_estimators' : [100,150,200]}

In [32]:
gridClassifier = GridSearchCV(classifier,grid)

In [33]:
gridClassifier.fit(X_trainSparse,y_train)

In [34]:
gridClassifier.score(X_trainSparse,y_train)

0.985832827362882

In [35]:
gridClassifier.best_params_

{'n_estimators': 200}

#### Checking the score of the classifier of testing Data from training Set

In [36]:
gridClassifier.score(trainTesterSparse,trainTesterY)

0.7367941712204007

In [37]:
predictions = gridClassifier.predict(X_testSparse)

In [38]:
predictions[0],testingDataX[0]

('negative',
 "@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?")

#### Using Sklearn Model - SVM

In [39]:
from sklearn.svm import SVC

In [40]:
svmClassifier = SVC()

In [41]:
grider = {'C' : [1e2,1e3,1e4],
       'gamma' : [1e-3,5e-4],}

In [42]:
gridClassifierSvm = GridSearchCV(svmClassifier,grider)

In [43]:
gridClassifierSvm.fit(X_trainSparse,y_train)

In [45]:
# Get the optimal parameretes
gridClassifierSvm.best_params_

{'C': 100.0, 'gamma': 0.0005}

In [46]:
gridClassifierSvm.score(trainTesterSparse,y_trainTester)

0.7531876138433515

In [47]:
predictions = gridClassifierSvm.predict(X_testSparse)

In [55]:
#Looking for few examples for validation
predictions[10],testingDataX[10]

('negative',
 '@united your announcement for pre boarding only addresses mobility. My disability requires me to travel with a lot of stuff. Do I preboard?')