In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import time

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB , GaussianNB ,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LassoCV

import warnings
warnings.filterwarnings('ignore')

In [15]:
df = pd.read_csv('0000000000002747_training_twitter_x_y_train.csv')
df = df[['text' , 'airline_sentiment' ]]

In [16]:
# Reformatting dataset to list of tuples, each tuple has 2 elements, 1st is list of words and 2nd is its category

documents = []
for i in range(len(df)):
    category = df.loc[i][1]
    words = word_tokenize(df.loc[i][0])
    documents.append((words , category))

In [17]:
# shuffling documents
random.shuffle(documents)

In [18]:
stops = stopwords.words('english') + list(string.punctuation)

In [19]:
lemmatizer = WordNetLemmatizer()

In [20]:
# defining fuction to convert part of speech into simple part of speech so that wordLemmatizer can understand

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [21]:
# function to remove no.'s from the feature vocabulary

def NotNumber(w):
    for ch in w:
        if(ch>='0' and ch<='9'):
            return False
    return True

In [22]:
# defining function to make the list of words clear 

def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops and NotNumber(w)==True:
            pos = pos_tag([w])                                 
            clean_word = lemmatizer.lemmatize(w, get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [23]:
# Creating clear documents

start = time.time()
documents = [(clean_review(document), category) for document, category in documents]
end = time.time()
print("Cleaning time: ", end - start)

Cleaning time:  50.95126819610596


In [29]:
documents[0:3]

[(['usairways', 'great', 'still', 'help', '..'], 'negative'),
 (['stahhppp',
   'rt',
   'jetblue',
   'fleet',
   "'s",
   'fleek',
   'http',
   '//t.co/vhfjgneozo'],
  'neutral'),
 (['united',
   "'s",
   'told',
   'refund',
   "n't",
   'really',
   'makeup',
   'inconvenience',
   'cause',
   'miss',
   'meeting',
   '...'],
  'negative')]

In [24]:
# again reformatting dataset in the form of of  x and y, so as to feed it to sklearn classifiers

y_train = [category for document, category in documents]
X_train = [" ".join(document) for document, category in documents]

In [32]:
X_train[0:2]

['usairways great still help ..',
 "stahhppp rt jetblue fleet 's fleek http //t.co/vhfjgneozo"]

In [25]:
df1 = pd.read_csv('0000000000002747_test_twitter_x_test.csv')
df1 = df1[['text']]

In [26]:
# Reformatting dataset to list of tuples, each tuple has 2 elements, 1st is list of words and 2nd is its category

documents1 = []
for i in range(len(df1)):
    words1 = word_tokenize(df1.loc[i][0])
    documents1.append((words1))

In [27]:
# Creating clear documents

start = time.time()
documents1 = [clean_review(document) for document in documents1]
end = time.time()
print("Cleaning time: ", end - start)

Cleaning time:  15.65929126739502


In [28]:
documents1[0:3]

[['americanair',
  'car',
  'gng',
  'dfw',
  'pulled',
  'ago',
  'icy',
  'road',
  'on-hold',
  'aa',
  'since',
  'ca',
  "n't",
  'reach',
  'arpt',
  'wat'],
 ['americanair',
  'plane',
  '’',
  'land',
  'identical',
  'bad',
  'condition',
  'grk',
  'accord',
  'metars'],
 ['southwestair',
  'ca',
  "n't",
  'believe',
  'many',
  'pay',
  'customer',
  'left',
  'high',
  'dry',
  'reason',
  'flight',
  'cancelled',
  'flightlations',
  'monday',
  'bdl',
  'wow']]

In [33]:
X_test = [" ".join(document) for document in documents1]

In [34]:
X_test

["americanair car gng dfw pulled ago icy road on-hold aa since ca n't reach arpt wat",
 'americanair plane ’ land identical bad condition grk accord metars',
 "southwestair ca n't believe many pay customer left high dry reason flight cancelled flightlations monday bdl wow",
 'usairways legitimately say would rather driven cross country flown us airways',
 'americanair still response aa great job guy',
 'united developer fly tmrw morn min layover earlier flight layover move',
 'usairways hello anyone',
 'usairways husainhaqqani mr. husain u shld protest well one ur party member rehman malik delayed pia flight hour ..',
 "usairways likely flightaware say plane still durango n't depart",
 "americanair n't even give option hold .. say line busy plz try late flightr",
 'united announcement pre boarding address mobility disability require travel lot stuff preboard',
 'usairways really embarrass ask complimentary drink/snack detailed http amp argue',
 'southwestair passport time trip could st

In [35]:
tfidf = TfidfVectorizer()
X_train_t = tfidf.fit_transform(X_train)
X_test_t = tfidf.transform(X_test)

In [36]:
alg1 = LogisticRegression()

start = time.time()
alg1.fit(X_train_t , y_train)
end = time.time()
total_time1 = end - start

y_pred1 = alg1.predict(X_test_t)

print('accuracy : ', alg1.score(X_train_t , y_train))
print('time : ' , total_time1)

accuracy :  0.8697632058287796
time :  0.6876695156097412


In [51]:
y_test=alg1.predict(X_test_t)
np.savetxt('predictions.csv',y_test,fmt='%s')