### Given Twitter US Airline Sentiment Dataset, which contains data for over 14000 tweets, your task is to predict the sentiment of the tweet i.e. positive, negative or neutral.

You are given:

1. A Training dataset csv file with X train and Y train data
2. A X test File and you have to predict and submit predictions for this file.

Read Instructions carefully -

1. Files are in csv format.
2. Submit a csv file with only predictions for X test data. File should not have any headers and should only have one column i.e. predictions. 
3. Submit your ipynb file as well.


In [1]:
sample_text = "Does this thing really work?"


# Movie Reviews

In [None]:
import nltk

In [None]:
from nltk.corpus import movie_reviews

In [None]:
movie_reviews.categories()

In [None]:
movie_reviews.fileids()

In [None]:
movie_reviews.words(movie_reviews.fileids()[5])

In [None]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

In [None]:
import random
random.shuffle(documents)
documents[0:5]

In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
from nltk.corpus import wordnet

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
from nltk.corpus import stopwords
import string
stop = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stop.update(punctuations)
stop, string.punctuation

({'!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  '[',
  '\\',
  ']',
  '^',
  '_',
  '`',
  'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',


In [12]:
from nltk import pos_tag
w = 'better'
pos_tag([w])

[('better', 'RBR')]

In [13]:
# cleaning: stopwords and Lemmatization

def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [14]:
documents =  [(clean_review(document), category) for document, category in documents]

In [15]:
documents[0]

(['think',
  'competent',
  'member',
  'human',
  'race',
  'ever',
  'see',
  'movie',
  '--',
  'movie',
  '--',
  'could',
  'probably',
  'predict',
  'every',
  'turn',
  'wedding',
  'singer',
  'even',
  'though',
  'try',
  'hard',
  'predict',
  'film',
  'watch',
  'especially',
  'romantic',
  'comedy',
  'plot',
  'particular',
  'film',
  'advanced',
  'second',
  'sometimes',
  'minute',
  'actually',
  'saw',
  'happen',
  'screen',
  'care',
  'even',
  'little',
  'reason',
  'simple',
  '1',
  'adam',
  'sandler',
  'great',
  '2',
  'drew',
  'barrymore',
  'great',
  '3',
  'movie',
  'funny',
  'part',
  'left',
  'gasp',
  'air',
  'laugh',
  'hard',
  'laugh',
  'hard',
  'movie',
  'since',
  'austin',
  'power',
  'sandler',
  'play',
  'robby',
  'hart',
  'wedding',
  'singer',
  'dream',
  'rock',
  'band',
  'write',
  'music',
  'opening',
  'scene',
  'seem',
  'like',
  'happy',
  'go',
  'lucky',
  'wedding',
  'singer',
  'least',
  'partially',
  'en

In [16]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [17]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [18]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]

In [19]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'also',
 'come',
 'bad',
 'two',
 'way',
 'give',
 '--',
 'look',
 'life',
 'seem',
 'end',
 'know',
 'first',
 'year',
 'work',
 'play',
 'thing',
 'plot',
 'really',
 'say',
 'little',
 'people',
 'show',
 'love',
 'star',
 'man',
 'could',
 'director',
 'never',
 'great',
 'try',
 'best',
 'new',
 'performance',
 'big',
 'actor',
 'many',
 'action',
 'u',
 'want',
 'role',
 'watch',
 'find',
 'think',
 'act',
 'back',
 'another',
 'turn',
 'audience',
 'world',
 'still',
 'something',
 'day',
 'old',
 'set',
 'however',
 'use',
 'feel',
 'comedy',
 'real',
 'guy',
 'begin',
 'part',
 'cast',
 'last',
 'every',
 'around',
 'point',
 'though',
 'run',
 'enough',
 'write',
 'young',
 'may',
 'interest',
 'right',
 'long',
 'funny',
 'actually',
 'script',
 'effect',
 'name',
 'minute',
 'woman',
 'almost',
 'place',
 'alt

In [20]:
documents[0]

(['think',
  'competent',
  'member',
  'human',
  'race',
  'ever',
  'see',
  'movie',
  '--',
  'movie',
  '--',
  'could',
  'probably',
  'predict',
  'every',
  'turn',
  'wedding',
  'singer',
  'even',
  'though',
  'try',
  'hard',
  'predict',
  'film',
  'watch',
  'especially',
  'romantic',
  'comedy',
  'plot',
  'particular',
  'film',
  'advanced',
  'second',
  'sometimes',
  'minute',
  'actually',
  'saw',
  'happen',
  'screen',
  'care',
  'even',
  'little',
  'reason',
  'simple',
  '1',
  'adam',
  'sandler',
  'great',
  '2',
  'drew',
  'barrymore',
  'great',
  '3',
  'movie',
  'funny',
  'part',
  'left',
  'gasp',
  'air',
  'laugh',
  'hard',
  'laugh',
  'hard',
  'movie',
  'since',
  'austin',
  'power',
  'sandler',
  'play',
  'robby',
  'hart',
  'wedding',
  'singer',
  'dream',
  'rock',
  'band',
  'write',
  'music',
  'opening',
  'scene',
  'seem',
  'like',
  'happy',
  'go',
  'lucky',
  'wedding',
  'singer',
  'least',
  'partially',
  'en

In [21]:
def get_feature_dict(words):
    current_features = {}
    word_set = set(words)
    for w in features:
        current_features[w] = w in word_set
    return current_features

In [22]:
output = get_feature_dict(training_documents[0][0])
output

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': True,
 'get': False,
 'see': True,
 'go': True,
 'time': True,
 'well': True,
 'scene': True,
 'even': True,
 'good': True,
 'story': True,
 'take': True,
 'would': False,
 'much': True,
 'also': False,
 'come': False,
 'bad': False,
 'two': False,
 'way': True,
 'give': True,
 '--': True,
 'look': True,
 'life': True,
 'seem': True,
 'end': True,
 'know': True,
 'first': True,
 'year': False,
 'work': False,
 'play': True,
 'thing': False,
 'plot': True,
 'really': True,
 'say': True,
 'little': True,
 'people': False,
 'show': False,
 'love': True,
 'star': True,
 'man': True,
 'could': True,
 'director': False,
 'never': True,
 'great': True,
 'try': True,
 'best': True,
 'new': False,
 'performance': True,
 'big': False,
 'actor': False,
 'many': False,
 'action': False,
 'u': True,
 'want': True,
 'role': True,
 'watch': True,
 'find': False,
 'think': True,
 'act': True,
 'back': False,
 'ano

In [23]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

In [24]:
training_data[0]

({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': False,
  'see': True,
  'go': True,
  'time': True,
  'well': True,
  'scene': True,
  'even': True,
  'good': True,
  'story': True,
  'take': True,
  'would': False,
  'much': True,
  'also': False,
  'come': False,
  'bad': False,
  'two': False,
  'way': True,
  'give': True,
  '--': True,
  'look': True,
  'life': True,
  'seem': True,
  'end': True,
  'know': True,
  'first': True,
  'year': False,
  'work': False,
  'play': True,
  'thing': False,
  'plot': True,
  'really': True,
  'say': True,
  'little': True,
  'people': False,
  'show': False,
  'love': True,
  'star': True,
  'man': True,
  'could': True,
  'director': False,
  'never': True,
  'great': True,
  'try': True,
  'best': True,
  'new': False,
  'performance': True,
  'big': False,
  'actor': False,
  'many': False,
  'action': False,
  'u': True,
  'want': True,
  'role': True,
  'watch': True,
  'find

In [25]:
# classification using Nltk Naive Bayes Classifier

from nltk import NaiveBayesClassifier

In [26]:
classifier = NaiveBayesClassifier.train(training_data)

In [27]:
nltk.classify.accuracy(classifier, testing_data)

0.798

In [28]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     11.3 : 1.0
                 idiotic = True              neg : pos    =     10.1 : 1.0
             outstanding = True              pos : neg    =      9.1 : 1.0
                   anger = True              pos : neg    =      8.5 : 1.0
                   mulan = True              pos : neg    =      8.4 : 1.0
             magnificent = True              pos : neg    =      7.6 : 1.0
                  seagal = True              neg : pos    =      7.3 : 1.0
                  poorly = True              neg : pos    =      6.8 : 1.0
                    jude = True              pos : neg    =      6.4 : 1.0
                  martha = True              neg : pos    =      6.3 : 1.0
                    lame = True              neg : pos    =      6.1 : 1.0
                 unfunny = True              neg : pos    =      6.0 : 1.0
                   ideal = True              pos : neg    =      6.0 : 1.0

# Sklearn Classifiers

In [29]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [30]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [31]:
classifier_sklearn.train(training_data)



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [32]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.756

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rfc = RandomForestClassifier()
classifier_sklearn = SklearnClassifier(rfc)

In [35]:
classifier_sklearn.train(training_data)



<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [36]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.654

In [37]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
classifier_sklearn = SklearnClassifier(dtc)

In [38]:
classifier_sklearn.train(training_data)

<SklearnClassifier(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))>

In [39]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.632

## CountVectorizer 

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
train_set = {"The sky is blue.", "The sun is bright"}
count_vect = CountVectorizer(max_features=3)   # max_features=n will return top n features present in the feature set
a = count_vect.fit_transform(train_set)
a.todense()

matrix([[1, 1, 1],
        [0, 1, 1]], dtype=int64)

In [42]:
'''
the above matrix is a 2D array which shows the presence of the top features in both the sets; 
to get the feature names of the top features detected use the following
'''

count_vect.get_feature_names()

['blue', 'is', 'the']

In [43]:
# let's modify the train set a little

train_set = {"The sky sky is blue.", "The sun is bright"}
count_vect = CountVectorizer(max_features=3)   # max_features=n will return top n features present in the feature set
a = count_vect.fit_transform(train_set)
a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [44]:
count_vect.get_feature_names()

['is', 'sky', 'the']

### CountVectorizer is therefore, doing the tokenization picking up the best words, and then converting each document into a frequency array based upon the features you have chosen

In [45]:
categories = [category for document, category in documents]
text_documents = [" ".join(document) for document, category in documents]

In [46]:
text_documents[0]

'think competent member human race ever see movie -- movie -- could probably predict every turn wedding singer even though try hard predict film watch especially romantic comedy plot particular film advanced second sometimes minute actually saw happen screen care even little reason simple 1 adam sandler great 2 drew barrymore great 3 movie funny part left gasp air laugh hard laugh hard movie since austin power sandler play robby hart wedding singer dream rock band write music opening scene seem like happy go lucky wedding singer least partially enjoy profession rather good job talk really drunk individual steve buscemi decides brother wedding good place tell whole world life point utter pointlessness week later wedding left stand altar ex fianc e linda angela featherstone life complete disarray meet julia barrymore waitress engage glenn matthew glave marry three month julia robby become good friend enlists help wedding plan soon predictably becomes pretty clear glenn jerk robby julia r

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [49]:
count_vect = CountVectorizer(max_features=2000)   
x_train_features = count_vect.fit_transform(x_train)
a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [50]:
count_vect.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '40',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accidentally',
 'accompany',
 'accomplish',
 'accord',
 'achieve',
 'across',
 'act',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'advance',
 'adventure',
 'affair',
 'affleck',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'ala',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anna',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyone'

In [51]:
x_test_features = count_vect.transform(x_test)


In [52]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 88517 stored elements in Compressed Sparse Row format>

In [53]:
x_test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 1, 1, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)