### Given Twitter US Airline Sentiment Dataset, which contains data for over 14000 tweets, your task is to predict the sentiment of the tweet i.e. positive, negative or neutral.

You are given:

1. A Training dataset csv file with X train and Y train data
2. A X test File and you have to predict and submit predictions for this file.

Read Instructions carefully -

1. Files are in csv format.
2. Submit a csv file with only predictions for X test data. File should not have any headers and should only have one column i.e. predictions. 
3. Submit your ipynb file as well.


In [1]:
sample_text = "Does this thing really work?"


# Movie Reviews

In [61]:
import nltk

In [62]:
from nltk.corpus import movie_reviews

In [63]:
movie_reviews.categories()

['neg', 'pos']

In [64]:
movie_reviews.fileids()

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [6]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [65]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [66]:
import random
random.shuffle(documents)
documents[0:5]

[(['"', 'we', 'are', 'grateful', 'that', 'we', 'have', ...], 'neg'),
 (['there', 'are', 'two', 'things', 'the', 'american', ...], 'neg'),
 (['mimi', 'leder', 'is', 'probably', 'best', 'known', ...], 'pos'),
 (['kevin', 'smith', 'is', 'like', 'a', 'big', 'kid', ...], 'pos'),
 (['veteran', 'actor', 'clint', 'eastwood', 'has', ...], 'pos')]

In [67]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [68]:
from nltk.corpus import wordnet

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
from nltk.corpus import stopwords
import string
stop = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stop.update(punctuations)
stop, string.punctuation

({'!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  '[',
  '\\',
  ']',
  '^',
  '_',
  '`',
  'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',


In [12]:
from nltk import pos_tag
w = 'better'
pos_tag([w])

[('better', 'RBR')]

In [69]:
# cleaning: stopwords and Lemmatization

def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [70]:
documents =  [(clean_review(document), category) for document, category in documents]

KeyboardInterrupt: 

In [15]:
documents[0]

(['roman',
  'emperor',
  'marcus',
  'aurelius',
  'richard',
  'harris',
  'chooses',
  'trust',
  'general',
  'maximus',
  'russell',
  'crowe',
  'successor',
  'however',
  'emperor',
  'evil',
  'son',
  'commodus',
  'joaquin',
  'phoenix',
  'murder',
  'father',
  'announcement',
  'make',
  'maximus',
  'well',
  'family',
  'sentence',
  'execute',
  'maximus',
  'able',
  'escape',
  'executioner',
  'later',
  'capture',
  'sell',
  'slave',
  'proximo',
  'oliver',
  'reed',
  'proximo',
  'look',
  'gladiator',
  'earn',
  'money',
  'former',
  'gladiator',
  'maximus',
  'reluctantly',
  'us',
  'skill',
  'prove',
  'powerful',
  'gladiator',
  'emperor',
  'commodus',
  'announces',
  'new',
  'game',
  'held',
  'colisseum',
  'proximo',
  'take',
  'gladiator',
  'battle',
  'maximus',
  'see',
  'chance',
  'exact',
  'revenge',
  'commodus',
  'gladiator',
  'begin',
  'great',
  'battle',
  'sequence',
  'roman',
  'army',
  'germania',
  'actually',
  'able',


In [16]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [17]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [18]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]

In [19]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'also',
 'come',
 'bad',
 'look',
 'two',
 'know',
 'give',
 '--',
 'life',
 'first',
 'way',
 'end',
 'seem',
 'year',
 'work',
 'thing',
 'plot',
 'say',
 'really',
 'play',
 'little',
 'show',
 'people',
 'could',
 'man',
 'love',
 'star',
 'never',
 'great',
 'best',
 'try',
 'director',
 'new',
 'performance',
 'many',
 'big',
 'action',
 'watch',
 'actor',
 'u',
 'want',
 'role',
 'find',
 'audience',
 'think',
 'act',
 'world',
 'another',
 'something',
 'still',
 'turn',
 'back',
 'set',
 'old',
 'however',
 'day',
 'guy',
 'comedy',
 'use',
 'cast',
 'every',
 'part',
 'feel',
 'begin',
 'though',
 'around',
 'enough',
 'interest',
 'point',
 'write',
 'funny',
 'run',
 'real',
 'young',
 'may',
 'actually',
 'script',
 'last',
 'long',
 'fact',
 'name',
 'minute',
 'right',
 'woman',
 'friend',
 'place',
 'almos

In [20]:
documents[0]

(['roman',
  'emperor',
  'marcus',
  'aurelius',
  'richard',
  'harris',
  'chooses',
  'trust',
  'general',
  'maximus',
  'russell',
  'crowe',
  'successor',
  'however',
  'emperor',
  'evil',
  'son',
  'commodus',
  'joaquin',
  'phoenix',
  'murder',
  'father',
  'announcement',
  'make',
  'maximus',
  'well',
  'family',
  'sentence',
  'execute',
  'maximus',
  'able',
  'escape',
  'executioner',
  'later',
  'capture',
  'sell',
  'slave',
  'proximo',
  'oliver',
  'reed',
  'proximo',
  'look',
  'gladiator',
  'earn',
  'money',
  'former',
  'gladiator',
  'maximus',
  'reluctantly',
  'us',
  'skill',
  'prove',
  'powerful',
  'gladiator',
  'emperor',
  'commodus',
  'announces',
  'new',
  'game',
  'held',
  'colisseum',
  'proximo',
  'take',
  'gladiator',
  'battle',
  'maximus',
  'see',
  'chance',
  'exact',
  'revenge',
  'commodus',
  'gladiator',
  'begin',
  'great',
  'battle',
  'sequence',
  'roman',
  'army',
  'germania',
  'actually',
  'able',


In [21]:
def get_feature_dict(words):
    current_features = {}
    word_set = set(words)
    for w in features:
        current_features[w] = w in word_set
    return current_features

In [22]:
output = get_feature_dict(training_documents[0][0])
output

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': True,
 'get': True,
 'see': True,
 'go': True,
 'time': True,
 'well': True,
 'scene': True,
 'even': True,
 'good': True,
 'story': False,
 'take': True,
 'would': True,
 'much': False,
 'also': True,
 'come': True,
 'bad': False,
 'look': True,
 'two': False,
 'know': True,
 'give': True,
 '--': False,
 'life': False,
 'first': False,
 'way': False,
 'end': True,
 'seem': True,
 'year': True,
 'work': True,
 'thing': False,
 'plot': False,
 'say': True,
 'really': False,
 'play': False,
 'little': False,
 'show': False,
 'people': True,
 'could': True,
 'man': False,
 'love': False,
 'star': True,
 'never': True,
 'great': True,
 'best': True,
 'try': False,
 'director': False,
 'new': True,
 'performance': True,
 'many': True,
 'big': False,
 'action': False,
 'watch': False,
 'actor': False,
 'u': False,
 'want': False,
 'role': True,
 'find': False,
 'audience': True,
 'think': False,
 'act': 

In [None]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

In [24]:
training_data[0]

({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': True,
  'see': True,
  'go': True,
  'time': True,
  'well': True,
  'scene': True,
  'even': True,
  'good': True,
  'story': False,
  'take': True,
  'would': True,
  'much': False,
  'also': True,
  'come': True,
  'bad': False,
  'look': True,
  'two': False,
  'know': True,
  'give': True,
  '--': False,
  'life': False,
  'first': False,
  'way': False,
  'end': True,
  'seem': True,
  'year': True,
  'work': True,
  'thing': False,
  'plot': False,
  'say': True,
  'really': False,
  'play': False,
  'little': False,
  'show': False,
  'people': True,
  'could': True,
  'man': False,
  'love': False,
  'star': True,
  'never': True,
  'great': True,
  'best': True,
  'try': False,
  'director': False,
  'new': True,
  'performance': True,
  'many': True,
  'big': False,
  'action': False,
  'watch': False,
  'actor': False,
  'u': False,
  'want': False,
  'role': True,


In [25]:
# classification using Nltk Naive Bayes Classifier

from nltk import NaiveBayesClassifier

In [26]:
classifier = NaiveBayesClassifier.train(training_data)

In [27]:
nltk.classify.accuracy(classifier, testing_data)

0.81

In [28]:
classifier.show_most_informative_features(15)

Most Informative Features
                   damon = True              pos : neg    =     14.9 : 1.0
                  seagal = True              neg : pos    =     11.1 : 1.0
               ludicrous = True              neg : pos    =     10.7 : 1.0
                 idiotic = True              neg : pos    =      9.9 : 1.0
                  alicia = True              neg : pos    =      9.1 : 1.0
                    zeta = True              neg : pos    =      8.4 : 1.0
                  spacey = True              pos : neg    =      8.3 : 1.0
               painfully = True              neg : pos    =      8.2 : 1.0
             outstanding = True              pos : neg    =      7.8 : 1.0
                  german = True              pos : neg    =      7.8 : 1.0
                  sloppy = True              neg : pos    =      7.3 : 1.0
              uninspired = True              neg : pos    =      7.1 : 1.0
                   mulan = True              pos : neg    =      6.9 : 1.0

# Sklearn Classifiers

In [29]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [30]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [31]:
classifier_sklearn.train(training_data)



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [32]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.774

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rfc = RandomForestClassifier()
classifier_sklearn = SklearnClassifier(rfc)

In [35]:
classifier_sklearn.train(training_data)



<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [36]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.658

In [44]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
classifier_sklearn = SklearnClassifier(dtc)

In [45]:
classifier_sklearn.train(training_data)

<SklearnClassifier(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))>

In [46]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.61

## CountVectorizer 

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

In [72]:
train_set = {"The sky is blue.", "The sun is bright"}
count_vect = CountVectorizer(max_features=3)   # max_features=n will return top n features present in the feature set
a = count_vect.fit_transform(train_set)
a.todense()

matrix([[0, 1, 1],
        [1, 1, 1]], dtype=int64)

In [73]:
'''
the above matrix is a 2D array which shows the presence of the top features in both the sets; 
to get the feature names of the top features detected use the following
'''

count_vect.get_feature_names()

['blue', 'is', 'the']

In [74]:
# let's modify the train set a little

train_set = {"The sky sky is blue.", "The sun is bright"}
count_vect = CountVectorizer(max_features=3)   # max_features=n will return top n features present in the feature set
a = count_vect.fit_transform(train_set)
a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [75]:
count_vect.get_feature_names()

['is', 'sky', 'the']

### CountVectorizer is therefore, doing the tokenization picking up the best words, and then converting each document into a frequency array based upon the features you have chosen

In [80]:
categories = [category for document, category in documents]
text_documents = [" ".join(document) for document, category in documents]

In [77]:
text_documents[0]

'" we are grateful that we have the songs of grace chan to comfort us . " starring lee kang - sheng , yang kuei - mei directed by tsai ming - liang written by tsai and yang ping - ying cinematography by liao peng - jung taiwan , 24 / 12 / 99 . the millenium approaches . an incessant downpour batters an unnamed city . sectors of the city are being sealed , quarantined due to the onset of a mysterious virus . the virus causes people to act like insects : they crawl about on all fours , hide from bright lights , huddle in damp corners . the water supply to the quarantined zones will be cut off in a week \' s time . residents are advised to evacuate the area , asap . this premise -- kafka by way of cronenberg -- is the background for the hole , which could have been a great movie , but , sadly , ends up as a waste of good ideas . the quick sketch above suggests a dark , absurd , hallucinatory near - future parable of life in the late 20th century ; in execution , however , it is nothing li

In [78]:
from sklearn.model_selection import train_test_split

In [81]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [84]:
count_vect = CountVectorizer(max_features=2000)   
x_train_features = count_vect.fit_transform(x_train)
a.todense()

matrix([[0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 8, 0],
        ...,
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [85]:
count_vect.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '90',
 'ability',
 'able',
 'about',
 'above',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adams',
 'adaptation',
 'add',
 'added',
 'addition',
 'adds',
 'admit',
 'adult',
 'adults',
 'adventure',
 'affair',
 'affleck',
 'after',
 'again',
 'against',
 'age',
 'agent',
 'ago',
 'ahead',
 'air',
 'alan',
 'alex',
 'alien',
 'aliens',
 'alive',
 'all',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'amusing',
 'amy',
 'an',
 'and',
 'anderson',
 'angels',
 'angry',
 'animal',
 'animated',
 'animation',
 'annie',
 'annoying',
 'another',
 'answer',
 'anthony',
 'anti',
 'any',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apart

In [89]:
x_test_features = count_vect.transform(x_test)


In [88]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 111806 stored elements in Compressed Sparse Row format>

In [None]:
x_test_features.todense()