In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import csv
from sklearn import svm
import sklearn.feature_extraction.text
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
from sklearn import preprocessing
import re
import string
import graphviz
np.random.seed(2)

plt.ioff()
pd.set_option('display.max_columns', None)

## Download data from https://www.kaggle.com/crowdflower/twitter-airline-sentiment

In [2]:
data_frame = pd.read_csv('data/Tweets.csv')

In [3]:
print("{} Tweets in dataset".format(len(data_frame)))
data_frame.head()

14640 Tweets in dataset


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
tweets_text = data_frame['text']

In [5]:
print("Tweet: {}".format(tweets_text[16]))

Tweet: @VirginAmerica So excited for my first cross country flight LAX to MCO I've heard nothing but great things about Virgin America. #29DaysToGo


In [6]:
tweets_text_clean = [u' starttoken ' + unicode(t, 'utf-8') + u' endtoken ' for t in tweets_text]

In [7]:
print("Tweet: {}".format(tweets_text_clean[16]))

Tweet:  starttoken @VirginAmerica So excited for my first cross country flight LAX to MCO I've heard nothing but great things about Virgin America. #29DaysToGo endtoken 


In [8]:
def get_gram_dictionary_probabilities(ngram_size):
    vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size, ngram_size), 
                                                           strip_accents=False,
                                                          )
    vect.fit(tweets_text_clean)
    vocab = vect.get_feature_names()
    bow = vect.transform(tweets_text_clean)
    dist = np.sum(bow, axis=0)
    dist = [dist[0,di] for di in range(len(vocab))]
    ngram_freq = {}

    # Count ngram frequencies
    for tag, count in zip(vocab, dist):
        ngram_freq[tag] = count

    # create mapping from prior words -> potential words list
    preceding_gram_to_options_with_counts = {}
    for token in ngram_freq.keys():
        grams = token.split(' ')
        preceding = u' '.join(token.split(u' ')[:-1])
        next_token = token.split(u' ')[-1]
        if next_token != 'starttoken':
            options_so_far = preceding_gram_to_options_with_counts.get(preceding, [])
            options_so_far.append((next_token, ngram_freq[token]))
            preceding_gram_to_options_with_counts[preceding] = options_so_far

    preceding_gram_to_options_with_counts_normalized = {}
    for preciding_gram, options in preceding_gram_to_options_with_counts.iteritems():
        normalized_options = []
        total_counts = float(np.sum([count for option, count in options]))
        normalized_options = [(option, count/total_counts) for option, count in options]
        preceding_gram_to_options_with_counts_normalized[preciding_gram] = normalized_options
        
    return preceding_gram_to_options_with_counts_normalized

gram_5_dict_prob = get_gram_dictionary_probabilities(5)
gram_4_dict_prob = get_gram_dictionary_probabilities(4)
gram_3_dict_prob = get_gram_dictionary_probabilities(3)
gram_2_dict_prob = get_gram_dictionary_probabilities(2)
gram_1_dict_prob = get_gram_dictionary_probabilities(1)

gram_dict_probs = [(gram_5_dict_prob, 5), (gram_4_dict_prob, 4), (gram_3_dict_prob, 3), (gram_2_dict_prob, 2), (gram_1_dict_prob, 1)]

In [9]:
gram_3_dict_prob

{u'helpful we': [(u'can', 1.0)],
 u'shows delayed': [(u'trying', 0.5), (u'it', 0.5)],
 u'americanair hopefully': [(u'you', 0.66666666666666663),
  (u'it', 0.33333333333333331)],
 u'in route': [(u'now', 1.0)],
 u'completely unanticipated': [(u'endtoken', 1.0)],
 u'las she': [(u'also', 1.0)],
 u'hoom me': [(u'up', 1.0)],
 u'your whole': [(u'handling', 0.5), (u'organization', 0.5)],
 u'as offer': [(u'hotel', 1.0)],
 u'text message': [(u'received', 1.0)],
 u'denver just': [(u'got', 1.0)],
 u'thing we': [(u'noticed', 1.0)],
 u'to del': [(u'preferably', 1.0)],
 u'changes in': [(u'30', 0.33333333333333331),
  (u'the', 0.33333333333333331),
  (u'atlanta', 0.33333333333333331)],
 u'or just': [(u'stupid', 0.1111111111111111),
  (u'endtoken', 0.1111111111111111),
  (u'making', 0.1111111111111111),
  (u'days', 0.1111111111111111),
  (u'american', 0.1111111111111111),
  (u'showing', 0.1111111111111111),
  (u'when', 0.1111111111111111),
  (u'northeast', 0.1111111111111111),
  (u'money', 0.1111111111

In [10]:
# Produce text with variety
def produce_text_sampling(gram_dict_probs, minlen=5, maxlen=20, tokens_so_far=None, verbose=False):
    if tokens_so_far is None:
        tokens_so_far = [u'starttoken']
    finished = False
    while not finished:
        for gram_dict_prob, n_gram_len in gram_dict_probs:
            if verbose:
                print("Tokens so far: {}, trying {} grams".format(tokens_so_far, n_gram_len))
            next_token = produce_next_token_sampling(tokens_so_far, gram_dict_prob, minlen=minlen, n_gram_len=n_gram_len)
            if next_token:
                if next_token == u'endtoken' and len(tokens_so_far) > minlen:
                    continue
                tokens_so_far.append(next_token)
                break
        # finish if endtoken generated
        if tokens_so_far[-1] == u'endtoken':
            finished = True
        # finish if tweet getting long
        if len(tokens_so_far) > maxlen:
            finished = True
        # finish if issue with next token
        if next_token is None:
            finished = True

    # clean text of start/end tokens
    text = u" ".join([t for t in tokens_so_far if t not in [u'starttoken', u'endtoken']])
    return text

# Randomly Sample Probability Distribution
def sample(options, probs):
    chosen = np.random.choice(options, p=probs)
    return chosen

# Produce the next token
def produce_next_token_sampling(tokens_so_far, preceding_gram_to_options_with_counts_normalized, minlen, n_gram_len):
    last_bigram = u" ".join(tokens_so_far[-1*(n_gram_len-1):])
    if n_gram_len == 1:
        last_bigram = u''
    options = preceding_gram_to_options_with_counts_normalized.get(last_bigram, None)
    if not options:
        return None
    option_tokens, probabilities = [o[0] for o in options], [o[1] for o in options]
    next_token = sample(option_tokens, probs=probabilities)
    return next_token


In [11]:
tokens_so_far = []
produce_text_sampling(gram_dict_probs, verbose=True)

Tokens so far: [u'starttoken'], trying 5 grams
Tokens so far: [u'starttoken'], trying 4 grams
Tokens so far: [u'starttoken'], trying 3 grams
Tokens so far: [u'starttoken'], trying 2 grams
Tokens so far: [u'starttoken', u'americanair'], trying 5 grams
Tokens so far: [u'starttoken', u'americanair'], trying 4 grams
Tokens so far: [u'starttoken', u'americanair'], trying 3 grams
Tokens so far: [u'starttoken', u'americanair', u'oh'], trying 5 grams
Tokens so far: [u'starttoken', u'americanair', u'oh'], trying 4 grams
Tokens so far: [u'starttoken', u'americanair', u'oh', u'already'], trying 5 grams
Tokens so far: [u'starttoken', u'americanair', u'oh', u'already', u'have'], trying 5 grams
Tokens so far: [u'starttoken', u'americanair', u'oh', u'already', u'have', u'turned'], trying 5 grams
Tokens so far: [u'starttoken', u'americanair', u'oh', u'already', u'have', u'turned', u'itover'], trying 5 grams
Tokens so far: [u'starttoken', u'americanair', u'oh', u'already', u'have', u'turned', u'itover'

u'americanair oh already have turned itover to them but apparently losing someones bag on their honeymoon doesn require accountability hope'

In [12]:
produce_text_sampling(gram_dict_probs, tokens_so_far=[u'starttoken', u'united'], verbose=True)

Tokens so far: [u'starttoken', u'united'], trying 5 grams
Tokens so far: [u'starttoken', u'united'], trying 4 grams
Tokens so far: [u'starttoken', u'united'], trying 3 grams
Tokens so far: [u'starttoken', u'united', u'after'], trying 5 grams
Tokens so far: [u'starttoken', u'united', u'after'], trying 4 grams
Tokens so far: [u'starttoken', u'united', u'after', u'waiting'], trying 5 grams
Tokens so far: [u'starttoken', u'united', u'after', u'waiting', u'for'], trying 5 grams
Tokens so far: [u'starttoken', u'united', u'after', u'waiting', u'for', u'over'], trying 5 grams
Tokens so far: [u'starttoken', u'united', u'after', u'waiting', u'for', u'over', u'an'], trying 5 grams
Tokens so far: [u'starttoken', u'united', u'after', u'waiting', u'for', u'over', u'an', u'hour'], trying 5 grams
Tokens so far: [u'starttoken', u'united', u'after', u'waiting', u'for', u'over', u'an', u'hour', u'still'], trying 5 grams
Tokens so far: [u'starttoken', u'united', u'after', u'waiting', u'for', u'over', u'an

u'united after waiting for over an hour still don know where my bags are is also equally irritating coltsmissingbags time'

In [13]:
for ti in range(50):
    print("Tweet", ti, produce_text_sampling(gram_dict_probs))

('Tweet', 0, u'americanair usairways day has come where was suppose to get my money back but now prices are jacked up the')
('Tweet', 1, u'southwestair okay just signed up for trueblue and booked flight but keep getting an error message and only machine when')
('Tweet', 2, u'jetblue unhappy with hour delayed flight missed dinner tonight hr delay and now 20 min on tarmac worst experience ever')
('Tweet', 3, u'on the brink of bankruptcy jetblue our fleet on fleek http co 58b7swrpmq let see how things roll out in')
('Tweet', 4, u'united if your policy changed you must advise at the time of purchase same if you choose to vary it')
('Tweet', 5, u'united thanks for reminding me why switched to delta for coming in clutch and finally taking me home and despite')
('Tweet', 6, u'southwestair traveling with 13 year old thursday he does not need an id correct if so what time does gate')
('Tweet', 7, u'southwestair rocks open letter to flight attendant http co zfroinpszi lt my child was inspired th