## Airline Tweets using HDP

In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
%autosave 120
pd.set_option('display.max_colwidth', 900)

Autosaving every 120 seconds


In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [3]:
tweets = pd.read_csv('data/twitter-airline/Tweets.csv', usecols=['text'])
tweets.head(10)

Unnamed: 0,text
0,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials to the experience... tacky.
2,@VirginAmerica I didn't today... Must mean I need to take another trip!
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"
4,@VirginAmerica and it's a really big bad thing about it
5,@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA
6,"@VirginAmerica yes, nearly every time I fly VX this ‚Äúear worm‚Äù won‚Äôt go away :)"
7,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP"
8,"@virginamerica Well, I didn't‚Ä¶but NOW I DO! :-D"
9,"@VirginAmerica it was amazing, and arrived an hour early. You're too good to me."


In [4]:
tweets.shape

(14640, 1)

In [5]:
import re

HANDLE = '@\w+'
LINK = 'https?://t\.co/\w+'
SPECIAL_CHARS = '&lt;|&lt;|&amp;|#'
def clean(text):
    text = re.sub(HANDLE, ' ', text)
    text = re.sub(LINK, ' ', text)
    text = re.sub(SPECIAL_CHARS, ' ', text)
    return text

tweets['text'] = tweets.text.apply(clean)
tweets.head(10)

Unnamed: 0,text
0,What said.
1,plus you've added commercials to the experience... tacky.
2,I didn't today... Must mean I need to take another trip!
3,"it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces they have little recourse"
4,and it's a really big bad thing about it
5,seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA
6,"yes, nearly every time I fly VX this ‚Äúear worm‚Äù won‚Äôt go away :)"
7,"Really missed a prime opportunity for Men Without Hats parody, there."
8,"Well, I didn't‚Ä¶but NOW I DO! :-D"
9,"it was amazing, and arrived an hour early. You're too good to me."


In [11]:
tweets['text'].sample(frac=0.60,replace=False,random_state=0).to_csv('./data/twitter-airline/airline.txt',encoding='utf-8')

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
file='./data/twitter-airline/airline.txt'
f=open(file,'r',encoding='utf-8')
text=f.read()
f.close()

In [14]:
text



In [15]:
doc=nlp(text)

In [63]:
pos_list=['NOUN'] #We can experiment with other or a combinations of parts of speech ['NOUN','ADJ','VERB','ADV'] #['NOUN','ADJ']

In [64]:
preproc_text=[]
preproc_sent=[]

for token in doc:
    if token.text!='\n':
        if not(token.is_stop) and not(token.is_punct) and token.pos_ in pos_list:
            preproc_sent.append(token.lemma_)
    else:
        preproc_text.append(preproc_sent)
        preproc_sent=[]

preproc_text.append(preproc_sent) #last sentence

print(preproc_text)






In [65]:
len(preproc_text)

8300

## HDP

In [66]:
import tomotopy as tp

In [67]:
mdl = tp.HDPModel(seed=0)

for line in preproc_text:
    mdl.add_doc(line)

for i in range(0, 110, 10):
    mdl.train(i)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=7))

Iteration: 0	Log-likelihood: -6.790204041731927
Iteration: 10	Log-likelihood: -6.554987880266551
Iteration: 20	Log-likelihood: -6.535618356844007
Iteration: 30	Log-likelihood: -6.5145191629497
Iteration: 40	Log-likelihood: -6.513512889040095
Iteration: 50	Log-likelihood: -6.514990878166678
Iteration: 60	Log-likelihood: -6.514866866023448
Iteration: 70	Log-likelihood: -6.516189687603871
Iteration: 80	Log-likelihood: -6.5109781533221245
Iteration: 90	Log-likelihood: -6.517808270080196
Iteration: 100	Log-likelihood: -6.511572948375637
Top 10 words of topic #0
[('flight', 0.07341839373111725), ('customer', 0.024134038016200066), ('service', 0.0222646314650774), ('phone', 0.022094685584306717), ('thank', 0.021924739703536034), ('time', 0.020905064418911934), ('ticket', 0.018695766106247902)]
Top 10 words of topic #1
[('flight', 0.10860554128885269), ('hour', 0.035504020750522614), ('thank', 0.027905652299523354), ('bag', 0.02240338735282421), ('time', 0.02201036922633648), ('plane', 0.02201

Top 10 words of topic #30
[('today', 0.08325633406639099), ('decision', 0.0370541512966156), ('hunt', 0.0370541512966156), ('scavenger', 0.0370541512966156), ('charity', 0.0370541512966156), ('law', 0.027813712134957314), ('brother', 0.027813712134957314)]
Top 10 words of topic #31
[('delay', 0.03575240075588226), ('week', 0.03575240075588226), ('merger', 0.03575240075588226), ('passenger', 0.017965136095881462), ('face', 0.017965136095881462), ('meal', 0.017965136095881462), ('travel', 0.017965136095881462)]
Top 10 words of topic #32
[('fleet', 0.3224663734436035), ('fleek', 0.3184857964515686), ('flight', 0.03188440576195717), ('üòÇ', 0.015962105244398117), ('airline', 0.011981530115008354), ('üò≠', 0.008000954985618591), ('lol', 0.008000954985618591)]
Top 10 words of topic #33
[('time', 0.1498955488204956), ('flight', 0.09540459513664246), ('departure', 0.04545454680919647), ('agent', 0.027290891855955124), ('boarding', 0.027290891855955124), ('min', 0.02274997904896736), ('bag', 

[('seat', 0.19354724884033203), ('flight', 0.08095841109752655), ('class', 0.06336640566587448), ('refund', 0.02466399408876896), ('ticket', 0.02466399408876896), ('row', 0.021145591512322426), ('policy', 0.017627190798521042)]
Top 10 words of topic #62
[('north', 0.027135947719216347), ('inconvenience', 0.027135947719216347), ('understatement', 0.027135947719216347), ('shit', 0.027135947719216347), ('path', 0.027135947719216347), ('nigga', 0.027135947719216347), ('torture', 0.027135947719216347)]
Top 10 words of topic #63
[('service', 0.1426079422235489), ('customer', 0.13502322137355804), ('line', 0.04855738580226898), ('agent', 0.044006556272506714), ('phone', 0.042489610612392426), ('flight', 0.03642183542251587), ('people', 0.0318710021674633)]
Top 10 words of topic #64
[('763', 0.0323510579764843), ('hour', 0.0003203075029887259), ('time', 0.0003203075029887259), ('service', 0.0003203075029887259), ('thank', 0.0003203075029887259), ('customer', 0.0003203075029887259), ('bag', 0.0

[('state', 0.02867688611149788), ('voice', 0.02867688611149788), ('lgbt.can', 0.02867688611149788), ('boycott', 0.02867688611149788), ('discrimination', 0.02867688611149788), ('flight', 0.000283929577562958), ('hour', 0.000283929577562958)]
Top 10 words of topic #93
[('thank', 0.0521610863506794), ('luggage', 0.043482035398483276), ('scale', 0.034802988171577454), ('travel', 0.026123937219381332), ('photo', 0.026123937219381332), ('model', 0.026123937219381332), ('money', 0.026123937219381332)]
Top 10 words of topic #94
[('flight', 0.16823582351207733), ('wine', 0.05615333095192909), ('payment', 0.033736828714609146), ('pairing', 0.02252858132123947), ('rate', 0.02252858132123947), ('bottle', 0.02252858132123947), ('partner', 0.02252858132123947)]
Top 10 words of topic #95
[('kid', 0.03924248367547989), ('singer', 0.01971885934472084), ('group', 0.01971885934472084), ('mark', 0.01971885934472084), ('life', 0.01971885934472084), ('scheduling', 0.01971885934472084), ('fun', 0.01971885934

[('checkin', 0.029514901340007782), ('bagage', 0.029514901340007782), ('guess', 0.029514901340007782), ('boardingpass', 0.029514901340007782), ('hour', 0.00029222675948403776), ('time', 0.00029222675948403776), ('service', 0.00029222675948403776)]
Top 10 words of topic #124
[('life', 0.053539663553237915), ('compensation', 0.03575240075588226), ('infant', 0.017965136095881462), ('crisis', 0.017965136095881462), ('way', 0.017965136095881462), ('exercise', 0.017965136095881462), ('pump', 0.017965136095881462)]
Top 10 words of topic #125
[('airline', 0.022335249930620193), ('screw', 0.022335249930620193), ('upgrade', 0.022335249930620193), ('food', 0.022335249930620193), ('flyfi', 0.022335249930620193), ('agent', 0.022335249930620193), ('savethediagonal', 0.022335249930620193)]
Top 10 words of topic #126
[('city', 0.02336880937218666), ('baggage', 0.02336880937218666), ('pillow', 0.02336880937218666), ('limit', 0.02336880937218666), ('voucher', 0.02336880937218666), ('park', 0.02336880937

In [68]:
mdl.train(50)
for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=7))

Top 10 words of topic #0
[('flight', 0.07203656435012817), ('customer', 0.024709336459636688), ('phone', 0.023317359387874603), ('time', 0.02279536798596382), ('service', 0.021229393780231476), ('thank', 0.021055396646261215), ('ticket', 0.020881399512290955)]
Top 10 words of topic #1
[('flight', 0.11178912222385406), ('hour', 0.03536001965403557), ('thank', 0.02924024872481823), ('plane', 0.022168517112731934), ('time', 0.020536579191684723), ('bag', 0.019720610231161118), ('delay', 0.0171367097645998)]
Top 10 words of topic #2
[('flight', 0.16217093169689178), ('attendant', 0.05193879082798958), ('people', 0.04545454680919647), ('room', 0.03248605877161026), ('chicken', 0.019517572596669197), ('manner', 0.019517572596669197), ('dal', 0.019517572596669197)]
Top 10 words of topic #3
[('flight', 0.09166140854358673), ('plane', 0.08307084441184998), ('hour', 0.051572076976299286), ('seat', 0.04011797904968262), ('drink', 0.0372544527053833), ('entertainment', 0.025800354778766632), ('cre

[('science', 0.02788514457643032), ('taking', 0.02788514457643032), ('share', 0.02788514457643032), ('datum', 0.02788514457643032), ('approach', 0.02788514457643032), ('hammer', 0.02788514457643032), ('hour', 0.00027609054814092815)]
Top 10 words of topic #32
[('fleek', 0.3177781105041504), ('fleet', 0.31394994258880615), ('flight', 0.03449200093746185), ('airline', 0.01535104587674141), ('üò≠', 0.011522853747010231), ('info', 0.011522853747010231), ('üòí', 0.007694663479924202)]
Top 10 words of topic #33
[('time', 0.17554675042629242), ('flight', 0.10804607719182968), ('departure', 0.049545496702194214), ('agent', 0.027045270428061485), ('min', 0.022545225918293), ('thing', 0.022545225918293), ('ticket', 0.022545225918293)]
Top 10 words of topic #34
[('system', 0.09015163034200668), ('ticket', 0.045113347470760345), ('bag', 0.045113347470760345), ('issue', 0.045113347470760345), ('pair', 0.037606965750455856), ('transfer', 0.037606965750455856), ('company', 0.030100587755441666)]
To

[('option', 0.04760776832699776), ('haul', 0.023922311142086983), ('movie', 0.023922311142086983), ('website', 0.023922311142086983), ('meeting', 0.023922311142086983), ('route', 0.023922311142086983), ('winter', 0.023922311142086983)]
Top 10 words of topic #63
[('service', 0.12428544461727142), ('customer', 0.12290465086698532), ('line', 0.05110325664281845), ('flight', 0.044199276715517044), ('agent', 0.03867609426379204), ('people', 0.034533705562353134), ('hour', 0.03177211433649063)]
Top 10 words of topic #64
[('thx', 0.025752166286110878), ('fun', 0.025752166286110878), ('life', 0.025752166286110878), ('kid', 0.025752166286110878), ('singer', 0.025752166286110878), ('mht', 0.025752166286110878), ('grandkid', 0.025752166286110878)]
Top 10 words of topic #65
[('rest', 0.04256670922040939), ('math', 0.02138924039900303), ('athlete', 0.02138924039900303), ('summer', 0.02138924039900303), ('form', 0.02138924039900303), ('race', 0.02138924039900303), ('bike', 0.02138924039900303)]
Top 

[('money', 0.0517122708261013), ('thank', 0.0517122708261013), ('carry', 0.04310790076851845), ('luggage', 0.03450353071093559), ('travel', 0.02589915692806244), ('weight', 0.02589915692806244), ('model', 0.02589915692806244)]
Top 10 words of topic #94
[('flight', 0.09399202466011047), ('wine', 0.05878901854157448), ('sfo', 0.03532034531235695), ('rate', 0.03532034531235695), ('payment', 0.03532034531235695), ('benefit', 0.03532034531235695), ('class', 0.023586012423038483)]
Top 10 words of topic #95
[('flight', 0.00033090668148361146), ('hour', 0.00033090668148361146), ('time', 0.00033090668148361146), ('service', 0.00033090668148361146), ('thank', 0.00033090668148361146), ('customer', 0.00033090668148361146), ('bag', 0.00033090668148361146)]
Top 10 words of topic #96
[('hook', 0.0323510579764843), ('flight', 0.0003203075029887259), ('hour', 0.0003203075029887259), ('time', 0.0003203075029887259), ('service', 0.0003203075029887259), ('thank', 0.0003203075029887259), ('customer', 0.000

[('contract', 0.024502668529748917), ('website', 0.024502668529748917), ('neveragain', 0.024502668529748917), ('breach', 0.024502668529748917), ('damage', 0.024502668529748917), ('hack', 0.024502668529748917), ('joker', 0.024502668529748917)]
Top 10 words of topic #125
[('boy', 0.029514901340007782), ('tomorrow', 0.029514901340007782), ('dream', 0.029514901340007782), ('fulfill', 0.029514901340007782), ('hour', 0.00029222675948403776), ('time', 0.00029222675948403776), ('service', 0.00029222675948403776)]
Top 10 words of topic #126
[('flight', 0.00033090668148361146), ('hour', 0.00033090668148361146), ('time', 0.00033090668148361146), ('service', 0.00033090668148361146), ('thank', 0.00033090668148361146), ('customer', 0.00033090668148361146), ('bag', 0.00033090668148361146)]
Top 10 words of topic #127
[('flt', 0.041683945804834366), ('worker', 0.02094566449522972), ('rock', 0.02094566449522972), ('star', 0.02094566449522972), ('employee', 0.02094566449522972), ('checkin', 0.02094566449

In [69]:
bag_of_words=[word for sent in preproc_text for word in sent]

In [70]:
doc_inst = mdl.make_doc(bag_of_words)
mdl.infer(doc_inst)[0]

[3.5834332265949342e-06,
 3.5834332265949342e-06,
 -1.0279952009217187e+31,
 1.401298464324817e-45,
 0.0,
 0.0,
 7.563697057926899e+20,
 -0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 3.714840443586184e-15,
 0.0,
 0.0,
 0.0,
 7.726153155485222e+20,
 -9.048319092641594e-34,
 1.6994448514856065e+26,
 2.7648012483945446e+27,
 1.628412158548414e+26,
 4.404380268905838e+28,
 3.3350903450930646e-43,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 7.976522033894453e-20,
 595065839812608.0,
 2.79303693164338e+30,
 2.3841077742775678e-37,
 5.1577172186864244e-40,
 9.07425894688638e-34,
 3.748278763244942e-15,
 0.0,
 3.67343704960792e-15,
 0.0,
 nan,
 nan,
 0.0,
 0.0,
 3.400951372916331e-42,
 1.401298464324817e-45,
 0.0,
 0.0,
 1.4717271596964565e-06,
 0.0,
 4.7324811577138456e-14,
 0.0,
 4.5902067276294353e-14,
 0.0,
 4.821944438789666e-14,
 0.0,
 0.0,
 0.0,
 3.71498359215427e-15,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 4.823488410445921e-14,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 4.8595577838454404e-14,
 0.0,
 4.859580

In [71]:
np.argsort(np.array(mdl.infer(doc_inst)[0]))[::-1]

array([ 40,  41,  32,  23,  21,  20,  22,  18,   6,  31, 130,   0,   1,
        48, 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104,
       102, 100,  98,  96,  94,  92,  90,  88,  86,  84,  82,  80,  78,
        76,  74,  72, 128,  64,  54,  50,  52,  36,  58,  14,  38,  30,
        35,  33,  34,  44,  24,   3,  45, 139, 137,  37,  39, 138,  42,
        43,  46,  47, 136,  49, 140,  27,  29,  28, 147,   4,   5, 146,
         7,   8,   9,  10,  11,  12,  13,  15,  16,  17, 145, 144, 143,
       142, 141,  25,  26,  51, 109,  53,  95, 131,  85, 129,  87,  89,
       127,  91, 125,  93, 123, 121, 107,  97, 119,  99, 117, 101, 115,
       103, 113, 105, 111,  83, 132,  81, 133,  55,  56,  57,  59,  60,
        61,  62,  63,  65,  66,  67,  68,  69,  70,  71,  73, 148,  75,
       135,  77, 134,  79, 149,  19,   2], dtype=int64)

In [76]:
print(mdl.get_topic_words(40, top_n=7))

[('plane', 0.06444872915744781), ('pilot', 0.03230472281575203), ('power', 0.016232721507549286), ('flightaware', 0.016232721507549286), ('stop', 0.016232721507549286), ('family', 0.016232721507549286), ('medium', 0.016232721507549286)]


In [77]:
print(mdl.get_topic_words(41, top_n=7))

[('point', 0.06342929601669312), ('change', 0.04761151596903801), ('metal', 0.04761151596903801), ('confusion', 0.031793735921382904), ('snack', 0.031793735921382904), ('cheese', 0.0159759558737278), ('mile', 0.0159759558737278)]


In [78]:
print(mdl.get_topic_words(2, top_n=7))

[('flight', 0.16217093169689178), ('attendant', 0.05193879082798958), ('people', 0.04545454680919647), ('room', 0.03248605877161026), ('chicken', 0.019517572596669197), ('manner', 0.019517572596669197), ('dal', 0.019517572596669197)]


In [79]:
print(mdl.get_topic_words(19, top_n=7))

[('flight', 0.0666356012225151), ('seat', 0.05997869744896889), ('window', 0.04666489362716675), ('agent', 0.04000798985362053), ('row', 0.04000798985362053), ('flyer', 0.04000798985362053), ('boarding', 0.020037278532981873)]
