In [18]:
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
stop_words = set(stopwords.words('english')) 
ps = PorterStemmer()

## Get a sample of the data

In [11]:
samples = np.random.choice(np.arange(1, 1011652), 911652, replace=False)

In [12]:
db = pd.read_csv('data/social_listening1/original_tweets.csv', lineterminator='\n')

In [13]:
db.head()

Unnamed: 0,_id,created_at,favorite_count,full_text,retweet_count,u4u_dataset,user.created_at,user.description,user.followers_count,user.friends_count,user.lang,user.listed_count,user.location,user.name,user.screen_name
0,5bd86c7968a761d62501fda2,Sun Sep 23 15:01:52 +0000 2018,1,2016\nRemember when #PeterDutton was found gui...,0,asylumseeker,Wed Oct 02 00:18:21 +0000 2013,This page is sharing information about the Lib...,6490,5435,en,261,Western Australia,TALAOLP,Talaolp
1,5bd86c7968a761d62501fda5,Sun Sep 23 11:09:04 +0000 2018,1,"This pains me, but it's time to compromise on ...",1,asylumseeker,Sat Nov 28 02:34:51 +0000 2009,"big fan of irreverent political commentary, c...",1966,3281,en,125,"country Victoria, Australia",eithne,eithne52
2,5bd86c7968a761d62501fda9,Sun Sep 23 02:01:55 +0000 2018,7,What about all his other well paid jobs! He ha...,2,asylumseeker,Mon Oct 22 07:25:28 +0000 2012,#Wiimpitja - black fella #BarkindjiNation #Kal...,8583,5514,en,358,,Paul Dutton,pauldutton1968
3,5bd86c7968a761d62501fdab,Sun Sep 23 00:47:55 +0000 2018,2,A must read analysis of policy paralysis on as...,2,asylumseeker,Sat Aug 03 03:08:39 +0000 2013,"Senior Counsel, AWL Woman Lawyer of the Year, ...",5019,932,en,85,Australia,Fi McLeod SC,FiMcLeodSC
4,5bd86c7968a761d62501fdaf,Sat Sep 22 02:49:45 +0000 2018,0,"""@halyapuff: #Ukrainian prosecutor general adm...",0,asylumseeker,Fri Feb 04 22:40:35 +0000 2011,,2097,1853,en,83,THE MOON,Gaby Skittles friend,GABchaag10


In [14]:
db.shape

(1011563, 15)

## Find highest tweeted keywords

In [30]:
def preprocess(sent):
    string = ''
    for word in sent.split(' '): 
        word = word.lower()
        if word.find('https') == -1 and word.find('amp') == -1:
            curr = ps.stem(word)
            string += curr + ' '
    return string

In [7]:
keywords = db['u4u_dataset'].value_counts()


In [8]:
print(list(keywords.index))


['immigrants', 'migrants', 'asylum', 'refugee', 'rohingya', 'unhcr', '@Refugees', 'withrefugees', 'RefugeesWelcome', 'asylumseeker', '@UNRefugeeAgency', 'syrianrefugees', 'syrianrefugee', 'rohingyarefugees', 'USA', 'TEDxKakumaCamp']


## Find the most common words in each U4U tag

In [31]:
for keyword in keywords.index:
    rows = db.loc[db['u4u_dataset'] == keyword]
    text = rows['full_text'].values
    count_vec = CountVectorizer(preprocessor=preprocess, stop_words=stop_words)
    vec = count_vec.fit(text)
    bag_of_words = vec.transform(text)

    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    print(keyword)
    for i in range(5):
        print(words_freq[i])
    print('\n')

immigrants
('immigr', 347374)
('immigrants', 128885)
('illeg', 100848)
('thi', 68945)
('trump', 52502)


migrants
('migrant', 139008)
('migrants', 35543)
('border', 20737)
('thi', 19656)
('eu', 18974)


asylum
('asylum', 157292)
('seek', 23870)
('seeker', 23387)
('thi', 22738)
('wa', 14697)


refugee
('refuge', 127547)
('thi', 17874)
('children', 16550)
('refugee', 14532)
('wa', 11202)


rohingya
('rohingya', 58167)
('myanmar', 23679)
('india', 10943)
('refuge', 9802)
('deport', 7896)


unhcr
('unhcr', 8569)
('refuge', 6983)
('thi', 1421)
('un', 1177)
('refugees', 1130)


@Refugees
('refuge', 11369)
('un', 1912)
('thi', 1354)
('refugees', 971)
('honahmedhussen', 852)


withrefugees
('withrefuge', 6587)
('thi', 4845)
('children', 4663)
('migrant', 4582)
('un', 4575)


RefugeesWelcome
('refugeeswelcom', 4670)
('refuge', 2558)
('000', 913)
('thi', 892)
('us', 760)


asylumseeker
('asylumseek', 212)
('asylum', 76)
('nauru', 70)
('refuge', 62)
('thi', 52)


@UNRefugeeAgency
('unrefugeeag', 

In [101]:
sum_words = bag_of_words.sum(axis=0) 
words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
words_freq[:10]

[('co', 1252),
 ('https', 1250),
 ('unhcr', 884),
 ('refuge', 694),
 ('amp', 188),
 ('thi', 155),
 ('refugees', 130),
 ('help', 101),
 ('ha', 95),
 ('need', 92)]

## Topic Clustering

In [19]:
documents = db['full_text']

In [32]:
no_features = 1000

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english', preprocessor=preprocess)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english', preprocessor=preprocess)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [33]:
no_topics = 20

nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)



In [34]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
immigr undocu legal american america ice mani say pay canada
Topic 1:
asylum seeker seek insan seekers run arkham lunat mental coven
Topic 2:
refuge syrian unhcr crisi support nauru help famili year 000
Topic 3:
migrant caravan rescu italy gujarat itali home attack polic europ
Topic 4:
immigrants women legal american hate undocu white nation realdonaldtrump america
Topic 5:
rohingya myanmar india deport muslim bangladesh genocid seven militari report
Topic 6:
illeg legal alien realdonaldtrump law number aliens foxnew stop deport
Topic 7:
thank use better connect hp await tech futur opportun education
Topic 8:
trump administr judg block protect public end green 000 card
Topic 9:
thi countri country year week make happen time read whi
Topic 10:
border mexico caravan guatemala cross honduran stop mexican polic southern
Topic 11:
wa hi said year did refugee becaus thought didn good
Topic 12:
children detent withrefuge member globalcompactrefuge justified circumst formigr reflect g

In [35]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
need becaus onli support white way thi black home presid
Topic 1:
border democrat women state doesn benefit news thousand blame parti
Topic 2:
come thi make let million life realli came hard health
Topic 3:
right non befor migrants human today big contribut thought act
Topic 4:
protect day republican everi crimin point thank enter gener control
Topic 5:
children famili ani mani citizen tax problem 000 end wall
Topic 6:
just like use help money uk poor citizens refugees aid
Topic 7:
america live nation hate thi open number actual unit immigration
Topic 8:
immigr immigrants illeg migrant undocu tri didn ll ask away
Topic 9:
crime tell hous muslim fight brexit fuck british video murder
Topic 10:
hi whi care deport did usa fact speak claim run
Topic 11:
public racist love includ left bring group treat commit low
Topic 12:
wa time job thi said ha canada build attack turn
Topic 13:
asylum doe free ice stand face wrong power potu cnn
Topic 14:
look govern talk veri polici like judg b