In [2]:
import re

import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag.stanford import StanfordNERTagger
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from  textacy.vsm import Vectorizer

from tqdm import *
from pprint import pprint

from pymprog import *

Getting all the tweets from the twitter api and then saving it here

In [3]:
tweets = pd.read_csv('./tweet.csv',encoding='ISO-8859-1')

## Data Preprocessing

In [4]:
tweets.dropna(inplace=True)
tweets = tweets[tweets.retweets != 0]
tweets.to_csv('./preprocessed_tweets.csv')
tweets.shape

(5120, 6)

Extracting text from the tweets dataframe

Removing URLs, Removing @..., and the hashtags

In [5]:
# Building the corpus
tweet_text = []
tweets.text = tweets.text.apply(lambda x: re.sub(u'https:\S+', u'', x))
tweets.text = tweets.text.apply(lambda x: re.sub(u'http:\S+', u'', x))
tweets.text = tweets.text.apply(lambda x: re.sub(u'@\w+', u'', x))
tweets.text = tweets.text.apply(lambda x: re.sub(u'#', u'', x))
tweets.text = tweets.text.apply(lambda x: re.sub(u'â|¦|:|;|-|\x80|Â|µ|à|&|\/', u'', x))
tweets.text = tweets.text.apply(lambda x: x.replace(u'RT', u''))


## Tokenizing with nltk

In [6]:
tknzr = TweetTokenizer()

nltk_tweets = []
for text in tweets.text:
    nltk_tweets.append(tknzr.tokenize(text))
nltk_tweets[-68]

['While',
 'good',
 'for',
 'nothing',
 'spits',
 'her',
 'venom',
 'on',
 'them',
 ',',
 'these',
 'heros',
 'are',
 'elsewhere',
 'doing',
 'the',
 'job',
 'asusual',
 '.',
 'To',
 'the',
 'Army',
 '.',
 'KeralaFloods',
 'IndianArmy',
 'Keralarains',
 'twitter',
 '.',
 'comfirstpoststat',
 'us1027831034149261312']

In [7]:
#nltk.download()

Using POS tagger to get the array of various part of speech in the tweet

In [8]:
nltk_pos = []

for text in nltk_tweets:
    nltk_pos.append(pos_tag(text))
pprint(nltk_pos[-68])
print(ne_chunk(nltk_pos[-68]))

[('While', 'IN'),
 ('good', 'JJ'),
 ('for', 'IN'),
 ('nothing', 'NN'),
 ('spits', 'VBZ'),
 ('her', 'PRP$'),
 ('venom', 'NN'),
 ('on', 'IN'),
 ('them', 'PRP'),
 (',', ','),
 ('these', 'DT'),
 ('heros', 'NNS'),
 ('are', 'VBP'),
 ('elsewhere', 'RB'),
 ('doing', 'VBG'),
 ('the', 'DT'),
 ('job', 'NN'),
 ('asusual', 'JJ'),
 ('.', '.'),
 ('To', 'TO'),
 ('the', 'DT'),
 ('Army', 'NNP'),
 ('.', '.'),
 ('KeralaFloods', 'NNP'),
 ('IndianArmy', 'NNP'),
 ('Keralarains', 'NNP'),
 ('twitter', 'NN'),
 ('.', '.'),
 ('comfirstpoststat', 'NN'),
 ('us1027831034149261312', 'NN')]
(S
  While/IN
  good/JJ
  for/IN
  nothing/NN
  spits/VBZ
  her/PRP$
  venom/NN
  on/IN
  them/PRP
  ,/,
  these/DT
  heros/NNS
  are/VBP
  elsewhere/RB
  doing/VBG
  the/DT
  job/NN
  asusual/JJ
  ./.
  To/TO
  the/DT
  (ORGANIZATION Army/NNP)
  ./.
  (ORGANIZATION KeralaFloods/NNP)
  (ORGANIZATION IndianArmy/NNP Keralarains/NNP)
  twitter/NN
  ./.
  comfirstpoststat/NN
  us1027831034149261312/NN)


Tried Named entity recognition using NLTK but not accurate

In [9]:
#pattern = 'NP: {<DT>?<JJ>*<NN>}'
#cp = nltk.RegexpParser(pattern)
#cs = cp.parse(nltk_pos[-68])
#print(cs)

In [10]:
#iob_tagged= tree2conlltags(cs)
#pprint(iob_tagged)

Now using Stanford Natural Processing!!
First, we will set the config_java file for nltk

In [11]:
nltk.internals.config_java("/usr/lib/jvm/java-11-openjdk-amd64/bin/java")
st = StanfordNERTagger('/home/pranav/Desktop/zine/Twitter-Mining/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
           '/home/pranav/Desktop/zine/Twitter-Mining/stanford-ner-2018-10-16/stanford-ner.jar', encoding='utf-8')

In [12]:
nltk_ents = []
for tweet in tqdm(nltk_tweets):
    entity_tagged_tweet = st.tag(tweet)
    nltk_ents.append([tag for tag in entity_tagged_tweet if tag[1] != 'O'])

100%|██████████| 5120/5120 [6:23:53<00:00,  1.65s/it]       


The Standford Named Entity Recognition library labels the text in the tweets, particularly into 3 classes (PERSON, ORGANIZATION, LOCATION).<br>
As, numerals will also be significant in the tweets we will concatenate it to the entity text. Hence, from the text we will take care about the entities and numbers.<br>
I will name these array content_tweets

Again, entities that are labelled as PERSON tend to be related more to feelings of the person, hence I will remove them as well.

In [36]:
content_tweets = []
for pos_tweet, tweet_entity in zip(nltk_pos, nltk_ents):
    # starting by appending all of the entities
    tweet_content = [word[0] for word in tweet_entity if word[1] != 'PERSON']
    
    # next by appending all of the numerals
    for token in pos_tweet:
         if token[1] == u'CD':
            tweet_content.append(token[0])
    content_tweets.append(tweet_content)
for i in range(len(content_tweets)):
    print(i, content_tweets[i])

0 ['India', '2cE5lLWjQ']
1 ['Thailand', 'Kerala', 'KeralaFloods', 'Kerala', 'BBC', '12', '12']
2 ['Kerala', 'State', 'Kottayam', 'K', 'Raju', 'of', 'Communist', 'Party', 'Muslim', 'League', 'Germany']
3 ['Kerala', 'CM', 'Relief', 'Fund', '7yaOyOp']
4 ['Kadavanthra', '2402199', '8status1030591901890572288']
5 ['Toronto', 'CAN', 'Union', 'Station', 'Wisconsin', 'Pennsylvania', '3']
6 ['Ahmadiyya', 'Youth', 'association', 'kerala', 'Humanity', 'First', 'India', 'Kerala', 'zero', '2018', '2dm']
7 ['US', '2018', '2018aug', '17keralafloodsdeathtollrescueeffortindia']
8 ['One']
9 ['2378962', '6372784', '0']
10 ['India', '2VyGyISxr4']
11 ['9gKE1g']
12 ['10', '15', '25', '76', '18', '27', '9496743525', '30', '45']
13 ['Kerala', 'India', '3,00', '000', '1MdzYpf29C']
14 ['million']
15 ['5cogmhbQ']
16 []
17 ['Kerala', 'Dubai']
18 ['8node', '8891257031']
19 ['Edanadu', '2', '2KC8irZ']
20 []
21 ['Kerala']
22 ['india']
23 ['Nedubeshary', 'Dew', 'Homes', 'Apartment', 'Kochi', 'Airport', 'Road', '260',

1544 []
1545 ['navy']
1546 ['Collector', 'Trivandrum', 'SMV', 'High', 'School', 'TVM']
1547 ['60CCUgV']
1548 ['Kerala', '6S3uIpap']
1549 ['Aluva', '9190378767', '65', '81koxOVc']
1550 ['BCCI']
1551 ['IStandWithKerala', 'Stars']
1552 ['10', '2', '9', '20', '08.5', '76', '37', '49.9', '2018', '93', '10303662769', '4891827', '2']
1553 ['Kerala', 'South', 'of', 'India', '9OQCJjOp']
1554 ['Kerala']
1555 ['India', 'India', '30', 'million']
1556 ['Kerala', '6g0IX94D']
1557 ['4plfzV1pa']
1558 ['Kerala']
1559 []
1560 []
1561 ['85', '9645086486', '3706118', '92', '8776142271']
1562 ['7uwf']
1563 ['5r6y3yzKb']
1564 ['2018', '6547DoQeO']
1565 ['239']
1566 []
1567 ['10', '17aug400pm']
1568 ['Kerala', 'Flood', 'Relief', '¹100', '8k1vOSXnCo']
1569 ['Amazon', '25X4b582']
1570 []
1571 ['Chengannur', '2', '8606959595.SOS', '689506']
1572 ['10Lakh', '1Bhl']
1573 ['890727']
1574 ['FMCG', 'Kerala']
1575 ['2018']
1576 ['10', '70266vtwqE']
1577 ['KolamavuKokila', 'Coco', 'KeralaFlood', 'Kerala', '10', '6ZW0Z

3072 ['4', '3']
3073 ['Distress', 'Relief', 'Found', 'Save', 'kerala', '4']
3074 ['NeedHelp', 'SOS', 'People', 'Families', 'Description', '2018', '20', '9846357379']
3075 ['Injakkelkavala', 'Kalady', 'Eranakulam', 'Phone', 'one', '2', '3', '099460 5030', '1']
3076 ['India', '6yrhamstatu']
3077 ['Puzhakkattiri', 'Malappuram', 'Kerala', '2FbZStmJ']
3078 ['1', '2', '3']
3079 []
3080 ['Kerala', '¹10', '6Gf']
3081 ['one']
3082 ['4000']
3083 ['Sanskrut']
3084 ['Pathanamthitta', 'Prayforkerala', '4gCKSPbW']
3085 ['one', '721', '16']
3086 ['Kerala', '1s', '10300595148', '7840665', '6']
3087 ['10lakhs']
3088 ['NSS', 'Building', 'Mundankavu', 'one', '9544934084']
3089 ['Kerala', '43Uh5VTL8B']
3090 []
3091 ['Kottiyoor', 'Kannur', 'Kerala']
3092 ['Vodafone', 'Vodafone', 'Kerala', '22', '144', '130', '1', '1']
3093 ['KeralaFloods', 'Indian', 'Army', 'Valiyakada', 'Malampuzha', '35feet', '2018', '9E']
3094 ['1GkOQ50QS']
3095 ['Ernakulam', 'PCO', '2OgqnBW']
3096 ['Kerala', '10Lakhs']
3097 ['BJP', '10

4779 ['2QfpABW']
4780 ['Kerala', '25Lakhs', '10282430785', '8537676', '9']
4781 ['Brothers', '25', '1N9AGqCtTp']
4782 ['Kerala', '25', '0dBsNFw3']
4783 ['18FC4Society', '5pE']
4784 ['Kerala', '25', '6ivKupaKzu']
4785 ['Idukki', '7dSq']
4786 ['25']
4787 ['India', 'Kerala', '10282368240', '8958361', '6']
4788 ['2018', '2m84fSD6r']
4789 ['25Lacs', '4', '6WQNd3r7G']
4790 ['25', '7pxvrKxSrZ']
4791 ['25']
4792 ['Kerala']
4793 ['25Lacs']
4794 ['Kannur', 'Kerala']
4795 ['¹25L', '1dJKDmmeJ']
4796 ['25', '8e']
4797 ['2riLQS6']
4798 ['KeralaRains', 'KeralaFloods', 'Help', 'AnboduKochi', 'Ernakulam', '13030290702', '478', '2posts1817567048298351', '7Y']
4799 ['Kerala', '25', '12QIF1LShM']
4800 ['Kerala', '25Lacs', '2018', '6dlPV7Rv']
4801 ['AICC', 'Central', 'Govt', '6ys23xJ']
4802 ['²', '²', '10282189921', '2776243', '3']
4803 ['Kerala', 'CM', 'Relief', 'Fund', '25Lacs', '1NV8F']
4804 ['25', '7BkQj']
4805 ['4kuG7MOJ']
4806 ['Kerala', '25L', '10282189921', '2776243', '3']
4807 ['6']
4808 ['Kerala'

# Getting the tf-idf score

Now, we will take out tl-idf score for the tweet that will determine how much the word present in the tweet is importants.<br>
So, I will take out the tl-idf score of all of the nltk_tweets

I care about the tf-idf scores of the entire tweet, so will tf-idf score across the entire corpus of original tweets.

In [37]:
vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth')
term_matrix = vectorizer.fit_transform(nltk_tweets)
term_matrix.shape

(5120, 18883)

The term_matrix is a document-term matrix of shape (#document, #unique terms).<br>
Each row is a document and each column is a  unique word.<br>
The values of the matrix is the tf-idf score of the particular unique word in the column

In [38]:
np_matrix = term_matrix.todense()
np_matrix.shape

np.max(np_matrix[:,527])

8.847957830902605

The goal is to make a dictionary, which map from tokens in the content_tweets to some tf-idf score.<br>
Each word has a unique tf-idf value<br>
In simple words, we will find the column number of each word from the content_tweets in the document-term matrix, as now the vectorizer model is trained, for this we will use vectorizer.vocablury_terms[word]<br>
Each column is a distinct word

In [16]:
for key in sorted(vectorizer.vocabulary_terms)[527:715]:
    print(key, vectorizer.vocabulary_terms[key])

10300995006 527
10301000518 528
10301010413 529
10301015371 530
10301053520 531
10301069730 532
10301074070 533
10301081111 534
10301083441 535
10301097646 536
10301143512 537
10301146224 538
10301167993 539
10301267691 540
10301323682 541
10301388428 542
10301395399 543
10301404464 544
10301411160 545
10301447925 546
10301454128 547
10301556459 548
10301616328 549
10301629472 550
10301637024 551
10301667916 552
10301708726 553
10301723228 554
10301740195 555
10301784633 556
10301866916 557
10301899468 558
10301911889 559
10302070006 560
10302088181 561
10302151702 562
10302167079 563
10302327271 564
10302570793 565
10302612194 566
10302811384 567
10302835155 568
10302862659 569
10302920407 570
10302972395 571
10303002310 572
10303052258 573
10303057788 574
10303074424 575
10303140485 576
10303169204 577
10303184414 578
10303185098 579
10303190337 580
10303234040 581
10303260009 582
10303294381 583
10303314058 584
10303341425 585
10303367359 586
10303368144 587
10303406319 588
10303420

Now to get the tf-idf score of all entities related to a specific content_tweet, we will use np.max function <br>
We will taken all the words from a specific content tweet and then use the vectorizer.vocabulary_terms to get the column-number of the word<br>
Then, we use np.max to take out the maximum value in the entire column for the word and that will be its tf-idf score

In [39]:
for token in content_tweets[1190]:
    print(token, vectorizer.vocabulary_terms[token], np.max(np_matrix[:,vectorizer.vocabulary_terms[token]]))

## We will now go through all of the content_tweets to get the dictionary

In [40]:
tfidf_dict = {}
content_vocab = []
for tweet in content_tweets: 
    for token in tweet: 
        if token not in tfidf_dict: 
            if token in vectorizer.vocabulary_terms:
                content_vocab.append(token)
                tfidf_dict[token] = np.max(np_matrix[:,vectorizer.vocabulary_terms[token]])
print(len(tfidf_dict))

5283


In [41]:
for key in sorted(tfidf_dict)[900:911]:
    print ("WORD:" + str(key) + " -- tf-idf SCORE:" +  str(tfidf_dict[key]))

WORD:18on -- tf-idf SCORE:8.847957830902605
WORD:18piSlWC7o -- tf-idf SCORE:8.847957830902605
WORD:18qnhvlZkF -- tf-idf SCORE:8.847957830902605
WORD:18stat -- tf-idf SCORE:8.154810650342661
WORD:18th -- tf-idf SCORE:7.238519918468505
WORD:18zdata -- tf-idf SCORE:8.847957830902605
WORD:19 -- tf-idf SCORE:11.971513899946274
WORD:19.1 -- tf-idf SCORE:8.847957830902605
WORD:1900136 -- tf-idf SCORE:8.847957830902605
WORD:1900751 -- tf-idf SCORE:8.847957830902605
WORD:1902159 -- tf-idf SCORE:8.847957830902605


## Content Word-based Tweet Summarization

To generate something that woul be more useful to people and other volunteers, It has to be something with content-words with high tf-idf scores.<br>
This can be done by using Integer Linear Programming(ILP) where we will maximize an equation given some constraints.<br>
Equation: Maximize the total score of content words in my summary<br>
\begin{equation}
\sum_{i=1}^n x_{i} + \sum_{j = 1}^{m} Score(j) \cdot y_{j}
\end{equation}
where, $x_{i}$ is 1 if the tweet is selected or 0 if the tweet is not selected, where $y_{j}$ is 1 or 0 if each content word is included (and Score(j) is that word's tf-idf score).<br>
\begin{equation}
\sum_{i=1}^n x_{i}.Length(i) <= L
\end{equation}
Constraint 1: The total length of all the selected tweets to be less than some value L, which will be the length of my summary, L. I can vary L depending on how long I want my summary to be. 
<br>
\begin{equation}
\sum_{i \in T_{j}} x_{i} \geq y_{j}, j = [1,...,m]
\end{equation}
Contraint 2:If I pick some content word $y_{j}$ (out of my $m$ possible content words) , then I want to have at least one tweet from the set of tweets which contain that content word, $T_{j}$.<br>
\begin{equation}
\sum_{j \in C_{i}} y_{j} \leq |C_{i}| \times x_{i}, i = [1,...,n]
\end{equation}
Constraint 3: If I pick a tweet in my summary, then all the content words in that tweet should be present in the summary<br>

Variables that the equation depend on are integers, 1 if the word is included and 0, if it is not

In [20]:
begin('COWTS')

model('COWTS') is the default model.

var function is used to create variables,  

In [21]:
# Defining the first variable x,
# This definies whether or not a tweet is selected
x = var('x', len(nltk_tweets), bool)
x[1000]

0 <= x[1000] <= 1 binary

In [22]:
# Defining the second variable y,
# This defines whether or not a content word is selected
y = var('y', len(content_vocab), bool)

In [23]:
len(y), y[0]

(5283, 0 <= y[0] <= 1 binary)

In [24]:
# Defining the equation that needs to be maximized
maximize(sum(x) + sum([tfidf_dict[content_vocab[j]]*y[j] for j in range(len(y))]));

In [25]:
## Constraint 1: Maximum length of entire tweet summary
# should be less than or equal to 150

L = 150
sum([x[i] * len(nltk_tweets[i]) for i in range(len(x))]) <= L;

In [26]:
#Constraint 2: If I pick a content word then I have to pick a tweet that contains the content word
def tweet_with_content_words(j):
    content_word = content_vocab[j]
    index_term_matrix = vectorizer.vocabulary_terms[content_word]
    matrix = np_matrix[:, index_term_matrix]
    
    return np.nonzero(matrix)[0]

In [27]:
for j in range(len(y)):
    sum([x[i] for i in tweet_with_content_words(j)]) >= y[j]

In [28]:
#Constraint 3: If i pick a tweet, then all of the content words from the tweet must be selected
def content_words(i):
    tweet = nltk_tweets[i]
    content_indices = []
    
    for token in tweet:
        if token in content_vocab:
            content_indices.append(content_vocab.index(token))
    
    return content_indices

In [29]:
for i in range(len(x)):
    sum(y[j] for j in content_words(i)) >= len(content_words(i)) * x[i]

In [30]:
solve()

(0,
 'The MIP problem instance has been successfully solved. (This code\ndoes {\\it not} necessarily mean that the solver has found optimal\nsolution. It only means that the solution process was successful.)')

In [31]:
result_x =  [value.primal for value in x]
result_y = [value.primal for value in y]

In [32]:
end()

model('COWTS') is not the default model.

In [33]:
chosen_tweets = np.nonzero(result_x)
chosen_words = np.nonzero(result_y)
chosen_tweets

(array([ 856, 2002, 3221, 3437, 3461, 4668, 4859, 4997]),)

In [34]:
len(chosen_tweets[0]), len(chosen_words[0])

(8, 83)

In [35]:
for i in chosen_tweets[0]:
    print ('--------------')
    print (" ".join(nltk_tweets[i]))

--------------
Please support KeralaFloods KeralaFloodRelief Medicine Food IndiaForKerala Contact and account details Abhayaloka Trust AC NO 4504220016 1601 Syndicate Bank Myngappally Branch IFSC SYNB 0004504 Email abhayalokakerala.com Contact + 9185477991 20 + 9194468670 74 pic.twitter . comQIHKP 639UN
--------------
* Declare KeralaFloods as nation calamity Rescuekerala RebuildKerala Morefundsforkerala Keralaflood 2018
--------------
Help KeralaFloods People 5 People ( 2 Old people above 70yrs ) Description Evacuation Number 9496950810 8547054877 Location Thiruvalla Nallad , North Side near Subramanian Swamy Temple District pathanamthitta Time of SoS call 500pm
--------------
Respect fans keralafloods twitter . comSuriyaFansClub status 10300204513 8743296 0 ? s = 19
--------------
KeralaFloods Prime Minister Asks Army to Intensify Rescue Operations twitter . comeOrganisersta tus 10300238020 8687513 8
--------------
Thank U Donated  ¹5L To KeralaFloods Releif Fund twitter . comTheDev