# Imports

In [13]:
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API
from tweepy import Cursor
import json
import datetime
import pandas as pd
import collections
from collections import Counter
from collections import defaultdict
import csv
import matplotlib.pyplot as plt
import numpy as np
from numpy import linalg as la
import time
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from array import array
import math
import operator
import pickle

In [3]:
# Load the tweet dump and replace every missing value with NaN in a single chained expression.
df_tweets = (
    pd.read_json('file.json', orient='split', compression='infer')
      .fillna(value=np.nan)
)

In [4]:
def getrts(df_tweets):
    """Flag which rows of ``df_tweets`` are retweets.

    Parameters
    ----------
    df_tweets : pandas.DataFrame
        Frame with a "retweeted_status" column.

    Returns
    -------
    pandas.Series
        Integer Series aligned with ``df_tweets``: 1 where "retweeted_status"
        holds a value, 0 where it is missing (NaN or None).
    """
    # notna() tests for missing values directly. The previous str(x) == 'nan'
    # comparison treated None as a retweet and a literal 'nan' string as missing.
    retweets_ = df_tweets["retweeted_status"].notna().astype(int)
    return retweets_

In [5]:
# getrts() yields 1 for retweets and 0 for original tweets; keep the index labels of the retweets.
retweets = getrts(df_tweets).loc[lambda flags: flags == 1].index.values
# df_unique: the rows that are not retweets, renumbered from 0.
df_unique = df_tweets.drop(index=retweets).reset_index(drop=True)
# df_retweeted: the retweet rows that carry a 'retweeted_status' value, renumbered from 0.
df_retweeted = df_tweets.loc[retweets]
df_retweeted = (
    df_retweeted.loc[df_retweeted['retweeted_status'].notna()]
                .reset_index(drop=True)
)

In [6]:
# Columns shared by the retweet rows and the embedded original tweet. The keys of the
# first retweet's 'retweeted_status' are used as the reference set.
# Index.intersection is used because `Index & other` no longer performs a set
# intersection in recent pandas versions.
if len(df_retweeted) > 0:
    cols = df_retweeted.columns.intersection(list(df_retweeted.at[0,'retweeted_status'].keys()))
else:
    # No retweets at all: nothing to substitute, and row 0 does not exist.
    cols = df_retweeted.columns[:0]

for i in range(len(df_retweeted)):
    original = df_retweeted.at[i,'retweeted_status']
    for col in cols:
        # Substitute the retweet's value with the original tweet's value.
        # 'possibly_sensitive' is deliberately left untouched.
        if col != 'possibly_sensitive' and col in original:
            df_retweeted.at[i,col] = original[col]

In [7]:
# Stack originals and substituted retweets, keep the first row per tweet 'id', renumber from 0.
df_tweets = (
    pd.concat([df_unique, df_retweeted], ignore_index=True)
      .drop_duplicates(subset=['id'])
      .reset_index(drop=True)
)

In [8]:
# Collect the text of every tweet: the 'full_text' of 'extended_tweet' when that
# field holds a dict, otherwise the plain 'text' column. Surrounding whitespace is stripped.
lines = []
for i in range(len(df_tweets)):
    extended = df_tweets.at[i,'extended_tweet']
    # isinstance(dict) instead of `type(...) != float`: a missing value that is None or a
    # numpy float would otherwise be indexed with ['full_text'] and raise.
    if isinstance(extended, dict):
        lines.append(extended['full_text'].strip())
    else:
        lines.append(df_tweets.at[i,'text'].strip())

In [9]:
def remove_mentions(text):
    """Drop mention and link tokens from ``text`` and strip emoji.

    The text is split on whitespace; every token containing '@' or 'https' is
    discarded, the remaining tokens are joined with single spaces, and the
    result is passed through ``deEmojify``.
    """
    kept = [
        token for token in text.split()
        if '@' not in token and 'https' not in token
    ]
    return deEmojify(' '.join(kept))

def deEmojify(text):
    """Return ``text`` with every run of characters from four emoji ranges removed."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

def delete_hashtags(text):
    """Return ``text`` without any whitespace-separated token that contains '#'.

    The surviving tokens are re-joined with single spaces.
    """
    return ' '.join(token for token in text.split() if '#' not in token)

# Cache for the English stop-word set and the stemmer so they are built once,
# not once per tweet.
_TERM_RESOURCES = {}


def _term_resources():
    """Return the cached (stop-word set, PorterStemmer) pair, building it on first use."""
    if not _TERM_RESOURCES:
        _TERM_RESOURCES['stops'] = set(stopwords.words("english"))
        _TERM_RESOURCES['stemmer'] = PorterStemmer()
    return _TERM_RESOURCES['stops'], _TERM_RESOURCES['stemmer']


def getTerms(line):
    """Turn a raw tweet (or query) string into a list of stemmed terms.

    Steps, in order: remove mentions/links/emoji, remove hashtags, remove the
    "RT" retweet marker, lower-case, strip punctuation, drop English
    stop-words, and apply Porter stemming.

    Parameters
    ----------
    line : str
        Raw text.

    Returns
    -------
    list of str
        The processed terms, in their original order.
    """
    stops, stemming = _term_resources()

    #We remove mentions (@username) and links
    line = remove_mentions(line)
    #We remove Hashtags
    line = delete_hashtags(line)
    #We get rid of the RT marker. The word boundary keeps words that merely end
    #in "RT" (e.g. "START now") intact; a plain replace("RT ", "") would mangle them.
    line = re.sub(r'\bRT ', '', line).strip()
    #We lower-case and remove punctuation
    line = line.lower()
    line = re.sub(r'[^\w\s]','',line).strip()
    #We remove stop-words and stem what is left
    return [stemming.stem(word) for word in line.split() if word not in stops]

def create_index_tfidf(lines, numDocuments):
    """Build a positional inverted index plus tf, df and idf tables for a corpus.

    Each element of ``lines`` is one document (tweet); its id is its position
    in ``lines``.

    Parameters
    ----------
    lines : iterable of str
        Raw document texts; each is tokenised with ``getTerms``.
    numDocuments : int
        Corpus size used in the idf formula.

    Returns
    -------
    index : defaultdict(list)
        term -> list of ``[tweet_id, array('I', positions)]`` postings.
    tf : defaultdict(list)
        term -> list of normalised term frequencies, parallel to ``index[term]``.
    df : defaultdict(int)
        term -> number of documents containing the term.
    idf : defaultdict(float)
        term -> ``log(numDocuments / df[term])`` rounded to 4 decimals.
    """
    index = defaultdict(list)
    #Term frequencies of terms in documents (tweets)
    tf = defaultdict(list)
    #Document (tweet) frequencies of terms in the corpus
    df = defaultdict(int)
    #Inverse document (tweet) frequencies
    idf = defaultdict(float)

    for tweet_id, line in enumerate(lines):
        #We get the terms of the tweet
        terms = getTerms(line)
        termdictPage = {}

        for position, term in enumerate(terms):
            # Explicit membership test instead of a bare `except:`, which would
            # also have hidden unrelated errors.
            if term in termdictPage:
                termdictPage[term][1].append(position)
            else:
                termdictPage[term] = [tweet_id, array('I', [position])]

        #Euclidean norm of the raw term counts; the same for all terms of a document.
        norm = math.sqrt(sum(len(posting[1]) ** 2 for posting in termdictPage.values()))

        for term, posting in termdictPage.items():
            #tf is the raw count divided by the norm; df counts documents per term
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            df[term] += 1
            #Merge the current document's posting into the main index
            index[term].append(posting)

    #Compute idf
    for term in df:
        idf[term] = np.round(np.log(float(numDocuments / df[term])), 4)

    return index, tf, df, idf

In [10]:
# Build the tf-idf index over every tweet text and report how long it took.
numDocuments = len(lines)
start_time = time.time()
index, tf, df, idf = create_index_tfidf(lines, numDocuments)
print("Total time to create the index: {} seconds".format(
    np.round(time.time() - start_time, 2)))

Total time to create the index: 263.88 seconds


In [11]:
def rankDocuments(terms, docs, index, idf, tf):
    """Rank candidate documents against the query terms by tf-idf score.

    Parameters
    ----------
    terms : list of str
        Processed query terms.
    docs : iterable
        Ids of the candidate documents; only these are scored.
    index : mapping
        term -> list of ``(doc_id, positions)`` postings.
    idf : mapping
        term -> inverse document frequency.
    tf : mapping
        term -> list of term frequencies, parallel to ``index[term]``.

    Returns
    -------
    list
        Candidate doc ids ordered by decreasing score (ties broken by
        decreasing doc id). Prints "No results found" when the list is empty.
    """
    # Set lookup keeps the membership test below O(1) per posting.
    candidate_docs = set(docs)

    #Only the components of each document vector that correspond to query terms matter:
    #every other component would be multiplied by 0 in the dot product.
    docVectors = defaultdict(lambda: [0]*len(terms))
    queryVector = [0]*len(terms)

    #Get frequency of each term in the query
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        #Query weight: normalised query term frequency times idf
        queryVector[termIndex] = query_terms_count[term]/query_norm * idf[term]

        #Document weight for every candidate document containing the term
        for docIndex, (doc, postings) in enumerate(index[term]):
            if doc in candidate_docs:
                docVectors[doc][termIndex] = tf[term][docIndex] * idf[term]

    #Score of each doc: dot product between its vector and the query vector
    docScores = [[np.dot(curDocVec, queryVector), doc] for doc, curDocVec in docVectors.items()]
    docScores.sort(reverse=True)
    resultDocs = [x[1] for x in docScores]

    if len(resultDocs) == 0:
        print("No results found")

    return resultDocs

def search_tf_idf(query, index):
    """Return the ids of the documents containing ALL query terms, ranked by tf-idf.

    Parameters
    ----------
    query : str
        Raw query text; it is processed with ``getTerms``.
    index : mapping
        term -> list of postings whose first element is the document id.

    Returns
    -------
    list
        Ranked document ids as returned by ``rankDocuments``.

    Notes
    -----
    The module-level ``idf`` and ``tf`` tables are read when ranking.
    """
    #Get terms of the query
    query = getTerms(query)

    #None means "no term processed yet", so the first term seeds the candidate set
    docs = None

    for term in query:
        # .get() avoids inserting unknown terms into a defaultdict index and gives the
        # same AND semantics for plain dicts: an unknown term matches no document.
        termDocs = {posting[0] for posting in index.get(term, ())}

        #As a tweet must contain ALL words of the query: docs = docs intersection termDocs
        docs = termDocs if docs is None else docs & termDocs

    docs = list(docs) if docs is not None else []
    #We rank documents with TF-IDF
    ranked_docs = rankDocuments(query, docs, index, idf, tf)

    return ranked_docs

def nolinks(text):
    """Return ``text`` without any whitespace-separated token that contains 'https'.

    The remaining tokens are joined with single spaces.
    """
    return ' '.join(token for token in text.split() if 'https' not in token)

def PrintTweets(ranked_docs,top):
    """Print a report for the first ``top`` document ids in ``ranked_docs``.

    For each result the tweet text (links removed), the author's screen name,
    hashtags, like count, retweet count and a URL are printed. Values are read
    from the module-level ``df_tweets`` frame by row label.
    """
    print("\n======================\nTop {} results out of {} for the seached query:\n".format(top, len(ranked_docs)))

    for rank, d_id in enumerate(ranked_docs[:top], start=1):

        print('Result number:',rank,'\n')

        # A float in 'extended_tweet' marks a missing value: fall back to 'text'.
        extended = df_tweets.at[d_id,'extended_tweet']
        body = df_tweets.at[d_id,'text'] if type(extended) == float else extended['full_text']
        print('Tweet: ',nolinks(body))

        print('\nUsername: ',df_tweets.at[d_id,'user']['screen_name'])

        # NOTE(review): only the label is printed here, no date value follows - confirm intent.
        print('\nDate: ')

        hashtags = df_tweets.at[d_id,"entities"]['hashtags']
        if hashtags != []:
            print('\nHashtags: ',[tag['text'] for tag in hashtags])
        else:
            print('\nHashtags: None')

        print('\nLikes: ',df_tweets.at[d_id,'favorite_count'])

        print('\nRetweets: ',df_tweets.at[d_id,'retweet_count'])

        print('\nURL: https://twitter.com/twitter/statuses/'+str(df_tweets.at[d_id,'id']))

        print('\n----------------------\n')

In [14]:
# Load the previously pickled cluster assignments.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load "cluster_index.txt" when it comes from a trusted source.
with open("cluster_index.txt", "rb") as fp:   # Unpickling
    cluster_index = pickle.load(fp)

### Output diversification

In [15]:
# Store, for every row label i of df_tweets, the value cluster_index[i] in the 'cluster' column.
# NOTE(review): presumably cluster_index has one entry per tweet, in the same order as
# df_tweets - confirm against the notebook that produced it.
for i in range(0,len(df_tweets)):
    df_tweets.at[i,'cluster'] = cluster_index[i]

Define a diversity score whose aim is to DIVERSIFY the final output. This score is assigned to the list of documents returned for the input query:

In [16]:
def diversify(top,n_cluster,ranked_docs_cluster):
    """Pick the best-ranked documents of every cluster to diversify the output.

    Parameters
    ----------
    top : int
        Desired total number of results.
    n_cluster : int
        Number of clusters to spread the results over.
    ranked_docs_cluster : pandas.DataFrame
        Frame with columns 'd_id' and 'cluster', rows in ranking order.

    Returns
    -------
    pandas.Series
        The 'd_id' values of the first ``round(top / n_cluster)`` rows of each
        cluster, in the original ranking order.
    """
    # Nothing to diversify (and top / n_cluster would divide by zero).
    if n_cluster <= 0 or len(ranked_docs_cluster) == 0:
        return ranked_docs_cluster['d_id']

    optimal_freq = round(top/n_cluster)
    # groupby().head() keeps the first rows of each cluster whatever the label values are.
    # Indexing a fixed-size list by label would fail whenever a label is >= n_cluster.
    output = ranked_docs_cluster.groupby('cluster', sort=False).head(optimal_freq)
    output = output.sort_index()
    return output['d_id']

In [17]:
def ClusterDiver(ranked_docs,top):
    """Diversify a ranked list of document ids across their clusters.

    The cluster of each id is looked up in the module-level ``df_tweets``
    frame, and ``diversify`` is called with the number of distinct clusters
    found among the results.

    Returns whatever ``diversify`` returns for those inputs.
    """
    labels = [df_tweets.at[d_id,'cluster'] for d_id in ranked_docs]
    ranked_docs_cluster = pd.DataFrame({'d_id':ranked_docs,'cluster':labels})

    n_found = len(ranked_docs_cluster.cluster.unique())
    return diversify(top,n_found,ranked_docs_cluster)

In [20]:
def PrintTweetsCluster(ranked_docs):
    """Print each tweet's text (links removed) followed by its cluster label.

    Rows are read from the module-level ``df_tweets`` frame by label.
    """
    for d_id in ranked_docs:
        # A float in 'extended_tweet' marks a missing value: fall back to 'text'.
        extended = df_tweets.at[d_id,'extended_tweet']
        body = df_tweets.at[d_id,'text'] if type(extended) == float else extended['full_text']
        print(nolinks(body),'Cluster: ',df_tweets.at[d_id,'cluster'])
        print('\n')

In [1]:
# Interactive search: read a query, rank the matching tweets with tf-idf, diversify the
# ranking across clusters and print the result.
# This cell calls search_tf_idf, ClusterDiver and PrintTweetsCluster and reads `index`,
# so the cells defining them must have been run first.
print("Insert your query:\n")
query = input()
top = 20
# NOTE(review): n_clusters is not used by any call in this cell - confirm whether it is still needed.
n_clusters = 5

ranked_docs = search_tf_idf(query, index)
ranked_docs = ClusterDiver(ranked_docs,top)
print('\n')
PrintTweetsCluster(ranked_docs)

Insert your query:

lockdown is so boring


NameError: name 'search_tf_idf' is not defined