# Script to join data sources and create bipartite user-topic network edge lists.

In [1]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
import pickle    
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import re
from collections import Counter
import preprocessor as p
import glob

#from wordcloud import WordCloud
#import matplotlib.pyplot as plt

In [2]:
# Root folder with one sub-folder per COP version; get_data / get_data2 read the
# pickled "<version>topics.list" and "<version>docs.list" files from it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
loadPath = "/Users/ipinni/Library/CloudStorage/OneDrive-UniversityofLeeds/UKRI_Tweet_Data/completed/"
# Folder with the tweet CSV exports ("tweets<version>*.csv") that supply the
# user / tweet columns. Same caveat: machine-specific absolute path.
loadPath2 = "/Users/ipinni/Library/CloudStorage/OneDrive-UniversityofLeeds/UKRI_Tweet_Data/tweets/COP/"

Load data for COPs whose tweets are stored in a single file.

In [211]:
def get_data(version):
    """Load topic-model output and tweet authors for a COP kept in one CSV file.

    Parameters
    ----------
    version : str
        COP label such as "COP20"; it is used both as the sub-folder name under
        ``loadPath`` and as part of every file name.

    Returns
    -------
    tuple of (results, users)
        ``results`` has one row per pickled document with columns ``text`` and
        ``topic``. ``users`` holds the English-language rows of the tweet CSV,
        restricted to ``user_username``, ``sourcetweet_id`` and
        ``sourcetweet_text``.
    """
    version_dir = f"{loadPath}{version}/"

    # NOTE(review): pickle.load executes arbitrary code from the file; only use
    # on files produced by this project.
    with open(f"{version_dir}{version}topics.list", 'rb') as topics_fh:
        topics = pickle.load(topics_fh)

    with open(f"{version_dir}{version}docs.list", 'rb') as docs_fh:
        docs = pickle.load(docs_fh)

    results = pd.DataFrame({"text": docs, "topic": topics})

    tweets = pd.read_csv(f"{loadPath2}tweets{version}.csv")
    is_english = tweets.sourcetweet_lang == 'en'
    english_tweets = tweets[is_english]
    wanted_columns = ["user_username", "sourcetweet_id", "sourcetweet_text"]
    users = english_tweets[wanted_columns]

    return results, users

In [220]:
# COP20, COP22 and COP23 are loaded with get_data, which reads exactly one
# "tweets<version>.csv" file per COP.
COP20results, COP20users = get_data(version = "COP20")
COP22results, COP22users = get_data(version = "COP22")
COP23results, COP23users = get_data(version = "COP23")


Load data for COPs whose tweets are split across multiple files.

In [264]:
def get_data2(version):
    """Load topic-model output and tweet authors for a COP split over several CSVs.

    Every file matching ``loadPath2 + "tweets" + version + "*"`` is read and the
    frames are concatenated into one.

    Parameters
    ----------
    version : str
        COP label such as "COP21"; it is used both as the sub-folder name under
        ``loadPath`` and as the prefix of the tweet files.

    Returns
    -------
    tuple of (results, users)
        ``results`` has one row per pickled document with columns ``text`` and
        ``topic``. ``users`` holds the English-language rows of the combined
        tweet files, restricted to ``user_username``, ``sourcetweet_id`` and
        ``sourcetweet_text``.

    Raises
    ------
    ValueError
        If no tweet file matches the pattern.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only use
    # on files produced by this project.
    with open(loadPath + version + "/" + version + "topics.list", 'rb') as config_list_file:
        topics = pickle.load(config_list_file)

    with open(loadPath + version + "/" + version + "docs.list", 'rb') as docs_list_file:
        docs = pickle.load(docs_list_file)

    results = pd.DataFrame({"text": docs, "topic": topics})

    # glob returns matches in arbitrary order; sorting keeps the concatenated
    # row order (and everything derived from it) the same on every run.
    pattern = loadPath2 + "tweets" + version + "*"
    filelist = sorted(glob.glob(pattern))
    if not filelist:
        # pd.concat would raise a ValueError here anyway; say which files are missing.
        raise ValueError("No tweet files found matching " + pattern)

    users = pd.concat((pd.read_csv(path) for path in filelist), axis=0, ignore_index=True)
    users = users[users.sourcetweet_lang == 'en']
    users = users[["user_username", "sourcetweet_id", "sourcetweet_text"]]

    return results, users

In [266]:
# COP21 and COP24-COP26 are loaded with get_data2, which concatenates every
# "tweets<version>*" file found for the COP.
COP21results, COP21users = get_data2(version = "COP21")
COP24results, COP24users = get_data2(version = "COP24")
COP25results, COP25users = get_data2(version = "COP25")
COP26results, COP26users = get_data2(version = "COP26")

  
  
  
  
  
  
  


Clean the tweets, then match each tweet's author to the tweet and to the topic BERTopic assigned to it.

In [222]:
# Global preprocessor configuration: p.clean() (called in merge_dfs) is limited
# to the URL and RESERVED option categories.
# NOTE(review): presumably this must mirror the cleaning applied to the BERTopic
# input docs, otherwise the text join in merge_dfs finds no matches - confirm.
p.set_options(p.OPT.URL, p.OPT.RESERVED)
def merge_dfs(results, users, version):
  """Join tweet authors to BERTopic topics and save the bipartite edge list.

  Each tweet text in ``users`` is cleaned with ``p.clean`` and matched exactly
  against ``results["text"]``; tweets without a matching topic are dropped.
  The outcome is written to ``Merged/<version>Merged.csv``.

  Parameters
  ----------
  results : pandas.DataFrame
      Frame with ``text`` and ``topic`` columns. Duplicate texts are collapsed
      to their first row before joining.
  users : pandas.DataFrame
      Frame with ``user_username`` and ``sourcetweet_text`` columns. It is NOT
      modified: cleaning happens on a derived frame, so the cell can be re-run
      and the caller's data keeps its raw text.
  version : str
      COP label used in the output file name.

  Returns
  -------
  None
      The result only goes to disk. The CSV keeps the DataFrame index as its
      first column because get_projections2 reads the user and topic from
      fields 1 and 2 of each line.
  """
  # Vectorised cleaning on a new frame instead of a per-row .iloc loop that
  # then overwrote the caller's column.
  cleaned = users.assign(sourcetweet_text=users["sourcetweet_text"].map(p.clean))

  # One topic per distinct text, so the left join cannot multiply user rows.
  topic_by_text = results.drop_duplicates(subset=['text'])
  merged1 = cleaned.merge(topic_by_text, left_on="sourcetweet_text", right_on="text", how="left")

  # rename() returns a new frame; assigning .columns on a slice of merged1 can
  # trigger SettingWithCopyWarning.
  bipartite = (
    merged1[["user_username", "topic"]]
    .rename(columns={"user_username": "Source", "topic": "Target"})
    .dropna()
  )

  bipartite.to_csv("Merged/" + version + "Merged.csv")

In [223]:
# Write Merged/<version>Merged.csv for every COP loaded above.
merge_dfs(COP20results, COP20users, "COP20")
merge_dfs(COP21results, COP21users, "COP21")
merge_dfs(COP22results, COP22users, "COP22")
merge_dfs(COP23results, COP23users, "COP23")
merge_dfs(COP24results, COP24users, "COP24")
merge_dfs(COP25results, COP25users, "COP25")
merge_dfs(COP26results, COP26users, "COP26") # slow: this call takes a very long time

# Projections

NetworkX bipartite projections take too much memory for anything other than the COP20 dataset.

In [114]:
import networkx as nx
from networkx.algorithms import bipartite

In [117]:
def get_projections(merge):
    """Project a user/topic edge table onto its users.

    Parameters
    ----------
    merge : pandas.DataFrame
        One row per (user, topic) edge, with columns ``user_username`` and
        ``topic``.

    Returns
    -------
    networkx.Graph
        The result of ``bipartite.weighted_projected_graph`` for the user
        nodes of the bipartite graph built from ``merge``.
    """
    g = nx.Graph()
    g.add_edges_from(zip(merge["user_username"], merge["topic"]))

    # The projection needs each user once. The edge table repeats a user for
    # every tweet, and passing those duplicates as `nodes` repeats the work per
    # duplicate and can be rejected by NetworkX. dict.fromkeys de-duplicates
    # while keeping first-seen order, so the node order stays deterministic.
    users = list(dict.fromkeys(merge["user_username"]))

    return bipartite.weighted_projected_graph(g, users)

In [208]:
def get_projections2(merge, version):
    """Project a merged user/topic CSV onto its users and save the edge list.

    Parameters
    ----------
    merge : str
        Path to a comma-separated file with one header line. On every other
        line field 1 is read as the user and field 2 as the topic (field 0 is
        ignored).
    version : str
        COP label used in the output name
        ``Projections/<version>Projections.csv``.

    Returns
    -------
    networkx.Graph
        The result of ``bipartite.weighted_projected_graph`` for the user
        nodes; the same graph is written to disk as a comma-delimited edge list.
    """
    g = nx.Graph()
    # dict used as an insertion-ordered set: every user must appear exactly
    # once in the `nodes` argument of the projection.
    users = {}

    # pandas writes CSV as UTF-8 by default, so read it back the same way.
    with open(merge, "r", encoding="utf-8") as f:
        f.readline()  # skip the header row
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = line.rstrip().split(",")
            u = fields[1]
            t = fields[2]
            g.add_edge(u, t)
            users[u] = None

    Ngraph_user = bipartite.weighted_projected_graph(g, list(users))
    nx.write_edgelist(Ngraph_user, "Projections/" + version + "Projections.csv", delimiter=",")

    return Ngraph_user

In [209]:
# COP whose merged edge list is projected by the cells that follow.
version = "COP20"

In [None]:
# Build the user projection from Merged/<version>Merged.csv; get_projections2
# also writes it to Projections/<version>Projections.csv.
# NOTE(review): the result is always bound to COP20proj, so the name is only
# accurate while version == "COP20".
COP20proj = get_projections2("Merged/" + version + "Merged.csv", version)

In [147]:
# Save the projection as a comma-delimited edge list (no header row).
# get_projections2 writes to this same path, so this cell only matters when
# re-exporting a graph that is already in memory.
nx.write_edgelist(COP20proj, "Projections/" + version +"Projections.csv", delimiter = ",")

In [None]:
# The projection file is written by nx.write_edgelist with delimiter="," and
# without a header row, so it must be read comma-separated with explicit column
# names; otherwise the first edge is consumed as the header and rows are split
# on the spaces inside the edge-data text.
# The Weight column holds the edge-data text as written (e.g. "{'weight': 3}").
projs = pd.read_csv(
    "Projections/COP20Projections.csv",
    sep=",",
    header=None,
    names=["Source", "Target", "Weight"],
)

In [None]:
import networkx as nx
from networkx.algorithms import bipartite
# Build the user projection for one trimester file and save it as an edge list.
# Field 0 of each data line is read as the user and field 3 as the topic.
g = nx.Graph()
# dict used as an insertion-ordered set: the projection needs every user once,
# while the file repeats a user on every one of their rows.
seen_users = {}

with open("Trimester Bipartite/SepDec17.csv", "r") as f:
    f.readline()  # skip the header row
    for line in f:
        fields = line.rstrip().split(",")
        u = fields[0]
        t = fields[3]
        g.add_edge(u, t)
        seen_users[u] = None

# Unique users in first-seen order; duplicates in `nodes` repeat work per
# duplicate and can be rejected by NetworkX.
user = list(seen_users)

Ngraph_user = bipartite.weighted_projected_graph(g, user)
nx.write_edgelist(Ngraph_user, "Proj_SepDec17.csv")

In [150]:
def find_most_similar_topics(cG: nx.Graph, n: int = 20):
    """
    Return the nodes of ``cG`` with the highest degree centrality.

    Parameters
    ----------
    cG : nx.Graph
        Graph to rank, e.g. a projection returned by get_projections2.
    n : int, default 20
        Number of top-ranked nodes to return.

    Returns
    -------
    pandas.Series
        Degree centrality indexed by node, sorted in descending order and
        truncated to the first ``n`` entries.
    """
    dcs = pd.Series(nx.degree_centrality(cG))
    return dcs.sort_values(ascending=False).head(n)

# Show the 20 highest degree-centrality nodes of the COP20 projection.
find_most_similar_topics(COP20proj)

: 