<a href="https://colab.research.google.com/github/SEEsuite/colab_scripts/blob/main/get_twitter_edge_list.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Generate Edge List from set of Tweets

Nodes: users, labeled by username

All edges are composed of mentions: each is a directed edge from the tweet's author to the mentioned user.

Edge Labels  = ['Direct', 'Retweet', 'Reply', 'Self']

Note: significantly, a quote-tweet edge cannot be determined from the tweet text alone.

The script finds edges based on the Brandwatch "Full Text" column. At the end, it appends some extra columns to the edge list, so if you do not have the standard Brandwatch dataset, this script might fail. Change column names as needed.



In [None]:
import re
import pandas as pd

In [None]:
# Get list of tweets


import urllib.request
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

def import_data_from_drive(share_link, your_name_for_file="my_data"):
  """Copy a shared Google Drive file into the Colab workspace.

  The file does not have to be hosted on your own Drive account; the share
  link is enough to locate it.

  Parameters:
    share_link: the link used to view the file in Google Drive. The file id
      is read from the sixth "/"-separated piece of the link, as in
      "https://docs.google.com/spreadsheets/d/<id>/edit".
    your_name_for_file: name to save the file under in the workspace,
      preferably ending in a file type, e.g. 'data.csv'.

  Returns:
    None. The file is written to the workspace as a side effect.

  Raises:
    IndexError: if share_link has fewer than six "/"-separated pieces.
  """
  # Named file_id (not id) so the builtin id() is not shadowed.
  file_id = share_link.split("/")[5]  # separate the id from the link
  print("Using id", file_id, "to find file on drive")

  # Use the pydrive and colab modules to authenticate the current user.
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)
  print("Authenticated colab user")

  # This step moves the file from Drive to the workspace.
  download_object = drive.CreateFile({'id': file_id})
  download_object.GetContentFile(your_name_for_file)
  print("Added file to workspace with name", your_name_for_file)



In [None]:
### HERE IS THE CELL YOU NEED TO CHANGE
# Share link of the Google Drive file that holds the tweets to process.
link = "https://docs.google.com/spreadsheets/d/15Coav23He83KEwkr7w4b1-Z4cF9BAf8p/edit?usp=sharing&ouid=101042095541764641159&rtpof=true&sd=true"
### IF YOUR DATASET DOES NOT USE STANDARD BRANDWATCH COLUMN NAMES YOU WILL NEED TO CHANGE THE EXCEL NAMES OR THE DF NAMES BELOW

# Download the shared file into the workspace as tweets.xlsx, then load it
# into the DataFrame `df` that the following cells work on.
import_data_from_drive(link, your_name_for_file="tweets.xlsx")
df = pd.read_excel('tweets.xlsx')
# Alternative load that reads the column names from the ninth row (header=8);
# presumably for exports with extra rows above the header - confirm against your file.
# df = pd.read_excel('tweets.xlsx', header=8)


In [None]:
# Display the column names of the loaded data so they can be compared with
# the names the later cells select.
df.columns

In [None]:
# Imma do the best I can with raw text

# A handle is "@" followed by letters, digits or underscores. The lookbehind
# rejects an "@" glued to a preceding word character, so e-mail addresses
# such as "name@example.com" are not mistaken for mentions.
_MENTION_RE = re.compile(r"(?<![A-Za-z0-9_])@([A-Za-z0-9_]+)")


def get_edges(row):
  """Build the list of mention edges for one tweet.

  Each edge is a tuple (target_username, label). The source of every edge is
  the tweet's author, which the caller adds separately. Usernames are
  returned without the leading "@" and without trailing punctuation.

  Labels:
    'Self'    - the tweet mentions nobody; the target is the author.
    'Reply'   - mentions at the very start of the tweet, up to the first
                word that is not a mention.
    'Retweet' - the mention that directly follows a leading "RT".
    'Direct'  - every other mention in the tweet.

  Parameters:
    row: an object with a `text` attribute (the tweet text) and an `Author`
      attribute (the author's username), e.g. a DataFrame row.

  Returns:
    A non-empty list of (username, label) tuples, in order of appearance.
  """
  tweet = row.text
  author = row.Author

  # Missing text cells are not strings (e.g. NaN); treat them as tweets
  # without mentions instead of failing on .split().
  if not isinstance(tweet, str) or _MENTION_RE.search(tweet) is None:
    # sometimes there will be (author, 'reply') when they start a thread.
    return [(author, 'Self')]

  words = tweet.split()
  edge_list = []
  consumed = 0  # number of leading words already turned into edges

  if words[0] == 'RT':
    # "RT @user: ..." - only a retweet when the mention follows immediately;
    # otherwise ("RT if you agree @user") the mentions are plain direct ones.
    retweeted = _MENTION_RE.match(words[1]) if len(words) > 1 else None
    if retweeted is not None:
      edge_list.append((retweeted.group(1), 'Retweet'))
      consumed = 2
  elif words[0].startswith('@'):
    # Leading mentions are the users being replied to.
    for word in words:
      leading = _MENTION_RE.match(word)
      if leading is None:
        break
      edge_list.append((leading.group(1), 'Reply'))
      consumed += 1

  #TODO - check for quote connection

  # Everything after the consumed prefix is scanned with the same pattern, so
  # consecutive mentions and mentions wrapped in punctuation are all found.
  remaining_text = " ".join(words[consumed:])
  edge_list.extend(
      (handle, 'Direct') for handle in _MENTION_RE.findall(remaining_text)
  )

  if not edge_list:  # defensive: never hand an empty list to explode()
    edge_list.append((author, 'Self'))

  return edge_list



In [None]:
# get_edges reads the tweet from row.text, so expose the Brandwatch
# "Full Text" column under the name 'text'.
df['text'] = df['Full Text']

In [None]:
# Run get_edges on every row (axis=1) and store the resulting list of
# edges in a new 'edge_list' column.
df['edge_list'] = df.apply(get_edges, axis=1)

In [None]:
# Spot check: print the edge list stored under index label 9 and display the
# row at position 9 next to it. Needs at least 10 rows in df.
print(df['edge_list'][9])
df.iloc[9:10]

In [None]:
# Give every edge its own row: each element of a row's 'edge_list' becomes a
# separate row with the other columns repeated. ignore_index=True renumbers
# the rows 0..n-1.
enlarged_df = df.explode(column='edge_list', ignore_index=True)

In [None]:
# Display the column names of the exploded table.
enlarged_df.columns

In [None]:


# Split every (target, label) edge into its own pair of columns.
final_df = pd.DataFrame(
    enlarged_df['edge_list'].tolist(),
    columns=['Vertex B', 'Edge Label'],
)

# The source of each edge is the tweet's author; put it in the first column.
final_df.insert(0, "Vertex A", enlarged_df['Author'])

# Tweet attributes that are carried over next to every edge.
carry_over = enlarged_df[[
    'Date',
    'Full Text',
    'Url',
    'Sentiment',
    "Emotion",
    "Language",
    "Country Code",
    "Author",
    "Interest",
    "Location Name",
    "Expanded URLs",
    "Twitter Followers",
    "Twitter Following",
    "Twitter Likes",
    "Twitter Retweets",
    "Twitter Tweets",
    "Reach (new)",
]]

# Both tables should report the same number of rows before they are joined.
print(len(final_df.index))
print(len(carry_over.index))

# Place the attribute columns to the right of the edge columns.
final_df = pd.concat([final_df, carry_over], axis="columns")
print(len(final_df.index))



In [None]:
# Display the finished edge list with its carried-over attributes.
final_df

In [None]:
# Write the edge list and its attribute columns to an Excel file in the
# workspace.
final_df.to_excel("edge_list_and_twitter_attributes.xlsx")