# Setup - RUN

In [2]:
!pip install ijson
!pip install nltk


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ijson
  Downloading ijson-3.2.0.post0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.5/112.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.2.0.post0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import ijson as ijson
import re
import pickle
import numpy as np

# Importing Required Dataframes

### Code to Generate dataframe pickle files
DO NOT RUN UNLESS REQUIRED. Skip to importing Pickle Files Directly

### Genrate Label pickle
Generate label_df. This contains user ids as index and labels as human or bot. 11826 entries

In [5]:
# Load label data
LABEL_PATH = '/content/drive/MyDrive/Twibot-20/label.csv'
df_label = pd.read_csv(LABEL_PATH)
df_label.set_index('id', inplace=True)
df_label

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
u17461978,human
u1297437077403885568,bot
u17685258,human
u15750898,human
u1659167666,bot
...,...
u452754350,bot
u850435801687183362,bot
u2188795745,bot
u940687680,bot


In [6]:
with open('label_pd.pickle', 'wb') as f:
    # Use the pickle.dump() function to save the dictionary to the file
    pickle.dump(df_label, f)

### Generate final Edge_file pickle

In [7]:
edge_path = '/content/drive/MyDrive/Twibot-20/edge.csv'
edge_file = pd.read_csv(edge_path, index_col = "target_id")
edge_file = edge_file[edge_file['relation'] == 'post']
edge_file = edge_file.drop(['relation'], axis = 1)
edge_file = edge_file[edge_file['source_id'].isin(df_label.index)]


with open('edge_file.pickle', 'wb') as f:
    pickle.dump(edge_file, f)

## Import Required Dataframes through pickle files

### Edge_file
Edge_file = contains tweet ids as index and user ids as column values in a pd df

In [8]:
edge_file_path = '/content/drive/MyDrive/Twibot-20/final_dataframes/edge_file.pickle'
edge_file = pd.read_pickle(edge_file_path)
edge_file

Unnamed: 0_level_0,source_id
target_id,Unnamed: 1_level_1
t0,u17461978
t1,u17461978
t2,u17461978
t3,u17461978
t4,u17461978
...,...
t1999783,u3385331674
t1999784,u3385331674
t1999785,u3385331674
t1999786,u3385331674


### Label df
Contains labels 

In [9]:
label_path = '/content/drive/MyDrive/Twibot-20/final_dataframes/edge_file.pickle'
label_df = pd.read_pickle(label_path)
label_df

Unnamed: 0_level_0,source_id
target_id,Unnamed: 1_level_1
t0,u17461978
t1,u17461978
t2,u17461978
t3,u17461978
t4,u17461978
...,...
t1999783,u3385331674
t1999784,u3385331674
t1999785,u3385331674
t1999786,u3385331674


# Helper Methods

In [10]:
def num_links(tweet_text: str) -> int:
    return len(re.findall(r'https?://\S+', tweet_text))

def num_hashtags(tweet_text: str) -> int:
    return len(re.findall(r'#\S+', tweet_text))

def num_characters(tweet_text: str) -> int:
    return len(tweet_text)

def num_words(tweet_text: str) -> int:
    return len(tweet_text.split(' '))
    
def num_mentions(tweet_text: str) -> int:
    return len(re.findall(r'@\S+', tweet_text))

def is_retweet(tweet_text: str) ->int:
    return int(tweet_text[0:2] == "RT")

**Welfords Algorithm to calculate running mean and std**

In [11]:
# For a new value newValue, compute the new count, new mean, the new M2.
# mean accumulates the mean of the entire dataset
# M2 aggregates the squared distance from the mean
# count aggregates the number of samples seen so far
def update(existingAggregate, newValue):
    (count, mean, M2) = existingAggregate
    delta = newValue - mean
    mean += delta / count
    delta2 = newValue - mean
    M2 += delta * delta2
    if count < 2:
      sampleVariance = 0
    else:
      sampleVariance =  M2 / (count - 1)
    return (mean, M2, sampleVariance)

**Cosine Distance for String Similarty**

In [12]:
#Using glove pretrained word vectors https://nlp.stanford.edu/projects/glove/
glove_path = "/content/drive/MyDrive/GloVe/glove.6B.200d.txt"

# bottom code is taken from https://www.kaggle.com/code/adepvenugopal/nlp-text-similarity-using-glove-embedding

def preprocess(tweet, model):
  import re
  from nltk.corpus import stopwords
  import pandas as pd
  words = re.sub("[^a-zA-Z]", " ", tweet).lower().split() #reformats tweet to only keep letters, and lowercase and splits them into a list
  # remove stopwords
  stopword_set = set(stopwords.words("english"))


  # return clean words
  return list(set([w for w in words if (w not in stopword_set) and (w in model.keys())]))


def load_glove_vectors(path):
  with open(path, encoding="utf8" ) as f:
    content = f.readlines()
  model = {}
  for line in content:
    splitLine = line.split()
    word = splitLine[0]
    embedding = np.array([float(val) for val in splitLine[1:]])
    model[word] = embedding
  return model

def cosine_distance_wordembedding_method(s1, s2, model):
  import scipy
  vector_1 = np.mean([model[word] for word in preprocess(s1, model)],axis=0)
  vector_2 = np.mean([model[word] for word in preprocess(s2, model)],axis=0)
  if vector_1.ndim != 1 or vector_2.ndim != 1:
    return 0 # this is not ideal, but sometimes our preprocessing can yield empty vectors because every word is not a real english word
  #print(vector_1.shape, vector_2.shape)
  cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
  #print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')
  return cosine



In [13]:
# loading GloVe model so it doesn't have to be loaded again
import itertools
import nltk
nltk.download('stopwords')
rolling_size = 5 # cosine_sim does not compute until we have traversed at least 10 tweets from a user
model = load_glove_vectors(glove_path)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Main Code for Extracting Features

In [23]:
#links user to their expanded tweet characteristics

tweet_stats = {}
DATA_PATH = '/content/drive/MyDrive/Twibot-20/node.json'
with open(DATA_PATH, 'rb') as f:
  #process each record
    counter = 0
    items = ijson.items(f, "item")
    #len_items = len(items)
    #print(len_items)
    for record in items:
      #print("iter ", counter)
      
      
      #if counter % 10000 == 0:
        #print("Percentage complete:", counter / len_items, "%")

      #link the tweet to user through edge_file
      if "t" in record["id"]:
        if record["id"] in edge_file.index:
          user = edge_file.loc[record["id"], 'source_id']
        else:
          continue
        counter+=1
        if counter % 1000 == 0:
          print("Percentage complete:", (counter / 1999787) * 100, "%")
        #print(counter)
        # #check if user exists in labeled dataset
        # if user not in df_label.index:
        #   continue

        #create user if does not exist
        if user not in tweet_stats.keys():
          tweet_stats[user] = {"Number": 0, 
                               "Links_mean": 0, 
                               "Links_std" : 0,
                               "Links_M2" : 0,
                               "Words_std": 0,
                               "Words_mean": 0,
                               "Words_M2": 0,
                               "Mentions_std": 0,
                               "Mentions_mean": 0,
                               "Mentions_M2" : 0,
                               "Hashtags_std": 0,
                               "Hashtags_mean": 0,
                               "Hashtags_M2" : 0,
                               "RT": 0,
                               "Prev_Tweets": [], #TODO: keep a rolling 10 tweets to perform pairwise distance calculations to avg
                               "Cosine_mean": 0,
                               "Cosine_std": 0,
                               "Cosine_M2": 0}

        # update tweet feature statistics for user
        if tweet_stats[user]["Number"] == 0:
            # first tweet for this user, initialize means and stds
            tweet_stats[user]["Number"] += 1
            tweet_stats[user]["Links_mean"] = num_links(record["text"])
            tweet_stats[user]["Hashtags_mean"] = num_hashtags(record["text"])
            tweet_stats[user]["Mentions_mean"]= num_mentions(record["text"])
            tweet_stats[user]["Words_mean"]= num_words(record["text"])
            tweet_stats[user]["RT"] += is_retweet(record["text"])
            tweet_stats[user]['Prev_Tweets'].append(record['text'])
        else:

            # first increase number
            tweet_stats[user]["Number"] += 1

            # now update mean, M2 and variance in one call
            
            # if we have collected enough tweets, calculate cosine dist and avg 
            tweet_stats[user]['Prev_Tweets'].append(record['text']) #add current tweet 

            if len(tweet_stats[user]['Prev_Tweets']) >= rolling_size: # if our size is now the rolling size, perform calc, than pop the lowest  
                combos = itertools.combinations(tweet_stats[user]['Prev_Tweets'], 2)
                avg = 0
                for tweet1, tweet2 in combos:
                  avg += cosine_distance_wordembedding_method(tweet1, tweet2, model)
                avg = avg / rolling_size
                #print("avg:", avg)
                newAggregate = update((tweet_stats[user]["Number"], tweet_stats[user]["Cosine_mean"], tweet_stats[user]["Cosine_M2"]), num_links(record["text"]))
                tweet_stats[user]["Cosine_mean"], tweet_stats[user]["Cosine_M2"], tweet_stats[user]["Cosine_std"] = newAggregate
                tweet_stats[user]['Prev_Tweets'].pop(0)
            else: #no calc, collect tweets
                tweet_stats[user]['Prev_Tweets'].append(record['text'])

            #update Links
            newAggregate = update((tweet_stats[user]["Number"], tweet_stats[user]["Links_mean"], tweet_stats[user]["Links_M2"]), num_links(record["text"]))
            tweet_stats[user]["Links_mean"], tweet_stats[user]["Links_M2"], tweet_stats[user]["Links_std"] = newAggregate

            #update Words
            newAggregate = update((tweet_stats[user]["Number"], tweet_stats[user]["Words_mean"], tweet_stats[user]["Words_M2"]), num_words(record["text"]))
            tweet_stats[user]["Words_mean"], tweet_stats[user]["Words_M2"], tweet_stats[user]["Words_std"] = newAggregate

            #update Mentions 
            newAggregate = update((tweet_stats[user]["Number"], tweet_stats[user]["Mentions_mean"], tweet_stats[user]["Mentions_M2"]), num_links(record["text"]))
            tweet_stats[user]["Mentions_mean"], tweet_stats[user]["Mentions_M2"], tweet_stats[user]["Mentions_std"] = newAggregate

            #update Hashtags
            newAggregate = update((tweet_stats[user]["Number"], tweet_stats[user]["Hashtags_mean"], tweet_stats[user]["Hashtags_M2"]), num_links(record["text"]))
            tweet_stats[user]["Hashtags_mean"], tweet_stats[user]["Hashtags_M2"], tweet_stats[user]["Hashtags_std"] = newAggregate

            #update RT count
            tweet_stats[user]["RT"] += is_retweet(record["text"])

Percentage complete: 0.0500053255671729 %
Percentage complete: 0.1000106511343458 %
Percentage complete: 0.1500159767015187 %
Percentage complete: 0.2000213022686916 %
Percentage complete: 0.25002662783586455 %
Percentage complete: 0.3000319534030374 %
Percentage complete: 0.35003727897021036 %
Percentage complete: 0.4000426045373832 %
Percentage complete: 0.4500479301045562 %
Percentage complete: 0.5000532556717291 %
Percentage complete: 0.550058581238902 %
Percentage complete: 0.6000639068060748 %
Percentage complete: 0.6500692323732478 %
Percentage complete: 0.7000745579404207 %
Percentage complete: 0.7500798835075936 %
Percentage complete: 0.8000852090747664 %
Percentage complete: 0.8500905346419394 %
Percentage complete: 0.9000958602091124 %
Percentage complete: 0.9501011857762852 %
Percentage complete: 1.0001065113434582 %
Percentage complete: 1.050111836910631 %
Percentage complete: 1.100117162477804 %
Percentage complete: 1.1501224880449767 %
Percentage complete: 1.200127813612

Create a pandas dataframe

In [30]:
pd_tweet_stats = pd.DataFrame.from_dict(tweet_stats, orient='index')
pd_tweet_stats

Unnamed: 0,Number,Links_mean,Links_std,Links_M2,Words_std,Words_mean,Words_M2,Mentions_std,Mentions_mean,Mentions_M2,Hashtags_std,Hashtags_mean,Hashtags_M2,RT,Prev_Tweets,Cosine_mean,Cosine_std,Cosine_M2
u17461978,200,1.035000,0.466106,92.755000,155.024095,24.355000,30849.795000,0.460704,1.040000,91.680000,0.466106,1.035000,92.755000,33,[RT @ValentinoKhan: HOT SAUCE RADIO W THE ONE ...,1.015000,0.467111,92.955000
u17685258,200,0.475000,0.300879,59.875000,101.380804,19.810000,20174.780000,0.300879,0.475000,59.875000,0.300879,0.475000,59.875000,130,[Over one-third of America. He is straight out...,0.465000,0.300276,59.755000
u15750898,200,0.865000,0.318367,63.355000,90.284196,24.415000,17966.555000,0.322010,0.860000,64.080000,0.322010,0.860000,64.080000,76,[RT @PR_NHL: Brayden Point continues to leave ...,0.850000,0.329146,65.500000
u1659167666,88,0.647727,0.230799,20.079545,45.558516,13.431818,3963.590909,0.227273,0.659091,19.772727,0.230799,0.647727,20.079545,46,[When you locking the doors at 10 and a custom...,0.625000,0.237069,20.625000
u34743251,200,0.890000,0.470251,93.580000,96.887211,17.415000,19280.555000,0.470251,0.890000,93.580000,0.474146,0.885000,94.355000,51,"[The deadline to submit is Wednesday, May 20 a...",0.870000,0.475477,94.620000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u452754350,200,0.835000,0.259070,51.555000,81.134774,20.470000,16145.820000,0.265729,0.840000,52.880000,0.262412,0.830000,52.220000,40,[Here is my much-belated analysis of the 2019 ...,0.825000,0.265704,52.875000
u850435801687183362,200,0.540000,0.269749,53.680000,90.186910,16.095000,17947.195000,0.278894,0.550000,55.500000,0.269749,0.540000,53.680000,30,"[@joshyto @WatchTheBreaks Damn, no go for me.....",0.535000,0.270126,53.755000
u2188795745,200,0.365000,0.333442,66.355000,102.492236,15.515000,20395.955000,0.346106,0.375000,68.875000,0.333442,0.365000,66.355000,94,[RT @OriginalFunko: RT &amp; follow @OriginalF...,0.360000,0.332060,66.080000
u940687680,145,0.262069,0.208621,30.041379,44.658333,16.600000,6430.800000,0.211877,0.268966,30.510345,0.208621,0.262069,30.041379,133,[RT @esrefziya: #dersahanemolmasaydı bugun ben...,0.262069,0.208621,30.041379


Drop unwanted columns of M2 from Dataset

In [31]:
pd_tweet_stats = pd_tweet_stats.drop(['Words_M2', 'Mentions_M2', 'Hashtags_M2', 'Links_M2', 'Cosine_M2', 'Prev_Tweets'], axis = 1)

View the final Dataset

In [32]:
pd_tweet_stats

Unnamed: 0,Number,Links_mean,Links_std,Words_std,Words_mean,Mentions_std,Mentions_mean,Hashtags_std,Hashtags_mean,RT,Cosine_mean,Cosine_std
u17461978,200,1.035000,0.466106,155.024095,24.355000,0.460704,1.040000,0.466106,1.035000,33,1.015000,0.467111
u17685258,200,0.475000,0.300879,101.380804,19.810000,0.300879,0.475000,0.300879,0.475000,130,0.465000,0.300276
u15750898,200,0.865000,0.318367,90.284196,24.415000,0.322010,0.860000,0.322010,0.860000,76,0.850000,0.329146
u1659167666,88,0.647727,0.230799,45.558516,13.431818,0.227273,0.659091,0.230799,0.647727,46,0.625000,0.237069
u34743251,200,0.890000,0.470251,96.887211,17.415000,0.470251,0.890000,0.474146,0.885000,51,0.870000,0.475477
...,...,...,...,...,...,...,...,...,...,...,...,...
u452754350,200,0.835000,0.259070,81.134774,20.470000,0.265729,0.840000,0.262412,0.830000,40,0.825000,0.265704
u850435801687183362,200,0.540000,0.269749,90.186910,16.095000,0.278894,0.550000,0.269749,0.540000,30,0.535000,0.270126
u2188795745,200,0.365000,0.333442,102.492236,15.515000,0.346106,0.375000,0.333442,0.365000,94,0.360000,0.332060
u940687680,145,0.262069,0.208621,44.658333,16.600000,0.211877,0.268966,0.208621,0.262069,133,0.262069,0.208621


Save the dataset into a pickle file.

In [34]:
with open('tweet_stats_pd.pickle', 'wb') as f:
    # Use the pickle.dump() function to save the dictionary to the file
    pickle.dump(pd_tweet_stats, f)