In [6]:
import networkx as nx
import pandas as pd
from datetime import datetime
from random import sample
import matplotlib.pyplot as plt
from thesis_library import *
from tqdm import tqdm, trange
import time
import pickle
import os

def str_to_datetime(text: str) -> datetime:
    '''
    Parses a timestamp string into a (naive) datetime.

    text:    timestamp written either as "YYYY-MM-DDTHH:MM:SSZ" or as "YYYY-MM-DD HH:MM:SS"

    returns: the parsed datetime
    raises:  ValueError if text matches neither layout
    '''
    try:
        return datetime.strptime(text, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        #Only a format mismatch should trigger the fallback layout; anything else
        #(KeyboardInterrupt, a TypeError for non-string input, ...) has to propagate
        return datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
    
def sample_or_all(thing, size):
    '''
    Draws size elements from thing without replacement.

    When thing holds fewer than size elements, thing itself is handed back unchanged
    (the same object, not a copy); otherwise a new list of size sampled elements is returned.
    '''
    return thing if size > len(thing) else sample(thing, size)

In [4]:
def read_data(short = True, timestamp_threshold = None) -> tuple:
    '''
    Reads the Gowalla check-ins and friendship edges from ./data/Gowalla and adds a relative
    day index (column time_elapsed_days) to the check-ins, which are returned sorted by time.

    short:               if True, reads the "_short.csv" extracts; if False, reads the full
                         tab-separated dumps
    timestamp_threshold: if int, skips the check-ins with almost-unique timestamps, i.e. those whose
                         day holds timestamp_threshold check-ins or fewer
                         e.g. if there is a single check-in on 26th January, skip it.
                         If None, does not perform the skipping

    returns:             df_checkins, df_edges
    '''

    if not short:
        df_checkins = pd.read_table('./data/Gowalla/Gowalla_totalCheckins.txt', delimiter='\t', header=None)
        df_checkins.columns = ['user', 'time', 'latitude', 'longitude', 'location_id']

        df_edges = pd.read_table('./data/Gowalla/Gowalla_edges.txt', delimiter='\t', header=None)
        #header=None produces integer column labels (0, 1), while create_a_friendship_graph
        #groups by the string labels '0' and '1'; stringify them so both inputs behave alike
        df_edges.columns = [str(column) for column in df_edges.columns]

    else:
        df_checkins = pd.read_csv("./data/Gowalla/Gowalla_totalCheckins_short.csv")
        df_edges = pd.read_csv("./data/Gowalla/Gowalla_edges_short.csv")

    #Processing the dates and times
    #Setting the relative time (in days) w.r.t. the first day present in the dataframe
    df_checkins['time'] = df_checkins['time'].apply(str_to_datetime)
    min_date = df_checkins['time'].min()
    df_checkins['time_elapsed_days'] = (df_checkins['time'] - min_date).apply(lambda x: x.days)

    df_checkins = df_checkins.sort_values(['time'], ascending=True).reset_index(drop=True)

    print("There are", len(df_checkins['user'].unique()), "unique users present.")
    print("There are", len(df_checkins['location_id'].unique()), "unique locations present.")

    #Delete rows with time_elapsed being equal to some value that appears almost no times (once, twice etc.)
    if type(timestamp_threshold) is int:
        #Number of check-ins recorded on the same day as each row
        checkins_that_day = df_checkins.groupby('time_elapsed_days')['user'].transform('count')
        #Column kept so the returned frame has the same columns as before
        df_checkins['correct_timestamps'] = checkins_that_day > timestamp_threshold
        #The selection has to be assigned back; a bare df[...] expression discards the filtered frame
        df_checkins = df_checkins[df_checkins['correct_timestamps']].reset_index(drop=True)

    print("There are", len(df_checkins['user'].unique()), "unique users present (after thresholding).")
    print("There are", len(df_checkins['location_id'].unique()), "unique locations present (after thresholding).")

    return df_checkins, df_edges

In [None]:
#Load the short extracts; timestamp_threshold=5 asks read_data to skip days with very few check-ins
df_checkins, df_edges = read_data(short=True, timestamp_threshold=5)

In [None]:
def create_a_friendship_graph(df_edges: pd.DataFrame, return_graph=False, max_friends=100) -> "dict | nx.Graph":
    '''
    Given all the edges, constructs an object storing all friendship connections
    Warning: can be huge
    The graph version is slow

    df_edges:     edge list with the user in column '0' and the friend in column '1'
    return_graph: if True, returns an nx.Graph; otherwise a dict mapping user -> set of friends
    max_friends:  users with max_friends or more friends are pruned out

    returns:      the dict or the nx.Graph described above
    '''

    #Creates a dataframe that has nodes (0-999...) as indices and a single column with a set of friends
    df_grouped = df_edges.groupby('0').agg(set)

    #Keeps only the users with fewer than max_friends friends
    dic = {key: value for key, value in zip(df_grouped.index, list(df_grouped['1'])) if len(value) < max_friends}

    if not return_graph:
        return dic

    #Converts the kept users into an nx.Graph
    g = nx.Graph()
    for user, friends in dic.items():
        g.add_edges_from((user, friend) for friend in friends)

    #A pruned user can still enter the graph as somebody else's friend, so prune by degree as well;
    #>= keeps the cut-off identical to the one used for the dictionary
    g.remove_nodes_from([node for node in g.nodes if len(g[node]) >= max_friends])
    return g

In [None]:
#Obtaining the dictionary of friendships
#Pruning away all users with max_friends or more friends
dic = create_a_friendship_graph(df_edges, return_graph=False, max_friends=15)
#Keep only the check-ins whose user survived the pruning; isin is vectorised,
#unlike a row-wise apply, and also behaves on an empty frame
df_checkins = df_checkins[df_checkins['user'].isin(list(dic))]

In [None]:
def create_dataset(df_checkins: pd.DataFrame, dic: dict, sample_size=None):
    '''
    Turns every check-in into [user, friend, time_elapsed_days] edges, one per selected friend.

    df_checkins: check-ins with 'user' and 'time_elapsed_days' columns; every user has to be a key of dic
    dic:         maps a user to the collection of that user's friends
    sample_size: determines the number of edges sent for every checkin created;
                 None creates an edge for every friend

    returns:     data, label - the edge list and one 0 label per edge
    '''

    #Upper bound on the edges a single check-in can produce
    most_friends = max((len(friends) for friends in dic.values()), default=0)
    per_checkin = most_friends if sample_size is None else min(sample_size, most_friends)
    print("Can take at most", len(df_checkins)*per_checkin, " iterations.")

    data = []

    #Plain column lists avoid building one Series per row and give tqdm a known total
    users = df_checkins['user'].tolist()
    days = df_checkins['time_elapsed_days'].tolist()

    for user, day in tqdm(zip(users, days), total=len(users)):

        #If sample size is None, create an edge for EVERY friend
        if sample_size is None:
            friends = dic[user]
        #If sample size is an integer, take a sample of that many friends to add edges for them
        else:
            friends = sample_or_all(list(dic[user]), sample_size)

        for friend in friends: #Skipping the weight = 1 here for data reduction
            data.append([user, friend, day])

    #One label per edge (not per check-in), so data and label always have the same length
    label = [0] * len(data)

    return data, label

In [None]:
#Build the edge list from df_checkins, using at most 5 sampled friends per check-in
data, label = create_dataset(df_checkins, dic, sample_size=5)

In [2]:
#Pickling (saving) the list to the disk - the commented-out block wrote the file that is loaded below:

# with open('./data/Gowalla/Gowalla_short_edges_times_unplanted.txt', 'wb') as fp:
#     pickle.dump(data, fp)
#     #Works, but the file is a binary pickle (despite the .txt extension), so it is not human-readable
    
#NOTE(review): pickle.load can execute arbitrary code - only load files written by this notebook
#NOTE(review): np is not imported by name in the import cell; presumably it arrives through
#              `from thesis_library import *` - TODO confirm
with open('./data/Gowalla/Gowalla_short_edges_times_unplanted.txt', 'rb') as fp:
    X = np.array(pickle.load(fp))
    
#One label per loaded edge, all 0 (presumably 0 = normal, as nothing has been planted yet)
y = [0] * len(X)

In [5]:
#Plant a handful of cliques - guide
#Back-of-the-envelope sizing only, nothing is planted here. The literals 19, 5 and 16 match the
#n_vertices, n_repetitions and n_imputations passed to plant_anomalies, and 171 = 19*(19-1)//2
print("We want 1% of the data to be anomalous, so a bit over", len(X)//100, "edges to be planted.")
print("Since a clique has 1/2(n^2-n) edges per n vertices, we can use 19 nodes for", 19*(19-1)//2, "edges.")
print("Then, we will repeat every edge 5 times for", 5*171, "edges per clique.")
print("We will plant them in 16 of them for", 16*171*5, "edges.")

We want 1% of the data to be anomalous, so a bit over 11192 edges to be planted.
Since a clique has 1/2(n^2-n) edges per n vertices, we can use 19 nodes for 171 edges.
Then, we will repeat every edge 5 times for 855 edges per clique.
We will plant them in 16 of them for 13680 edges.


In [10]:
#Plant 16 cliques of 19 vertices with every edge repeated 5 times (values taken from the arguments).
#plant_anomalies is not defined in this notebook - presumably it comes from thesis_library.
#NOTE(review): the return value is discarded and X is an ndarray, so confirm that plant_anomalies
#              persists its result itself.
#NOTE(review): the recorded output reports 27325 anomaly edges, about twice the 13680 estimated in
#              the sizing guide - confirm whether this is intended.
plant_anomalies(X, y, dataset='Gowalla', n_imputations=16, n_vertices=19, n_repetitions=5, anomaly_type='clique')

Planting an anomaly at timestamp: 114
Planting an anomaly at timestamp: 214
Planting an anomaly at timestamp: 289
Planting an anomaly at timestamp: 292
Planting an anomaly at timestamp: 309
Planting an anomaly at timestamp: 349
Planting an anomaly at timestamp: 359
Planting an anomaly at timestamp: 363
Planting an anomaly at timestamp: 414
Planting an anomaly at timestamp: 430
Planting an anomaly at timestamp: 449
Planting an anomaly at timestamp: 533
Planting an anomaly at timestamp: 550
Planting an anomaly at timestamp: 590
Planting an anomaly at timestamp: 591
Planting an anomaly at timestamp: 618
There are 1146561 edges, out of which 27325 anomaly edges.


In [7]:
def create_uniform_data(n: int):
    '''For each pair of distinct nodes, creates 1 edge (u, v, 1) with u < v and saves the list to pickle

    n: int, the number of nodes

    The list goes to ./data/Uniform/uniform_data_<n>.txt (the directory is created when missing);
    a file that already exists is left untouched. Returns nothing.'''

    print("Creating " + str(n*(n-1)//2) + " edges.")

    directory = './data/Uniform'
    path = os.path.join(directory, 'uniform_data_' + str(n) + '.txt')

    #Nothing to do when an earlier run already wrote this dataset
    if os.path.exists(path):
        return

    #v starts after u so every unordered pair appears exactly once, i.e. n*(n-1)//2 edges as announced
    data = [(u, v, 1) for u in range(n) for v in range(u + 1, n)]

    #Pickling (saving) the list to the disk:
    os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as fp:
        pickle.dump(data, fp)

In [8]:
#Generate the uniform edge list for 700 nodes and pickle it unless the file is already on disk
create_uniform_data(n=700)

Creating 244650 edges.
Length of the dataset: 244650
