In [132]:
import pandas as pd
import geopandas as gpd
import numpy as np
import plotly.express as px
import plotly.graph_objs as pgo
import plotly.figure_factory as ff
import plotly
import tweetdf
import math
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import haversine_distances
from datetime import timedelta
import apyori

In [2]:
# Load the tweet data set from CSV through the project-local `tweetdf` helper.
tweets = tweetdf.from_csv("../../dbs/Sweden.csv")

In [3]:
# Peek at the first row to see which columns are available.
tweets.head(1)

Unnamed: 0,userid,tweetid,createdat,latitude,longitude,month,weekday,hourofday,timezone
0,14153467,24287237314,2010-09-12 16:23:34+02:00,59.62227,16.50337,9.0,0.0,16.0,Europe/Stockholm


# Quantiles

We split the users into quantiles based on the number of tweets they have, because we expect temporal patterns to differ considerably between users with different numbers of tweets.

In [4]:
# Number of tweets per user, attached to every tweet row as `tweet_count`
tweets_per_user = (
    tweets
    .groupby('userid', as_index=False)
    .count()[['userid', 'tweetid']]
    .rename(columns={'tweetid': 'tweet_count'})
)
tweets = tweets.merge(tweets_per_user, on='userid')

In [5]:
# Assign each user to a quantile based on number of tweets.
# NOTE: qcut is computed over tweet rows (one `tweet_count` value per tweet),
# so the five bins hold roughly equal numbers of tweets, not of users.
tweets = tweets.assign(quantile=pd.qcut(tweets['tweet_count'], q=5))

In [6]:
# Number of tweet rows that fall in each quantile bin.
tweets.groupby('quantile').size()

quantile
(4.999, 281.0]      255425
(281.0, 543.0]      255143
(543.0, 891.0]      257120
(891.0, 1503.0]     257624
(1503.0, 4778.0]    250703
dtype: int64

In [7]:
# Index by userid so that all tweets of a user can be selected with .loc.
tweets = tweets.set_index(['userid'])

To ease the analysis we pick one user from each quantile.

In [8]:
# The first row of each quantile bin identifies one user (the index is userid);
# .loc with those userids then pulls in every tweet of the selected users.
usertweets = tweets.loc[tweets.groupby('quantile').head(1).index]

In [9]:
# Re-index on (userid, tweetid) so each row is addressed by user and tweet.
usertweets = usertweets.reset_index().set_index(['userid', 'tweetid'])

In [10]:
# Size of the sample: (number of tweets, number of columns).
usertweets.shape

(4482, 9)

In [11]:
# Number of tweets for each of the selected users.
usertweets.groupby('userid').size()

userid
11773412     633
14153467     479
18192787    2182
38429882    1034
49959063     154
dtype: int64

# Spatial clustering

We cluster the tweets into regions with DBSCAN, and assign each tweet the region it belongs to.

In [12]:
def cluster_spatial(tws, eps_km=0.1, min_samples=1):
    """
    Cluster tweets spatially with DBSCAN using the haversine metric.

    Parameters
    ----------
    tws : DataFrame with 'latitude' and 'longitude' columns, in degrees.
    eps_km : neighbourhood radius in kilometres.
    min_samples : DBSCAN `min_samples` parameter.

    Returns
    -------
    Array with one cluster label per row of `tws`, in row order.
    """
    km_per_radian = 6371.0088
    # The haversine metric works on (lat, lon) pairs expressed in radians,
    # so both the coordinates and the radius are converted.
    latlon_rad = np.radians(tws[['latitude', 'longitude']].values)
    model = DBSCAN(
        eps=eps_km / km_per_radian,
        min_samples=min_samples,
        metric='haversine',
    )
    model.fit(latlon_rad)
    return model.labels_

In [123]:
# Cluster every user's tweets separately, so region labels are only meaningful
# within a single user (label 0 of one user is unrelated to label 0 of another).
clusters = pd.Series(index=usertweets.index, dtype=int)
for uid in usertweets.index.get_level_values(0).unique():
    # Level 0 of the index is userid; fill that user's slice with DBSCAN labels.
    clusters.loc[uid] = cluster_spatial(usertweets.loc[uid])
# Cast to int before attaching the labels as the `region` column.
usertweets = usertweets.assign(region=clusters.astype(int))

In [125]:
# Check that the `region` column was added.
usertweets.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,createdat,latitude,longitude,month,weekday,hourofday,timezone,tweet_count,quantile,region
userid,tweetid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
14153467,24287237314,2010-09-12 16:23:34+02:00,59.62227,16.50337,9.0,0.0,16.0,Europe/Stockholm,479,"(281.0, 543.0]",0


# Gaps
To find sequential patterns we construct "gaps" from the tweets. A gap is a pair of tweets with a duration between them.

In [137]:
def gaps(df):
    """
    Find gaps between consecutive tweets.

    Keeps all existing columns suffixed with (_origin, _destination).
    Adds a new column "duration" which is the timedelta between the two tweets.
    Assumes the dataframe is sorted on the createdat column and has
    "createdat" and "region" columns.

    Rows with a missing value in any column are dropped before pairing.
    The incoming index is discarded; the result has a fresh 0..n-1 index
    with one row per pair of adjacent (complete) tweets.
    """
    # Drop incomplete rows once, up front. Dropping them separately on a
    # shifted origin and a shifted destination frame leaves the two sides with
    # different lengths whenever the first or last row is incomplete, and the
    # positional join then pairs every tweet with itself.
    complete = df.dropna()
    # Slicing (instead of shift) also keeps the original dtypes: integer
    # columns are not upcast to float by an injected NaN row.
    origin = complete.iloc[:-1].reset_index(drop=True)
    destination = complete.iloc[1:].reset_index(drop=True)
    pairs = origin.join(destination, lsuffix="_origin", rsuffix="_destination")
    pairs = pairs.assign(duration=pairs['createdat_destination'] - pairs['createdat_origin'])
    pairs['region_origin'] = pairs['region_origin'].astype(int)
    pairs['region_destination'] = pairs['region_destination'].astype(int)
    return pairs

In [138]:
# Build the gaps per user. Each user's tweets are sorted by time first,
# because gaps() pairs adjacent rows.
usergaps = usertweets.groupby('userid').apply(lambda df: gaps(df.sort_values('createdat')))

In [139]:
# Inspect one gap: origin columns, destination columns and the duration.
usergaps.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,createdat_origin,latitude_origin,longitude_origin,month_origin,weekday_origin,hourofday_origin,timezone_origin,tweet_count_origin,quantile_origin,region_origin,...,latitude_destination,longitude_destination,month_destination,weekday_destination,hourofday_destination,timezone_destination,tweet_count_destination,quantile_destination,region_destination,duration
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
11773412,0,2010-09-13 17:21:37-04:00,39.942592,-75.157083,9.0,1.0,17.0,America/New_York,633.0,"(543.0, 891.0]",0,...,40.12421,-74.962517,9.0,1.0,18.0,America/New_York,633.0,"(543.0, 891.0]",1,01:26:04


## Duration

The distribution of gap durations is VERY long-tailed, with some durations exceeding one year. However, the majority of gaps are short (<20 hours), which could still be useful for analysis.

In [140]:
# One overlaid histogram per user of gap durations in whole hours,
# restricted to gaps shorter than one week.
hours_per_week = 24 * 7
duration_hists = []
for uid in usergaps.index.get_level_values(0).unique():
    # Floor every gap to whole hours. Floor division by a Timedelta gives the
    # same numbers as the former astype('<m8[h]') conversion, but also works on
    # pandas versions that only accept s/ms/us/ns timedelta resolutions.
    hours = usergaps.loc[uid]['duration'] // pd.Timedelta(hours=1)
    hours = hours[hours < hours_per_week]
    duration_hists.append(
        pgo.Histogram(
            x=hours,
            histnorm="percent",
            opacity=0.75,
            # Trace names are strings; convert the integer user id explicitly.
            name=str(uid),
            nbinsx=hours_per_week,
        ),
    )

pgo.Figure(
    data=duration_hists,
    layout=pgo.Layout(
        barmode='overlay',
        xaxis=dict(
            title='gap duration (hours)'
        ),
        yaxis=dict(
            title='percentage of gaps'
        )
    ),
)

## Region ODM

Looking for patterns that are easy to spot at a glance in an origin-destination matrix (ODM) of regions. Only gaps shorter than 24 hours are included, and self-loops (origin and destination in the same region) are ignored.

Some patterns emerge, most specifically regions with "towers", which have been the destination of several other regions. Cross checking with previous analyses these correspond to home or work location. 

It has not yet been verified what some of the other towers are; one hypothesis is that they are secondary home locations or previous workplaces. Needs to be verified.

In [199]:
def region_transition_probability(df):
    """
    Row-normalised transition matrix between regions.

    Parameters
    ----------
    df : DataFrame of gaps with integer 'region_origin' and
        'region_destination' columns.

    Returns
    -------
    2-D numpy array `m` of shape (k, k), where k is the largest region label
    in `df` plus one. `m[i, j]` is the share of gaps leaving region i that
    arrive in region j. Rows for regions without outgoing gaps are all zero.
    An empty `df` yields an empty (0, 0) array.
    """
    # Without any gaps there is no largest label to size the matrix with
    # (max() of an empty column is NaN), so return an empty matrix.
    if df.empty:
        return np.zeros((0, 0))

    max_label = int(max(df['region_origin'].max(), df['region_destination'].max()))
    labels = np.arange(max_label + 1)
    # Count transitions and expand to the full label x label grid, so that
    # regions which never occur as origin/destination still get a row/column.
    counts = (
        df.groupby(['region_origin', 'region_destination'])
        .size()
        .unstack(fill_value=0)
        .reindex(index=labels, columns=labels, fill_value=0)
    )
    row_totals = counts.sum(axis=1)
    # Avoid 0/0 for regions without outgoing gaps; their rows stay all zero.
    row_totals = row_totals.where(row_totals != 0, 1)
    return counts.div(row_totals, axis=0).values

In [235]:
c = px.colors.sequential.Viridis
def logprobscale(original_scale=px.colors.sequential.Viridis):
    """
    Turn a sequential colour list into a colourscale with logarithmic stops.

    The first colour is pinned at 0 and the last at 1; the colours in between
    are placed at ..., 1/8, 1/4, 1/2, so that small probabilities remain
    distinguishable.

    Returns a list of (stop, colour) pairs.
    """
    # Use the scale that was passed in rather than a module-level colour list.
    n_colors = len(original_scale)
    stops = np.concatenate(([0], 1/(2**np.arange(n_colors-2, 0, step=-1)), [1]))
    return list(zip(stops, original_scale))

# One heatmap per user, laid out on a two-column grid that grows with the
# number of users.
user_ids = usergaps.index.get_level_values(0).unique()
n_cols = 2
n_rows = max(1, math.ceil(len(user_ids) / n_cols))

fig = plotly.subplots.make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=list(user_ids.astype(str)),
    vertical_spacing=0.04
)

for i, uid in enumerate(user_ids):
    d = usergaps.loc[uid]
    # Keep only gaps shorter than a day that move between two different regions.
    d = d[d['duration'] < timedelta(hours=24)]
    d = d[d['region_origin'] != d['region_destination']]
    # Fill the grid row by row; plotly rows/cols are 1-based.
    row, col = divmod(i, n_cols)
    fig.add_trace(
        pgo.Heatmap(
            z=region_transition_probability(d),
            colorscale=logprobscale(),
        ),
        row=row + 1,
        col=col + 1,
    )
fig.update_layout(height=1000)
fig.show()

## Frequent item sets

Using the Apriori algorithm, which finds items (regions) that are often visited together during the same day. There are two levels of detail readily available: support based and confidence/lift based.

**Support**
Each item set has a *support*: the fraction of all daily item sets (one per user and day) that contain this item set.

**Confidence/lift**
Each item set is split into a *base-set* and an *add-set* (~combinatorically). Each such split has a *confidence*, denoting the probability that given the base-set you will observe the add-set, and a *lift*, denoting how much more likely you are to observe the add-set given the base-set than just by itself.


In [186]:
def itemsets(gapsdf):
    """
    Collect, per user and calendar day, the list of regions visited.

    `gapsdf` is expected to carry a 'userid' index level and the columns
    'createdat_origin', 'region_origin' and 'region_destination'. Gaps are
    assigned to the day of their origin tweet.

    Returns a Series indexed by (userid, date) whose values are lists of
    region labels: every origin region of that day followed by the
    destination region of the day's last gap.
    """
    def regionsets(day_gaps):
        visited = day_gaps['region_origin'].tolist()
        final_stop = day_gaps['region_destination'].iloc[-1:].tolist()
        return visited + final_stop

    # calendar date (YYYY-MM-DD) of the origin tweet of every gap
    origin_dates = gapsdf['createdat_origin'].apply(lambda ts: ts.date())
    return gapsdf.groupby(['userid', origin_dates]).apply(regionsets)

def apriori(gapsdf, min_support=0.005, min_confidence=0.3):
    """
    Frequent region item sets per user (support based view).

    Builds the daily region lists of `gapsdf` with `itemsets` and runs
    apyori's Apriori implementation on each user's lists separately.

    Parameters
    ----------
    gapsdf : DataFrame of gaps, as accepted by `itemsets`.
    min_support, min_confidence : thresholds passed on to `apyori.apriori`.

    Returns
    -------
    DataFrame indexed by 'userid' with columns 'itemset' (list of regions),
    'length' (number of regions in the set) and 'support'.
    """
    # Use the frame that was passed in, not a notebook-level variable.
    itemsetsdf = itemsets(gapsdf)
    rows = []
    for uid in itemsetsdf.index.get_level_values(0).unique():
        relations = apyori.apriori(
            itemsetsdf.loc[uid].tolist(),
            min_support=min_support,
            min_confidence=min_confidence,
        )
        for rel in relations:
            rows.append([
                uid,
                list(rel.items),
                len(rel.items),
                rel.support,
            ])
    df = pd.DataFrame(rows, columns=['userid', 'itemset', 'length', 'support'])
    return df.set_index('userid')

def apriori_detail(gapsdf, min_support=0.005, min_confidence=0.3):
    """
    Association rules between regions per user (confidence/lift based view).

    Builds the daily region lists of `gapsdf` with `itemsets`, runs apyori's
    Apriori implementation on each user's lists separately, and emits one row
    per ordered statistic (base-set -> add-set split) of every item set found.

    Parameters
    ----------
    gapsdf : DataFrame of gaps, as accepted by `itemsets`.
    min_support, min_confidence : thresholds passed on to `apyori.apriori`.

    Returns
    -------
    DataFrame indexed by 'userid' with columns 'support', 'baselen', 'addlen',
    'base', 'add', 'confidence' and 'lift'.
    """
    # Use the frame that was passed in, not a notebook-level variable.
    itemsetsdf = itemsets(gapsdf)
    rows = []
    for uid in itemsetsdf.index.get_level_values(0).unique():
        relations = apyori.apriori(
            itemsetsdf.loc[uid].tolist(),
            min_support=min_support,
            min_confidence=min_confidence,
        )
        for rel in relations:
            for stat in rel.ordered_statistics:
                rows.append([
                    uid,
                    rel.support,
                    len(stat.items_base),
                    len(stat.items_add),
                    list(stat.items_base),
                    list(stat.items_add),
                    stat.confidence,
                    stat.lift,
                ])
    df = pd.DataFrame(rows, columns=['userid', 'support', 'baselen', 'addlen', 'base', 'add', "confidence", "lift"])
    return df.set_index('userid')

### Support based view

In [187]:
# Frequent item sets for the sampled users; the confidence threshold is set
# low (0.01) for this support based view.
useritemsets = apriori(usergaps, min_support=0.005, min_confidence=0.01)

In [191]:
# The two item sets with the highest support for every (user, item set length) pair.
useritemsets.groupby(['userid', 'length']).apply(lambda df: df.nlargest(2, 'support')[['itemset', 'support']])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemset,support
userid,length,userid,Unnamed: 3_level_1,Unnamed: 4_level_1
11773412,1,11773412,[56],0.290055
11773412,1,11773412,[57],0.290055
11773412,2,11773412,"[56, 57]",0.088398
11773412,2,11773412,"[56, 106]",0.013812
11773412,3,11773412,"[56, 57, 95]",0.008287
11773412,3,11773412,"[56, 57, 117]",0.005525
14153467,1,14153467,[121],0.247104
14153467,1,14153467,[7],0.204633
14153467,2,14153467,"[22, 7]",0.054054
14153467,2,14153467,"[70, 7]",0.027027


### Confidence based view

In [194]:
# Top three rules for every (user, base-set size, add-set size) combination,
# ranked by support and then by confidence.
apriori_detail(usergaps, min_confidence=0.1) \
    .groupby(['userid', 'baselen', 'addlen']) \
    .apply(lambda df: df.nlargest(3, ['support', 'confidence']))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,support,baselen,addlen,base,add,confidence,lift
userid,baselen,addlen,userid,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11773412,0,1,11773412,0.290055,0,1,[],[56],0.290055,1.0
11773412,0,1,11773412,0.290055,0,1,[],[57],0.290055,1.0
11773412,1,1,11773412,0.088398,1,1,[56],[57],0.304762,1.050703
11773412,1,1,11773412,0.088398,1,1,[57],[56],0.304762,1.050703
11773412,1,1,11773412,0.013812,1,1,[106],[56],0.5,1.72381
11773412,1,2,11773412,0.008287,1,2,[95],"[56, 57]",0.75,8.484375
11773412,1,2,11773412,0.005525,1,2,[117],"[56, 57]",0.2,2.2625
11773412,2,1,11773412,0.008287,2,1,"[57, 95]",[56],1.0,3.447619
11773412,2,1,11773412,0.008287,2,1,"[56, 95]",[57],0.75,2.585714
11773412,2,1,11773412,0.005525,2,1,"[56, 117]",[57],0.5,1.72381
