In [36]:
import pandas as pd
import numpy as np
import kaleido

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

pd.options.plotting.backend = "plotly"

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import datetime
from dateutil.relativedelta import relativedelta

In [22]:
# Load the per-tweet metrics table; the first CSV row supplies the column names.
tdf = pd.read_csv('all_tweet_metrics_sorted.csv', header=0)

# author_id -> number of non-null tweet ids recorded for that author.
user_tweet_count = (
    tdf.groupby('author_id')['id']
    .count()
    .to_dict()
)

tdf.head()

Unnamed: 0,id,author_id,retweets,replies,likes,quotes,created_at
0,7094,15,9,2,4,0,2006-06-19T22:52:24.000Z
1,56935,885,0,0,1,0,2006-11-05T19:19:57.000Z
2,10581681,797223,0,0,0,0,2007-03-21T17:22:33.000Z
3,13458821,1456281,0,0,0,0,2007-03-27T05:58:28.000Z
4,22038081,2838921,0,0,0,0,2007-04-08T13:44:12.000Z


In [23]:
# Columns discarded right after loading; they are not used by the cells below.
unused_user_columns = [
    'pp_url', 'verified', 'protected', 'listed', 'tweets',
    'followings', 'followers', 'location', 'username',
]

# Read the user table, trim whitespace around the header names, then drop
# the unused columns in one chained expression.
udf = (
    pd.read_csv('all_users_all_metrics.csv', header=0)
    .rename(columns=str.strip)
    .drop(columns=unused_user_columns)
)

udf.head()

Unnamed: 0,id,created_at
0,413080213,2011-11-15T13:19:06.000Z
1,493832011,2012-02-16T07:27:52.000Z
2,2989319032,2015-01-21T08:28:33.000Z
3,1042385216,2012-12-28T14:55:13.000Z
4,490149888,2012-02-12T08:22:41.000Z


In [50]:
def active_days(d_str, today=datetime.date(2021, 12, 31)):
    """Return the whole years between an account's creation and ``today``, minus one.

    Despite the name, the result is a count of *years*, not days.

    Parameters
    ----------
    d_str : str
        Creation timestamp such as ``'2011-11-15T13:19:06.000Z'``. Surrounding
        whitespace is stripped and everything from the first ``.`` onward
        (fractional seconds and the trailing ``Z``) is ignored.
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to 2021-12-31,
        the value that was previously hard-coded in the body.

    Returns
    -------
    int
        ``max(whole_years - 1, 0)``. The floor at zero matters: without it an
        account younger than one year yields ``-1``, which is non-zero and
        therefore turns into a negative ratio when used as a divisor.

    Raises
    ------
    ValueError
        If the part of ``d_str`` before the first ``.`` does not match
        ``%Y-%m-%dT%H:%M:%S``.
    """
    timestamp = d_str.strip().split('.')[0]
    creation_date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S').date()
    whole_years = relativedelta(today, creation_date).years
    # NOTE(review): the reason one year is subtracted is not visible in this
    # notebook (possibly to exclude the partial final year) -- confirm.
    return max(whole_years - 1, 0)

# Spot-check active_days on a few creation timestamps, one "<timestamp>: <value>" line each.
sample_timestamps = (
    "2011-11-15T13:19:06.000Z",
    "2012-02-16T07:27:52.000Z",
    "2015-01-21T08:28:33.000Z",
    "2020-01-01T08:28:33.000Z",
    "2020-12-20T08:28:33.000Z",
)
for stamp in sample_timestamps:
    print(f'{stamp}: {active_days(stamp)}')

2011-11-15T13:19:06.000Z: 9
2012-02-16T07:27:52.000Z: 8
2015-01-21T08:28:33.000Z: 5
2020-01-01T08:28:33.000Z: 0
2020-12-20T08:28:33.000Z: 0


In [51]:
# Derive per-user features from the creation timestamp and the tweet counts.

# 'YYYY-MM' prefix of the (whitespace-trimmed) creation timestamp.
udf['month'] = udf['created_at'].str.strip().str[:7]

# NOTE: despite the column name, this holds whatever active_days returns
# for each creation timestamp.
udf['days'] = udf['created_at'].map(active_days)

# Number of tweets per user; users absent from user_tweet_count get 0.
udf['lgbt_tweets'] = udf['id'].map(user_tweet_count).fillna(0).astype('int64')

# Tweets divided by 'days'. Only strictly positive spans are used as a
# divisor: a zero span would divide by zero and a negative span would
# produce a meaningless negative ratio, so both are reported as 0.
positive_span = udf['days'] > 0
udf['activity_ratio'] = (
    udf['lgbt_tweets'] / udf['days'].where(positive_span)
).fillna(0.0)

udf.head()

Unnamed: 0,id,created_at,days,lgbt_tweets,activity_ratio,month
0,413080213,2011-11-15T13:19:06.000Z,9,1,0.111111,2011-11
1,493832011,2012-02-16T07:27:52.000Z,8,40,5.0,2012-02
2,2989319032,2015-01-21T08:28:33.000Z,5,1,0.2,2015-01
3,1042385216,2012-12-28T14:55:13.000Z,8,13741,1717.625,2012-12
4,490149888,2012-02-12T08:22:41.000Z,8,1,0.125,2012-02


In [52]:
# Median activity_ratio of the accounts created in each calendar month.
# (The variable is named `avg`, but it holds medians.)
# The string alias 'median' is used rather than the NumPy callable, which
# recent pandas versions deprecate as an argument to groupby aggregation.
avg = (
    udf.groupby('month')
    .agg({'activity_ratio': 'median'})
    # Exclude the '1970-01' bucket (presumably placeholder epoch timestamps --
    # confirm). errors='ignore' keeps the cell from raising KeyError when no
    # such month is present in the data.
    .drop(index='1970-01', errors='ignore')
)
avg.head()

Unnamed: 0_level_0,activity_ratio
month,Unnamed: 1_level_1
2006-03,0.142857
2006-04,0.142857
2006-05,0.071429
2006-06,0.071429
2006-07,0.071429


In [53]:
# Line chart of the per-month values in `avg`, with the month index on the x axis.
figure = px.line(avg, x=avg.index, y='activity_ratio')

# Title, axis captions and legend caption, applied in a single layout update.
layout_text = dict(
    title='Monthly Median of Related Activity by User Account Creation Month',
    xaxis_title='Month of Account Creation',
    yaxis_title='Monthly Median of Related Activity',
    legend_title='Legends',
)
figure.update_layout(**layout_text)
figure.show()

In [20]:
# Restrict to accounts whose creation month falls in a year after 2019.
# The year is the part of 'YYYY-MM' before the dash; the comparison is done
# on the whole column at once instead of looking rows up one by one.
creation_year = udf['month'].str.split('-').str[0].astype(int)
indices = creation_year > 2019
g_data = udf[indices]

# Distribution of per-user tweet counts for each account creation month.
figure = px.box(g_data, x='month', y='lgbt_tweets')
figure.update_layout(
    title='Related Tweets per User by Account Creation Month (Accounts Created After 2019)',
    xaxis_title='Month of Account Creation',
    yaxis_title='Related Tweets per User')
figure.show()