In [1]:
from pprint import pprint


In [2]:
import numpy as np
import pandas as pd
import sqlite3
import plotly.express as px
import datetime
import time
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


In [3]:
from pathlib import Path
database = Path().resolve() / "clubhouse_archive/Clubhouse_Dataset_v1.db"
conn = sqlite3.connect(database)
cursor = conn.cursor()


In [4]:
# all tables
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())


[('user',)]


In [5]:

df_orig = pd.read_sql("SELECT * FROM user", conn)
df_orig['time_created'] = pd.to_datetime(df_orig['time_created'])


In [6]:
df_orig.columns


Index(['user_id', 'name', 'photo_url', 'username', 'twitter', 'instagram',
       'num_followers', 'num_following', 'time_created',
       'invited_by_user_profile'],
      dtype='object')

In [7]:
df = df_orig.drop(['twitter', 'instagram', 'photo_url'], axis=1)
df.head(3)


Unnamed: 0,user_id,name,username,num_followers,num_following,time_created,invited_by_user_profile
0,4,Rohan Seth,rohan,4187268,599,2020-03-17 07:51:28.085566+00:00,
1,5,Paul Davison,paul,3718334,1861,2020-03-17 14:36:19.468976+00:00,
2,8,Johnny Appleseed,apple1,20,81,2020-03-19 19:47:00.323603+00:00,


In [8]:
invited = df.invited_by_user_profile.astype(str).apply(
    lambda x: int(x) if x != "null" else np.nan).astype('Int64')
df.invited_by_user_profile = invited


In [9]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300515 entries, 0 to 1300514
Data columns (total 7 columns):
 #   Column                   Non-Null Count    Dtype              
---  ------                   --------------    -----              
 0   user_id                  1300515 non-null  int64              
 1   name                     1300515 non-null  object             
 2   username                 1300515 non-null  object             
 3   num_followers            1300515 non-null  int64              
 4   num_following            1300515 non-null  int64              
 5   time_created             1300515 non-null  datetime64[ns, UTC]
 6   invited_by_user_profile  1297301 non-null  Int64              
dtypes: Int64(1), datetime64[ns, UTC](1), int64(3), object(2)
memory usage: 70.7+ MB


In [10]:
df.columns


Index(['user_id', 'name', 'username', 'num_followers', 'num_following',
       'time_created', 'invited_by_user_profile'],
      dtype='object')

### Количество зарегистрированных пользователей

In [11]:
created_times = df.sort_values('time_created')
created_times["day_created"] = created_times.time_created.dt.date
created_by_days = created_times.groupby("day_created").user_id.count()
created_cumsum_by_days = created_by_days.cumsum()
##########
uninv = created_times[created_times.invited_by_user_profile.isna()]
uninv_created_by_days = uninv.groupby("day_created").user_id.count()
uninv_created_cumsum_by_days = uninv_created_by_days.cumsum()


In [12]:
fig = go.Figure()
fig.update_layout(title_text=f"Колличество новых пользователей по дням")
fig.add_trace(go.Scatter(x=created_by_days.index, y=created_by_days))
# fig.add_trace(go.Scatter(x=uninv_created_by_days.index, y=uninv_created_by_days))


**Промежуточный вывод**: 

* взровной всплеск притока пользоватлей произошел с ноября 2021 до февраля 2022

* с 10 января приток стал увеличиваться с 5 до 16 тысяч пользователей в день (к 23 января) 

In [13]:
fig = go.Figure()
fig.update_layout(
    title_text=f"Колличество зарегистированных пользователей по дням")
fig.add_trace(go.Scatter(x=created_cumsum_by_days.index,
              y=created_cumsum_by_days))
# fig.add_trace(go.Scatter(x=uninv_created_cumsum_by_days.index, y=uninv_created_cumsum_by_days))


##### Задача: найти инсайты по пользователям, которые больше всех приводят новых пользователей

* Портрет пользователя, оказавшего большое влияние на рост аудитории.
* Корреляция с другими параметрами.
* Когда приходили наиболее влиятельные пользователи?


In [14]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300515 entries, 0 to 1300514
Data columns (total 7 columns):
 #   Column                   Non-Null Count    Dtype              
---  ------                   --------------    -----              
 0   user_id                  1300515 non-null  int64              
 1   name                     1300515 non-null  object             
 2   username                 1300515 non-null  object             
 3   num_followers            1300515 non-null  int64              
 4   num_following            1300515 non-null  int64              
 5   time_created             1300515 non-null  datetime64[ns, UTC]
 6   invited_by_user_profile  1297301 non-null  Int64              
dtypes: Int64(1), datetime64[ns, UTC](1), int64(3), object(2)
memory usage: 70.7+ MB


Будем считать что влияние пользователя Х на рост аудитории - сколько Х привел друзей и сколько его друзей привели его друзья.

In [39]:
inv = df[['user_id', 'invited_by_user_profile']]
inv = inv.rename(columns={"user_id": "invited",
                 "invited_by_user_profile": "inviting"})
inv = inv[~inv.inviting.isna()].reset_index(drop=True)


In [61]:
inv.inviting.unique()



<IntegerArray>
[      4,    3712,     658,     843,     881,       5,      44,      72,
      12,     163,
 ...
 1499331,  629887, 1466770, 2213492, 1200740, 1001382, 1036146, 1486093,
  656029, 2082052]
Length: 556480, dtype: Int64

In [52]:
all_invitings = inv.copy()
all_invitings.rename(columns={'invited': '2ndgen'}, inplace=True)

all_invitings = all_invitings.merge(
    inv, how='left', left_on='2ndgen', right_on='inviting', suffixes=('_left', '_right'))
all_invitings.rename(
    columns={"inviting_left": "inviting", 'invited': "3rdgen"}, inplace=True)
all_invitings.drop('inviting_right', axis=1, inplace=True)
all_invitings = all_invitings.drop_duplicates()


Unnamed: 0,2ndgen,inviting,3rdgen
0,924,4,
1,1122,3712,
2,1150,4,617000.0
3,1159,658,154764.0
4,1159,658,155634.0
...,...,...,...
2014163,1499995,2082052,
2014164,1499996,175468,
2014165,1499997,357504,307439.0
2014166,1499998,741419,


In [66]:
brought_count_by_invitors = all_invitings.groupby(
    'inviting').count()['2ndgen']  # кол-во пришедших из-за inviting
brought_count_by_invitors=brought_count_by_invitors.sort_values(ascending=False)
# brought_count_by_invitors.head(30)


In [70]:
px.histogram(brought_count_by_invitors, title='Распределение количества приведенных новых пользователей')


In [88]:
best_hubs = pd.Series(
    brought_count_by_invitors[brought_count_by_invitors > 100].index)
best_hubs.index = best_hubs.values
best_hubs_appearance = df.merge(best_hubs, left_on='user_id', right_index=True)[
    ['user_id', 'time_created']]
best_hubs_appearance.time_created = best_hubs_appearance.time_created.dt.date
best_hubs_appearance  # сгруппировать по дням, отложить на графике


Unnamed: 0,user_id,time_created
0,4,2020-03-17
1,5,2020-03-17
55,72,2020-03-30
82,104,2020-04-05
126,150,2020-04-08
...,...,...
440128,490370,2020-12-16
465794,519139,2020-12-17
474059,528267,2020-12-17
507515,565798,2020-12-18


In [86]:
fig = go.Figure()
fig.update_layout(title_text=f"Колличество новых пользователей по дням")
fig.add_trace(go.Scatter(x=created_by_days.index, y=created_by_days))
fig.add_scatter(x=best_hubs_appearance.date)