# Guide Book Data Science

In [1]:
from collections import Counter, defaultdict
from typing import List, Dict, Any
import pandas as pd

from input_data import INPUT_DATA

# Use the fully-qualified option name: the bare 'precision' pattern is ambiguous
# on pandas releases that also define 'styler.format.precision', and set_option
# raises OptionError when a pattern matches more than one key.
pd.set_option('display.precision', 2)

# Peek at the first 10 raw records: each is a dict with an 'event' name and a
# 'properties' dict holding the 'user_id'.
print(INPUT_DATA[:10])

[{'event': 'GuideSession', 'properties': {'user_id': 757}}, {'event': 'ConnectionRequested', 'properties': {'user_id': 194}}, {'event': 'GuideSession', 'properties': {'user_id': 1656}}, {'event': 'GuideSession', 'properties': {'user_id': 563}}, {'event': 'GuideSession', 'properties': {'user_id': 1276}}, {'event': 'GuideSession', 'properties': {'user_id': 1077}}, {'event': 'GuideDownload', 'properties': {'user_id': 2109}}, {'event': 'GuideSession', 'properties': {'user_id': 918}}, {'event': 'GuideSession', 'properties': {'user_id': 1349}}, {'event': 'GuideSession', 'properties': {'user_id': 525}}]


In [2]:
# Show the first 30 records as "<event name> ==> <user id>" lines.
for event_metric in INPUT_DATA[:30]:
    event_name = event_metric['event']
    user_id = event_metric['properties']['user_id']
    print(f"{event_name} ==> {user_id}")

GuideSession ==> 757
ConnectionRequested ==> 194
GuideSession ==> 1656
GuideSession ==> 563
GuideSession ==> 1276
GuideSession ==> 1077
GuideDownload ==> 2109
GuideSession ==> 918
GuideSession ==> 1349
GuideSession ==> 525
GuideSession ==> 1223
GuideSession ==> 238
GuideSession ==> 542
GuideSession ==> 2429
GuideSession ==> 2331
GuideSession ==> 1625
GuideSession ==> 2262
GuideSession ==> 230
GuideSession ==> 1477
GuideSession ==> 1602
GuideSession ==> 718
GuideSession ==> 1806
GuideSession ==> 1001
GuideSession ==> 317
PhotoUpload ==> 109
GuideSession ==> 781
GuideSession ==> 1256
GuideSession ==> 2131
GuideDownload ==> 1298
GuideSession ==> 1808


In [3]:
# Collect the user_id of every event in input order. There is one entry per
# event, so the same user id can appear many times.
user_ids = [event_metric['properties']['user_id'] for event_metric in INPUT_DATA]
print(user_ids[:30])

[757, 194, 1656, 563, 1276, 1077, 2109, 918, 1349, 525, 1223, 238, 542, 2429, 2331, 1625, 2262, 230, 1477, 1602, 718, 1806, 1001, 317, 109, 781, 1256, 2131, 1298, 1808]


In [4]:
# De-duplicate the ids; the set is used below to count distinct users.
user_ids_unique = set(user_ids)
# NOTE(review): this displays the original list (duplicates, input order),
# not user_ids_unique — confirm that is what was intended.
user_ids[:20]

[757,
 194,
 1656,
 563,
 1276,
 1077,
 2109,
 918,
 1349,
 525,
 1223,
 238,
 542,
 2429,
 2331,
 1625,
 2262,
 230,
 1477,
 1602]

In [5]:
# Total number of events (one id per event) followed by the number of distinct users.
print(len(user_ids), len(user_ids_unique), sep="\n")

58859
2500


In [6]:
def events_per_user(event_metrics: List[Dict[str, Any]]) -> defaultdict:
    """Group event names by the user that triggered them.

    Args:
        event_metrics: Event records, each carrying an ``'event'`` name and a
            ``'properties'`` mapping that holds the ``'user_id'``.

    Returns:
        A ``defaultdict(list)`` mapping each user_id to the names of its
        events, in the order they appear in ``event_metrics``.
    """
    grouped = defaultdict(list)
    for record in event_metrics:
        user_id = record['properties']['user_id']
        grouped[user_id].append(record['event'])
    return grouped

# Group the raw event stream by user: user_id -> list of event names.
user_events = events_per_user(INPUT_DATA)

In [7]:
def user_events_trigger_count(user_events: defaultdict) -> dict:
    """Count how often each user triggered each event.

    Args:
        user_events: Mapping of user_id to an iterable of event names.

    Returns:
        A dict mapping each user_id to a ``Counter`` of its event names,
        keeping the user order of ``user_events``.
    """
    return {user: Counter(names) for user, names in user_events.items()}

# Count how many times each user triggered each event type: user_id -> Counter.
user_events_triggers = user_events_trigger_count(user_events)

In [8]:
def user_event_times_count(user_events_triggers: dict) -> dict:
    """Wrap each user's event counts under an ``"events"`` key.

    Args:
        user_events_triggers: Mapping of user_id to a mapping of
            event name -> count (for example a ``Counter``).

    Returns:
        A plain dict of the form ``{user_id: {"events": {event: count}}}``.
        The counts are copied into a new plain dict, so the result does not
        alias the input mappings.
    """
    # dict(events) copies keys and values in one step; no per-key update or
    # intermediate defaultdict is needed.
    return {
        user: {"events": dict(events)}
        for user, events in user_events_triggers.items()
    }
# Re-shape the per-user Counters into plain nested dicts:
# user_id -> {"events": {event name: count}}.
user_event_times = user_event_times_count(user_events_triggers)

In [9]:
def generate_user_events_dataset(user_event_times: dict) -> dict:
    """Convert each user's event counts into a pandas Series.

    Args:
        user_event_times: Mapping of user_id to a dict whose ``'events'``
            entry maps event name -> count.

    Returns:
        A dict mapping each user_id to a ``pd.Series`` whose index holds the
        event names and whose values hold the corresponding counts.
    """
    def _as_series(counts):
        # Values become the data, keys become the index labels.
        return pd.Series(data=counts.values(), index=counts.keys())

    return {
        user: _as_series(triggers['events'])
        for user, triggers in user_event_times.items()
    }

# One pandas Series of event counts per user, indexed by event name.
user_events_ds = generate_user_events_dataset(user_event_times)

In [10]:
def generate_data_frame(dataset: dict) -> pd.DataFrame:
    """Build a DataFrame from a mapping of column label -> column data.

    Args:
        dataset: Mapping passed straight to ``pd.DataFrame``; each key
            becomes a column.

    Returns:
        The resulting ``pd.DataFrame``.
    """
    return pd.DataFrame(dataset)
# Each user's Series becomes a column; event types a user never triggered
# show up as NaN.
df = generate_data_frame(user_events_ds)
df

Unnamed: 0,757,194,1656,563,1276,1077,2109,918,1349,525,...,2494,1563,1455,792,2213,2139,2275,1357,1040,1789
ConnectionRequested,,1,,,,,,,,,...,,,,,,,,,,
GuideDownload,2.0,1,1.0,1.0,1.0,2.0,3.0,2.0,1.0,1.0,...,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
GuideSession,22.0,5,32.0,11.0,13.0,26.0,4.0,26.0,14.0,18.0,...,3.0,,,1.0,2.0,1.0,1.0,,,
PhotoUpload,,14,,2.0,,,,,,1.0,...,,,,,,,,,,


In [11]:
# Transpose so that each row is a user and each column an event type.
# The frame is rebuilt from user_events_ds (instead of `df = df.T`) so that
# re-running this cell does not flip it back to the original orientation.
df = generate_data_frame(user_events_ds).T
df

Unnamed: 0,ConnectionRequested,GuideDownload,GuideSession,PhotoUpload
757,,2.0,22.0,
194,1.0,1.0,5.0,14.0
1656,,1.0,32.0,
563,,1.0,11.0,2.0
1276,,1.0,13.0,
...,...,...,...,...
2139,,1.0,1.0,
2275,,1.0,1.0,
1357,,1.0,,
1040,,1.0,,


In [12]:
# A NaN means the user never triggered that event type: treat it as a count of
# 0 and store the counts as integers. Chained rather than inplace=True so the
# frame produced by the previous cell is not mutated behind its displayed output.
df = df.fillna(0).astype('int64')
df

Unnamed: 0,ConnectionRequested,GuideDownload,GuideSession,PhotoUpload
757,0,2,22,0
194,1,1,5,14
1656,0,1,32,0
563,0,1,11,2
1276,0,1,13,0
...,...,...,...,...
2139,0,1,1,0
2275,0,1,1,0
1357,0,1,0,0
1040,0,1,0,0


In [13]:
# Summary statistics of the per-user counts for each event type.
df.describe()

Unnamed: 0,ConnectionRequested,GuideDownload,GuideSession,PhotoUpload
count,2500.0,2500.0,2500.0,2500.0
mean,0.44,1.44,19.66,2.01
std,1.14,0.79,10.59,4.79
min,0.0,0.0,0.0,0.0
25%,0.0,1.0,13.0,0.0
50%,0.0,1.0,19.0,0.0
75%,0.0,2.0,24.0,1.0
max,11.0,6.0,161.0,24.0


In [14]:
# Check column dtypes and non-null counts after the NaN -> 0 / int64 conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2500 entries, 757 to 1789
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   ConnectionRequested  2500 non-null   int64
 1   GuideDownload        2500 non-null   int64
 2   GuideSession         2500 non-null   int64
 3   PhotoUpload          2500 non-null   int64
dtypes: int64(4)
memory usage: 162.2 KB


In [15]:
# Peek at the first 10 user rows.
df.head(10)

Unnamed: 0,ConnectionRequested,GuideDownload,GuideSession,PhotoUpload
757,0,2,22,0
194,1,1,5,14
1656,0,1,32,0
563,0,1,11,2
1276,0,1,13,0
1077,0,2,26,0
2109,0,3,4,0
918,0,2,26,0
1349,0,1,14,0
525,0,1,18,1


In [16]:
# Peek at the last 10 user rows.
df.tail(10)

Unnamed: 0,ConnectionRequested,GuideDownload,GuideSession,PhotoUpload
2494,0,1,3,0
1563,0,1,0,0
1455,0,3,0,0
792,0,1,1,0
2213,0,1,2,0
2139,0,1,1,0
2275,0,1,1,0
1357,0,1,0,0
1040,0,1,0,0
1789,0,1,0,0
