In [1]:
# !pip install pandas
# !pip install skimpy
# !pip install matplotlib
# !pip install numpy

In [1]:
import pandas as pd

In [4]:
# Load the raw events table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("./data/raw/events.pkl")

In [5]:
# Show the loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,createdAt,user_code,poll_code,event,country,city_code,gender,age,college_code
0,2022-12-29 18:30:10,user_2847,poll_27,Impression,country_1,city_1,,,
1,2022-12-29 18:30:28,user_2847,poll_220,Impression,country_1,city_1,,,
2,2022-12-29 18:30:38,user_2847,poll_190,Impression,country_1,city_1,,,
3,2022-12-29 18:30:50,user_2847,poll_153,Impression,country_1,city_1,,,
4,2022-12-29 18:31:34,user_2847,poll_42,Impression,country_1,city_1,,,
...,...,...,...,...,...,...,...,...,...
232724,2023-01-17 14:18:13,user_131,poll_706,Polls Answered,country_1,city_11,male,26.0,Other
232725,2022-12-31 22:14:53,user_55,poll_707,Polls Answered,,,,,
232726,2022-12-31 21:49:54,user_57,poll_707,Polls Answered,,,,,
232727,2022-12-31 21:39:52,user_71,poll_707,Polls Answered,,,,,


## Events clean

In [2]:
# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     custom_cell_magics: kql
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: hunch_assignment
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Setup

# %%
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skimpy import skim
# %%
from IPython.display import display

# Lift pandas' row/column display limits for this notebook.
for _option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_option, None)
# %% [markdown]
# # Read raw data

# %%
# Location of the raw events pickle, relative to the working directory.
data_path, file_name = os.path.join("data", "raw"), "events.pkl"
file_path = os.path.join(data_path, file_name)

# %%
# Read the raw events frame from disk; the annotation documents the expected type.
events_raw: pd.DataFrame = pd.read_pickle(file_path)

# %%
# Preview the first rows, then print the schema summary.
display(events_raw.head())
# DataFrame.info() prints its report and returns None, so it must not be wrapped in
# display(): doing so renders a stray "None" in the cell output.
events_raw.info()

# %%
# Cast object-dtype columns to category, then profile the frame with skim.
events_raw_for_skim = events_raw.apply(
    lambda column: column.astype("category") if column.dtype == "object" else column
)
skim(events_raw_for_skim)

# %% [markdown]
# # Make a backup

# %%
# Work on an independent (deep) copy so `events_raw` stays as loaded.
events = events_raw.copy(deep=True)

# %% [markdown]
# # Clean

# %% [markdown]
# ## Handling potential duplicates

# %%
# Collect every row whose (user, poll, event, timestamp) key occurs more than once.
# `duplicated(keep=False)` flags all members of a repeated key in one vectorised pass,
# instead of calling a Python lambda once per group.
# NOTE: unlike groupby, `duplicated` does not drop rows with missing key values; the
# four key columns are fully populated in this dataset, so the selection is the same.
duplicate_key = ["user_code", "poll_code", "event", "createdAt"]
temp = events[events.duplicated(subset=duplicate_key, keep=False)].sort_values(
    by=["event", "user_code", "poll_code", "createdAt"]
)

# %%
# Summarise the suspected duplicates: total count, the five most frequent values of
# each key column (as proportions), and the first rows of every event type.
print(f"No. of potential duplicates: {temp.shape[0]}")
for col in ("user_code", "poll_code", "event"):
    top_shares = temp[col].value_counts(dropna=False, normalize=True).iloc[:5]
    print(f"\n{col}: {top_shares}")
display(temp.groupby("event").head())

# %% [markdown]
# Different options for some reason! Does not make sense. Let's drop them.

# %%
# Columns that together should identify a single event row.
primary_key = ["user_code", "poll_code", "event", "createdAt"]

# Keep the first row of every key and renumber the index.
events = events.drop_duplicates(subset=primary_key, ignore_index=True)

# Sanity check: exactly one row per key.
n_keys = events.groupby(primary_key).ngroups
assert len(events) == n_keys

# %% [markdown]
# ## Handle rows with same user, poll, and event

# %%
# Order chronologically so that "last" below means the most recent occurrence.
events = events.sort_values(by="createdAt")

# Flag every repeat of a (user, poll, event) triple except its latest row.
is_same_event = events.duplicated(["user_code", "poll_code", "event"], keep="last")

print(
    f"No. of interactions to be removed: {is_same_event.sum()} ({is_same_event.sum()/events.shape[0]*100:.2f}%)"
)

# %%
# Drop the flagged rows and renumber the index.
events = events.loc[~is_same_event].reset_index(drop=True)

# %% [markdown]
# # Write

# %%
# Final size and a preview of the cleaned events.
print(events.shape)
display(events.head(n=5))

# %%
# Destination of the cleaned events pickle.
data_path, file_name = os.path.join("data", "prepared"), "events.pkl"
file_path = os.path.join(data_path, file_name)

# %%
events.to_pickle(file_path)

Unnamed: 0,createdAt,user_code,poll_code,event,country,city_code,gender,age,college_code
0,2022-12-29 18:30:10,user_2847,poll_27,Impression,country_1,city_1,,,
1,2022-12-29 18:30:28,user_2847,poll_220,Impression,country_1,city_1,,,
2,2022-12-29 18:30:38,user_2847,poll_190,Impression,country_1,city_1,,,
3,2022-12-29 18:30:50,user_2847,poll_153,Impression,country_1,city_1,,,
4,2022-12-29 18:31:34,user_2847,poll_42,Impression,country_1,city_1,,,


<class 'pandas.core.frame.DataFrame'>
Index: 231649 entries, 0 to 232728
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   createdAt     231649 non-null  datetime64[ns]
 1   user_code     231649 non-null  object        
 2   poll_code     231649 non-null  object        
 3   event         231649 non-null  object        
 4   country       219812 non-null  object        
 5   city_code     218467 non-null  object        
 6   gender        168868 non-null  object        
 7   age           162288 non-null  float64       
 8   college_code  101878 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 17.7+ MB


None

No. of potential duplicates: 217

user_code: user_code
user_212     0.737327
user_1797    0.013825
user_136     0.009217
user_3359    0.009217
user_4067    0.009217
Name: proportion, dtype: float64

poll_code: poll_code
poll_620    0.064516
poll_549    0.036866
poll_443    0.036866
poll_96     0.027650
poll_277    0.027650
Name: proportion, dtype: float64

event: event
Impression        0.580645
Polls Answered    0.313364
Expand            0.055300
Shares            0.050691
Name: proportion, dtype: float64


Unnamed: 0,createdAt,user_code,poll_code,event,country,city_code,gender,age,college_code
114877,2023-01-26 11:21:08,user_136,poll_620,Expand,country_1,city_1,male,32.0,Ramjas College
114878,2023-01-26 11:21:08,user_136,poll_620,Expand,country_1,city_1,male,32.0,Ramjas College
59793,2023-01-28 13:02:16,user_209,poll_449,Expand,country_1,city_1,male,23.0,St.Stephen's College
59794,2023-01-28 13:02:16,user_209,poll_449,Expand,country_1,city_1,male,23.0,St.Stephen's College
109321,2023-01-23 15:54:26,user_212,poll_549,Expand,country_1,city_1,,,
105939,2022-12-30 15:33:22,user_1234,poll_76,Impression,country_1,city_1,,,
105940,2022-12-30 15:33:22,user_1234,poll_76,Impression,country_1,city_1,,,
115500,2023-01-13 15:13:07,user_127,poll_583,Impression,country_1,city_1,male,28.0,Other
115501,2023-01-13 15:13:07,user_127,poll_583,Impression,country_1,city_1,male,28.0,Other
100718,2023-01-28 12:13:23,user_159,poll_67,Impression,country_1,city_10,male,13.0,Other


No. of interactions to be removed: 13712 (5.92%)
(217827, 9)


Unnamed: 0,createdAt,user_code,poll_code,event,country,city_code,gender,age,college_code
0,2022-12-19 16:30:24,user_50,poll_47,Polls Answered,,,,,
1,2022-12-19 16:30:25,user_21,poll_47,Polls Answered,,,,,
2,2022-12-19 16:30:26,user_10,poll_47,Polls Answered,,,,,
3,2022-12-19 16:30:27,user_6,poll_47,Polls Answered,,,,,
4,2022-12-19 16:30:27,user_70,poll_47,Polls Answered,,,,,


## Raw events to interactions

In [4]:
# !pip install pandarallel
# !pip install plotly
# !pip install fuzzywuzzy
# !pip install pickle

In [None]:
# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     custom_cell_magics: kql
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: hunch_assignment
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Setup

# %%
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skimpy import skim
import plotly.express as px
import seaborn as sns
from box import Box
from fuzzywuzzy import fuzz, process
from collections import defaultdict
import pickle
from itertools import product
from pandarallel import pandarallel

from mlops.utils import get_polls_data_from_interaction_data, get_users_data_from_interaction_data


# %%
from IPython.display import display


# Lift pandas' row/column display limits for this notebook.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# %%
# Initialise pandarallel with its default settings.
pandarallel.initialize()

# %% [markdown]
# # Read data

# %%
# Location of the cleaned events pickle, relative to the working directory.
data_path, file_name = os.path.join("data", "prepared"), "events.pkl"
file_path = os.path.join(data_path, file_name)

# %%
# Read the cleaned events frame from disk; the annotation documents the expected type.
events: pd.DataFrame = pd.read_pickle(file_path)

# %%
display(events.head(n=5))

# %%
# Cast object-dtype columns to category, then profile the frame with skim.
events_for_skim = events.apply(
    lambda column: column.astype("category") if column.dtype == "object" else column
)
skim(events_for_skim)


# %% [markdown]
# # Assign score per event

# %%
# Ordinal score per event type, from 0 ("Impression") up to 3 ("Shares").
event_score_dict = dict(zip(("Impression", "Expand", "Polls Answered", "Shares"), range(4)))
events["event_score"] = events["event"].map(event_score_dict)


# %% [markdown]
# # Get Users data

# %%
# Build the user table from the events; the helper receives its own copy of the frame.
events_for_users = events.copy()
users = get_users_data_from_interaction_data(events_for_users)

# %%
display(users.head(n=5))

# %%
def _print_share(label, count, total):
    """Print ``label: count (share%)`` where share is ``count / total`` in percent."""
    print(f"""{label}: {count} ({(count / total * 100):.2f}%)""")


n_all_users = users.shape[0]

# Count the matching rows directly. Looking the value up by label in a value_counts()
# result (temp[0], temp[1]) raises KeyError whenever no user falls in that bucket.
_print_share(
    "Users with no interactions, just impressions",
    (users["n_interactive_polls"] == 0).sum(),
    n_all_users,
)
_print_share("Users with just 1 poll", (users["n_polls"] == 1).sum(), n_all_users)

# Boolean flag columns: the sum is the number of flagged users.
for label, flag_column in [
    ("Users with no useful location data", "has_no_useful_location_data"),
    ("Users with no useful identity data", "has_no_useful_identity_data"),
    ("Users with no useful user data", "has_no_useful_user_data"),
]:
    _print_share(label, users[flag_column].sum(), n_all_users)

# %%
# Cast object-dtype columns to category, then profile the frame with skim.
users_for_skim = users.apply(
    lambda column: column.astype("category") if column.dtype == "object" else column
)
skim(users_for_skim)

# %%
# For each categorical attribute, print its most frequent values: enough rows to pass
# 90% cumulative share of the non-missing users, and never fewer than five.
for col in ("country", "city_code", "gender", "college_code"):
    counts = users[col].value_counts(dropna=True, normalize=True).reset_index()
    running_share = counts["proportion"].cumsum()
    counts["proportion_cumulative"] = running_share / counts["proportion"].sum()
    above_90_percent = counts["proportion_cumulative"] > 0.9
    first_row_above = counts.loc[above_90_percent, "proportion_cumulative"].idxmin()
    index = max(first_row_above + 1, 5)
    print(f"\n{counts.iloc[:index, :2]}")

# %%
# Percentiles to report: the low tail, 10%, the quartiles, 90% and the high tail.
user_metric_percentiles = np.concatenate(
    [
        np.arange(0.01, 0.06, 0.01),
        [0.1],
        np.arange(0.25, 0.8, 0.25),
        [0.9],
        np.arange(0.95, 0.99, 0.01),
    ]
)
for col in (
    "age",
    "n_polls",
    "n_interactive_polls_proportion",
    "event_score_by_user_per_interactive_poll",
):
    display(users[col].describe(percentiles=user_metric_percentiles))

# %% [markdown]
# ### Binning Age

# %%
# Ages below 13 or above 40 are treated as invalid and blanked out (set to NaN).
is_younger_than_teen = users["age"].lt(13)
is_older_than_40 = users["age"].gt(40)
is_invalid_age = is_younger_than_teen | is_older_than_40

users["age"] = users["age"].mask(is_invalid_age)

# %%
# Distribution of the remaining ages at the same percentile grid as above.
age_percentiles = np.concatenate(
    [
        np.arange(0.01, 0.06, 0.01),
        [0.1],
        np.arange(0.25, 0.8, 0.25),
        [0.9],
        np.arange(0.95, 0.99, 0.01),
    ]
)
users["age"].describe(percentiles=age_percentiles)

# %%
# Age bin edges; every bin excludes its lower edge and includes its upper edge.
bins = [0, 16, 18, 22, 25, 30, 40]
labels = [f"({lower}-{upper}]" for lower, upper in zip(bins[:-1], bins[1:])]
print(labels)
age_binned = pd.cut(users["age"], bins=bins, labels=labels, right=True, include_lowest=False)
# Stored with object dtype instead of as a categorical.
users["age_binned"] = age_binned.astype("object")

# %%
users["age_binned"].value_counts(normalize=True, dropna=True)

# %% [markdown]
# ### Reduce no. of city and college codes

# %% [markdown]
# Note: College already has "Other"

# %%
# Codes held by less than 1% of the users with a known value are pooled into "Other".
min_code_proportion = 0.01
for col in ("city_code", "college_code"):
    code_proportions = users[col].value_counts(dropna=True, normalize=True)
    is_rare = code_proportions < min_code_proportion
    codes_to_replace = code_proportions[is_rare].index
    trimmed_col = col + "_trimmed"
    users[trimmed_col] = users[col].replace(codes_to_replace, "Other")
    print(users[trimmed_col].value_counts(dropna=True, normalize=True))

# %% [markdown]
# ### Fill missing

# %%
# Fill copies of country/gender so the original columns keep their missing values.
users["country_filled"] = users["country"].copy()
users["gender_filled"] = users["gender"].copy()

# %%
# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on
# `users[col]` is chained assignment: it may only update a temporary Series, pandas 2.x
# warns about it, and under copy-on-write it leaves `users` unchanged.
for col in [
    "country_filled",
    "gender_filled",
    "age_binned",
    "city_code_trimmed",
    "college_code_trimmed",
]:
    users[col] = users[col].fillna("Missing")


# %% [markdown]
# # Get Polls data

# %%
# Build the poll table from the events; the helper receives its own copy of the frame.
events_for_polls = events.copy()
polls = get_polls_data_from_interaction_data(events_for_polls)

# %%
display(polls.head(n=5))

# %%
# Cast object-dtype columns to category, then profile the frame with skim.
polls_for_skim = polls.apply(
    lambda column: column.astype("category") if column.dtype == "object" else column
)
skim(polls_for_skim)

# %%
# Percentiles to report: the low tail, 10%, the quartiles, 90% and the high tail.
poll_metric_percentiles = np.concatenate(
    [
        np.arange(0.01, 0.06, 0.01),
        [0.1],
        np.arange(0.25, 0.8, 0.25),
        [0.9],
        np.arange(0.95, 0.99, 0.01),
    ]
)
for col in (
    "n_users",
    "n_interactive_users_proportion",
    "event_score_by_poll_per_interactive_user",
):
    display(polls[col].describe(percentiles=poll_metric_percentiles))

# %% [markdown]
# ## Collapse event types

# %%
# Order by score so that, within a user-poll pair, the highest-scoring event comes last.
events = events.sort_values(by="event_score")

# Flag every row of a (user, poll) pair except its last one.
is_same_user_poll = events.duplicated(["user_code", "poll_code"], keep="last")

print(
    f"No. of different events for same user-poll to be collapsed: {is_same_user_poll.sum()} ({is_same_user_poll.sum()/events.shape[0]*100:.2f}%)"
)

# %%
# Keep one row per (user, poll) pair and verify the pairs are unique.
interactions = events.loc[~is_same_user_poll].reset_index(drop=True)
n_user_poll_pairs = interactions.groupby(["poll_code", "user_code"]).ngroups
assert len(interactions) == n_user_poll_pairs
primary_key = ["user_code", "poll_code"]

# %%
# Cast object-dtype columns to category, then profile the frame with skim.
interactions_for_skim = interactions.apply(
    lambda column: column.astype("category") if column.dtype == "object" else column
)
skim(interactions_for_skim)

# %% [markdown]
# # Write

# %%
# Persist the three derived tables next to the cleaned events.
for frame, pickle_name in (
    (interactions, "interactions.pkl"),
    (users, "users.pkl"),
    (polls, "polls.pkl"),
):
    pd.to_pickle(frame, os.path.join(data_path, pickle_name))

## Prepare for model

In [None]:
# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     custom_cell_magics: kql
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: hunch_assignment
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Setup

# %%
import os

import pandas as pd
from skimpy import skim


# %%
from IPython.display import display
# Lift pandas' row/column display limits for this notebook.
for _option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_option, None)

# %% [markdown]
# # Read data

# %%
# Folder holding the prepared datasets.
data_path = os.path.join("data", "prepared")

# %%
# Load the interactions and check there is exactly one row per (user, poll) pair.
file_name = "interactions.pkl"
file_path = os.path.join(data_path, file_name)
interactions: pd.DataFrame = pd.read_pickle(file_path)
n_user_poll_pairs = interactions.groupby(['user_code', 'poll_code']).ngroups
assert len(interactions) == n_user_poll_pairs
display(interactions.head(n=5))

# %%
# Load the user table and check that `user_code` identifies a row.
file_name = "users.pkl"
file_path = os.path.join(data_path, file_name)
users: pd.DataFrame = pd.read_pickle(file_path)
n_distinct_users = users["user_code"].nunique()
assert len(users) == n_distinct_users
display(users.head(n=5))

# %%
# Load the poll table and check that `poll_code` identifies a row.
file_name = "polls.pkl"
file_path = os.path.join(data_path, file_name)
polls: pd.DataFrame = pd.read_pickle(file_path)
n_distinct_polls = polls["poll_code"].nunique()
assert len(polls) == n_distinct_polls
display(polls.head(n=5))


# %% [markdown]
# # Delete irrelevant users and polls

# %% [markdown]
# Users with 0% interactive polls are not relevant for our purpose because we can neither learn anything from them, nor use them in the test set. Same goes for polls with 0% interactive users. So, let's delete them.

# %%
# Codes of the users flagged as having no interactive polls.
users_with_zero_interactive_polls = users["user_code"][
    users["has_no_interactive_polls"]
].to_numpy()
print(
    f"Users with zero interactive polls: {len(users_with_zero_interactive_polls)} ({len(users_with_zero_interactive_polls)/users.shape[0]*100:.2f}%)"
)

# Codes of the polls flagged as having no interactive users.
polls_with_zero_interactive_users = polls["poll_code"][
    polls["has_no_interactive_users"]
].to_numpy()
print(
    f"Polls with zero interactive users: {len(polls_with_zero_interactive_users)} ({len(polls_with_zero_interactive_users)/polls.shape[0]*100:.2f}%)"
)

# %%
# Keep only the users that are not flagged, then drop the now-constant flag column.
users = (
    users.loc[~users["has_no_interactive_polls"]]
    .drop(columns="has_no_interactive_polls")
    .reset_index(drop=True)
)
assert len(users) == users["user_code"].nunique()

# Same treatment for the polls.
polls = (
    polls.loc[~polls["has_no_interactive_users"]]
    .drop(columns="has_no_interactive_users")
    .reset_index(drop=True)
)
assert len(polls) == polls["poll_code"].nunique()

n_polls, n_users = polls.shape[0], users.shape[0]

print(f"n_users: {n_users}")
print(f"n_polls: {n_polls}")

# %%
# An interaction is dropped when its user or its poll was removed above.
is_non_interactive_user, is_non_interactive_poll = (
    interactions["user_code"].isin(users_with_zero_interactive_polls),
    interactions["poll_code"].isin(polls_with_zero_interactive_users),
)
is_non_interactive = is_non_interactive_poll | is_non_interactive_user

print(
    f"User-poll interactions to be removed: {is_non_interactive.sum()} ({is_non_interactive.sum()/interactions.shape[0]*100:.2f}%)"
)

# %%
# Remove the flagged interactions and confirm the row count dropped by that many.
rows_before = len(interactions)
interactions = interactions.loc[~is_non_interactive].reset_index(drop=True)
rows_after = len(interactions)
assert rows_before - rows_after == is_non_interactive.sum()

# %%
# The interactions and the user/poll tables must reference exactly the same codes.
interaction_users = set(interactions["user_code"].unique())
known_users = set(users["user_code"])
assert interaction_users <= known_users
assert known_users <= interaction_users

interaction_polls = set(interactions["poll_code"].unique())
known_polls = set(polls["poll_code"])
assert interaction_polls <= known_polls
assert known_polls <= interaction_polls

# %% [markdown]
# ## Concise Summary:

# %%
# Headline numbers for the filtered interactions, followed by a skim profile.
print(f"Total number of unique user-poll interactions: {interactions.shape[0]}")
print(
    f"""\nDistribution by event type:\n{interactions["event"].value_counts(dropna=False, normalize=True)}"""
)
n_users = interactions["user_code"].nunique()
n_polls = interactions["poll_code"].nunique()
print(f"\nn_users: {n_users}")
print(f"n_polls: {n_polls}")
interactions_for_skim = interactions.apply(
    lambda column: column.astype("category") if column.dtype == "object" else column
)
skim(interactions_for_skim)

# %% [markdown]
# # Write

# %%
# Persist the filtered tables under their "*_relevant" file names.
for frame, pickle_name in (
    (interactions, "interactions_relevant.pkl"),
    (users, "users_relevant.pkl"),
    (polls, "polls_relevant.pkl"),
):
    pd.to_pickle(frame, os.path.join(data_path, pickle_name))

## Model Experiments

In [None]:
# ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     custom_cell_magics: kql
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: hunch_assignment
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Setup

# %%
import os

import numpy as np
import pandas as pd
from skimpy import skim
from collections import defaultdict
from pandarallel import pandarallel

from mlops.evaluate import convert_df_to_dict, eval_add_show
from mlops.utils import get_polls_data_from_interaction_data, get_users_data_from_interaction_data


# %%
from IPython.display import display
# Lift pandas' row/column display limits for this notebook.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# %%
# Initialise pandarallel with its default settings.
pandarallel.initialize()

# %% [markdown]
# # Read data

# %%
# Folder holding the prepared datasets.
data_path = os.path.join("data", "prepared")

# %%
# Load the filtered interactions and check there is one row per (user, poll) pair.
file_name = "interactions_relevant.pkl"
file_path = os.path.join(data_path, file_name)
interactions: pd.DataFrame = pd.read_pickle(file_path)
n_user_poll_pairs = interactions.groupby(['user_code', 'poll_code']).ngroups
assert len(interactions) == n_user_poll_pairs
display(interactions.head(n=5))

# %%
# Load the filtered user table and check that `user_code` identifies a row.
file_name = "users_relevant.pkl"
file_path = os.path.join(data_path, file_name)
users: pd.DataFrame = pd.read_pickle(file_path)
n_distinct_users = users["user_code"].nunique()
assert len(users) == n_distinct_users
display(users.head(n=5))

# %%
# Load the filtered poll table and check that `poll_code` identifies a row.
file_name = "polls_relevant.pkl"
file_path = os.path.join(data_path, file_name)
polls: pd.DataFrame = pd.read_pickle(file_path)
n_distinct_polls = polls["poll_code"].nunique()
assert len(polls) == n_distinct_polls
display(polls.head(n=5))


# %% [markdown]
# # Train test split

# %%
# Share of users (in %, with margins) by the "has just one poll" and
# "has no useful identity data" flags.
print("User data split:")
user_split = pd.crosstab(
    index=users["has_just_one_poll"],
    columns=users["has_no_useful_identity_data"],
    rownames=["is new user"],
    colnames=["no identity data"],
    margins=True,
    normalize=True,
)
display(user_split.apply(lambda share: (share * 100).round(2)))

# %%
# Flag columns of the user table: every column whose name contains "has_".
users_has_columns = users.columns[users.columns.str.contains("has_")].tolist()

# Copy the user flags onto the interactions unless all of them are already present.
flags_already_merged = all(x in interactions.columns for x in users_has_columns)
if not flags_already_merged:
    rows_before = len(interactions)
    user_flags = users[["user_code", *users_has_columns]]
    interactions = interactions.merge(user_flags, on="user_code", how="left")
    rows_after = len(interactions)
    # A left join on a unique user key must not add or remove rows.
    assert rows_before == rows_after

# (has_just_one_poll, has_no_useful_identity_data) -> readable group name.
group_name_dict = {
    (True, True): ("new", "no_identity_data"),
    (True, False): ("new", "with_identity_data"),
    (False, True): ("existing", "no_identity_data"),
    (False, False): ("existing", "with_identity_data"),
}

grouped_data = interactions.groupby(["has_just_one_poll", "has_no_useful_identity_data"])

# Readable group name -> interactions belonging to that group.
grouped_data_dict = {}
for group_name, group_data in grouped_data:
    grouped_data_dict[group_name_dict[group_name]] = group_data

# %%
# Split every user group into train and test rows. Both dicts are keyed by the group
# name tuples of `grouped_data_dict`.
train_data_dict = {}
test_data_dict = {}

for key, value in grouped_data_dict.items():
    # value = value.sort_values(["createdAt"]).copy()
    if key[0] == "new":
        # "new" groups: among non-Impression rows, keep polls with more than one row in
        # this group and draw 10% of the rows of each such poll (fixed seed).
        # NOTE(review): with frac=0.1 a poll with few rows presumably contributes no
        # test row at all — confirm this is intended.
        test_indexes = (
            value[value["event"] != "Impression"]   # because we want only interactive polls in test
            .groupby(["poll_code"])    
            .filter(lambda x: len(x) > 1)   # if the poll has just this one user, then that poll should be present in train for it to be available for recommendation
            .groupby(["poll_code"])
            .apply(lambda x: x.sample(frac=0.1, random_state=123))
            .reset_index(level=[0], drop=True)  # drop the poll_code level so only the original row labels remain
            .index
        )
    else:
        # "existing" groups: among non-Impression rows, keep polls with more than two
        # rows in this group, then draw 20% of the remaining rows overall (fixed seed).
        # NOTE(review): the draw is over rows, not per poll, so the filter alone does not
        # guarantee that every sampled poll keeps a row in train — confirm.
        test_indexes = (
            value[value["event"] != "Impression"]   # because we want only interactive polls in test
            .groupby(["poll_code"])
            .filter(lambda x: len(x) > 2)  # to ensure that test polls are in train too
            .sample(frac=0.2, random_state=123)
            .index
        )
    # Selected rows go to test; every other row of the group (Impressions included)
    # stays in train.
    test_data_dict[key] = value.loc[test_indexes]
    train_data_dict[key] = value.drop(test_indexes)

# %%
def _assert_same_codes(data, entity_frame, code_column):
    """Assert that `data` and `entity_frame` hold the same set of `code_column` values."""
    assert set(data[code_column].unique()).issubset(set(entity_frame[code_column]))
    assert set(entity_frame[code_column]).issubset(data[code_column].unique())


def _display_interaction_split(data):
    """Show the share of rows (in %) per 'is new user' x 'no identity data' cell."""
    print("User-poll interaction split:")
    split_shares = pd.crosstab(
        index=data["has_just_one_poll"],
        columns=data["has_no_useful_identity_data"],
        rownames=["is new user"],
        colnames=["no identity data"],
        margins=True,
        normalize=True,
    )
    display(split_shares.apply(lambda share: (share * 100).round(2)))


# Train split: stack the per-group frames, rebuild the user/poll tables from it and
# check that those tables reference exactly the codes present in the data.
train_data = pd.concat(train_data_dict.values(), keys=train_data_dict.keys()).reset_index(drop=True)
train_users = get_users_data_from_interaction_data(train_data.copy())
train_polls = get_polls_data_from_interaction_data(train_data.copy())

_assert_same_codes(train_data, train_users, "user_code")
_assert_same_codes(train_data, train_polls, "poll_code")

print("train:")
print(f"{train_data.shape}")
print(f"""Users in train data: {len(train_data["user_code"].unique())}""")
print(f"""Polls in train data: {len(train_data["poll_code"].unique())}""")
_display_interaction_split(train_data)

# Test split: same steps.
test_data = pd.concat(test_data_dict.values(), keys=test_data_dict.keys()).reset_index(drop=True)
test_users = get_users_data_from_interaction_data(test_data.copy())
test_polls = get_polls_data_from_interaction_data(test_data.copy())

_assert_same_codes(test_data, test_users, "user_code")
_assert_same_codes(test_data, test_polls, "poll_code")

print("\ntest:")
print(f"{test_data.shape}")
print(f"""Users in test data: {len(test_data["user_code"].unique())}""")
print(f"""Polls in test data: {len(test_data["poll_code"].unique())}""")
_display_interaction_split(test_data)

# Overlap of the test split with the train split.
print(
    f"""\nTest users in train: {np.isin(test_data["user_code"].unique(), train_data["user_code"].unique()).sum() / len(test_data["user_code"].unique())* 100:.2f}%"""
)
print(
    f"""Test polls in train: {np.isin(test_data["poll_code"].unique(), train_data["poll_code"].unique()).sum() / len(test_data["poll_code"].unique())* 100:.2f}%"""
)

# Every interaction must have landed in exactly one of the two splits.
assert len(train_data) + len(test_data) == len(interactions)

# %%
# Deciles of the number of interactive polls per user, for each split.
decile_points = np.arange(0.1, 1, 0.1)

print("Distribution of polls per user in train")
train_polls_per_user = train_users["n_interactive_polls"].describe(percentiles=decile_points)
display(train_polls_per_user.to_frame().T)

print("Distribution of polls per user in test")
test_polls_per_user = test_users["n_interactive_polls"].describe(percentiles=decile_points)
display(test_polls_per_user.to_frame().T)

# %%
def _assert_split_is_consistent(data, user_frame, poll_frame):
    """Check key uniqueness and that the three frames reference the same codes.

    `data` must hold one row per (poll_code, user_code) pair, `user_frame` one row per
    user_code and `poll_frame` one row per poll_code; the codes found in `data` must be
    exactly the codes listed in the two entity frames.
    """
    assert data.shape[0] == data.groupby(["poll_code", "user_code"]).ngroups
    assert user_frame.shape[0] == user_frame["user_code"].nunique()
    assert poll_frame.shape[0] == poll_frame["poll_code"].nunique()
    for code_column, entity_frame in (("user_code", user_frame), ("poll_code", poll_frame)):
        assert set(data[code_column].unique()).issubset(set(entity_frame[code_column]))
        assert set(entity_frame[code_column]).issubset(data[code_column].unique())


# Full dataset.
_assert_split_is_consistent(interactions, users, polls)
all_users = users["user_code"]
all_polls = polls["poll_code"]

# %%
# Train split.
_assert_split_is_consistent(train_data, train_users, train_polls)
train_users_users = train_users["user_code"]
train_polls_polls = train_polls["poll_code"]

# %%
# Test split.
_assert_split_is_consistent(test_data, test_users, test_polls)
test_users_users = test_users["user_code"]
test_polls_polls = test_polls["poll_code"]

# %% [markdown]
# # Data prep. for model

# %%
print(f"Total train users: {len(train_users)}")
print(f"Total train polls: {len(train_polls)}")

# Train rows with a non-zero event score, and the users/polls that appear in them.
train_data_i = train_data.loc[train_data["event_score"] != 0].copy()
train_users_with_interactions, train_polls_with_interactions = (
    train_data_i["user_code"].unique(),
    train_data_i["poll_code"].unique(),
)

print(f"train users with interactions: {len(train_users_with_interactions)}")
print(f"train polls with interactions: {len(train_polls_with_interactions)}")

# %%
print(f"Total test users: {len(test_users)}")
print(f"Total test polls: {len(test_polls)}")

# Convert the test scores to the dict format produced by convert_df_to_dict.
# NOTE: this rebinds `test_data_dict`, which until here held the per-group test frames.
test_scores = test_data[["user_code", "poll_code", "event_score"]].copy()
test_data_dict = convert_df_to_dict(test_scores, with_pred_rating=True)

# %%
# For every user, the list of polls that user has in the train split.
train_poll_codes_by_user = (
    train_data.groupby("user_code")["poll_code"]
    .agg(list)
    .reset_index()
    .rename(columns={"poll_code": "train_poll_codes_list"})
)

rows_before = len(test_data)

test_data = test_data.merge(train_poll_codes_by_user, on="user_code", how="left")
# Users without train rows get no list from the left join; give them an empty one.
test_data["train_poll_codes_list"] = test_data["train_poll_codes_list"].map(
    lambda d: d if isinstance(d, list) else []
)
rows_after = len(test_data)

assert rows_before == rows_after

# %%
# Codes of the test users that also have rows in train.
# `test_users` and `train_users` are per-user DataFrames, so membership has to be
# checked on their `user_code` columns: np.isin() on whole frames compares every cell,
# and Series.isin(<DataFrame>) matches against the frame's column labels, which
# selected no test rows at all.
test_user_codes = test_users["user_code"]
test_users_in_train = test_user_codes[test_user_codes.isin(train_users["user_code"])].copy()

test_data_in_train = (
    test_data[test_data["user_code"].isin(test_users_in_train)].copy().reset_index(drop=True)
)

# user_code -> list of (poll_code, event_score) tuples.
test_data_in_train_dict = (
    test_data_in_train.groupby("user_code")[["poll_code", "event_score"]]
    .apply(lambda g: list(map(tuple, g.values)))
    .to_dict()
)

# Turn each list into a {poll_code: event_score} dict ordered from the highest to the
# lowest score. The value is a list of pairs, so it is sorted directly (a list has no
# .items()). Only values are replaced, so mutating while iterating is safe.
for user_code, recommendation in test_data_in_train_dict.items():
    recommendation = dict(sorted(recommendation, key=lambda x: x[1], reverse=True))
    test_data_in_train_dict[user_code] = recommendation

# %% [markdown]
# # Modeling

# %%
# Empty results table; it is passed to and returned by eval_add_show for every model
# evaluated further down.
model_results_comparison = pd.DataFrame()

# %% [markdown]
# ## Baseline-0: same top popular polls to every user

# %%
# Poll codes present in the test split. `test_polls` is a DataFrame, so the set has to
# be built from its `poll_code` column (set(test_polls) only holds the column labels).
test_polls_set = set(test_polls["poll_code"])

# One row per test user together with the polls that user already has in train. This
# depends neither on the popularity metric nor on n, so it is built once up front.
df_recommended = test_users[["user_code"]].copy()

rows_before = df_recommended.shape[0]
df_recommended = df_recommended.merge(
    test_data[["user_code", "train_poll_codes_list"]].drop_duplicates(["user_code"]),
    on="user_code",
    how="left",
)
rows_after = df_recommended.shape[0]
assert rows_before == rows_after

for popularity_metric in [
    "n_interactive_users",
    # "event_score_sum_by_poll",
    # "event_score_by_poll_per_interactive_user",
]:
    # Train polls ranked from most to least popular according to the metric.
    recommended_polls = train_polls.sort_values(popularity_metric, ascending=False)[
        "poll_code"
    ].to_list()
    df_recommended["recommended_polls"] = [recommended_polls] * df_recommended.shape[0]

    for n in [10, 25, 50]:
        model_name = f"Baseline: Top {n} polls by " + popularity_metric
        # Every user receives the same top-n prefix, so slice the ranking directly
        # instead of dispatching a row-wise parallel apply for a constant result.
        df_recommended["recommended_polls_filtered"] = [
            recommended_polls[0:n] for _ in range(df_recommended.shape[0])
        ]

        # Alternative that removes the polls a user already has in train:
        # df_recommended["recommended_polls_filtered"] = df_recommended.parallel_apply(
        #     lambda x: [
        #         poll for poll in x["recommended_polls"] if poll not in x["train_poll_codes_list"]
        #     ][0:n],
        #     axis=1,
        # ) # type: ignore

        recommendation_dict = df_recommended.set_index("user_code")[
            "recommended_polls_filtered"
        ].to_dict()

        (
            ndcg_by_user,
            precision_by_user,
            recall_by_user,
            results,
            model_results_comparison,
        ) = eval_add_show(
            model_name,
            recommendation_dict,
            test_data_dict,
            all_polls,
            train_data[["user_code", "poll_code", "event_score"]].copy(),
            with_pred_rating=False,
            model_results_comparison=model_results_comparison.copy(),
            add=True,
            show=False,
        )
# Show the accumulated comparison table with floats rendered as percentages.
with pd.option_context("display.float_format", "{:,.2%}".format):
    display(model_results_comparison)

# %% [markdown]
# ## SVD from `surprise` package

# %%
from surprise import Dataset, Reader, SVD

# %%
# Columns passed to surprise, in the order used here: user, item, rating.
req_cols = ["user_code", "poll_code", "event_score"]

# The rating scale is taken from the scores observed in the train split.
event_scores = train_data["event_score"]
rating_min, rating_max = event_scores.min(), event_scores.max()
print(f"Min rating: {rating_min}, Max rating: {rating_max}")

reader = Reader(rating_scale=(rating_min, rating_max))
train_dataset = Dataset.load_from_df(train_data[req_cols], reader)
train_data_surprise = train_dataset.build_full_trainset()

# %%
# Fit the SVD model. Its training is stochastic, so the seed is fixed to make the
# reported metrics reproducible across runs (same seed value as the train/test
# sampling).
algo = SVD(n_factors=10, n_epochs=40, lr_all=0.005, reg_all=0.1, random_state=123)
algo.fit(train_data_surprise)


# %%
def get_test_predictions(algo, test_data, all_polls, is_filter_train_polls_out=False, n=50):
    """Rank candidate polls for every test user and keep the top `n`.

    Parameters
    ----------
    algo
        Fitted model exposing ``predict(user_code, poll_code)`` whose result has an
        ``est`` attribute (the predicted rating).
    test_data : pd.DataFrame
        Needs the columns ``user_code`` and ``train_poll_codes_list``; for each user
        the list found on that user's first row is used.
    all_polls : iterable
        Poll codes to score.
    is_filter_train_polls_out : bool, default False
        When True, polls listed in the user's ``train_poll_codes_list`` are not scored.
    n : int, default 50
        Number of recommendations kept per user.

    Returns
    -------
    collections.defaultdict
        ``user_code -> [(poll_code, predicted_rating), ...]`` sorted by rating, highest
        first, at most `n` entries. Users with no candidate poll get no entry.
    """
    # One lookup table built in a single pass, instead of a boolean scan of the whole
    # frame for every user.
    first_rows = test_data.drop_duplicates(subset="user_code")
    train_polls_by_user = dict(zip(first_rows["user_code"], first_rows["train_poll_codes_list"]))

    test_predictions_by_user = defaultdict(list)
    for user_code, train_poll_codes_list in train_polls_by_user.items():
        if is_filter_train_polls_out:
            # Set membership keeps the filter linear in the number of polls.
            seen_polls = set(train_poll_codes_list)
            candidate_polls = [poll for poll in all_polls if poll not in seen_polls]
        else:
            candidate_polls = all_polls

        scored_polls = [
            (poll_code, algo.predict(user_code, poll_code).est) for poll_code in candidate_polls
        ]
        if scored_polls:
            # list.sort is stable, so equally rated polls keep their `all_polls` order.
            scored_polls.sort(key=lambda x: x[1], reverse=True)
            test_predictions_by_user[user_code] = scored_polls[:n]

    return test_predictions_by_user


# %%
# Cut-offs to evaluate. Predictions are ranked once per filter setting at the largest
# cut-off; every smaller top-n is a prefix of that ranking, so it is obtained by slicing
# instead of re-scoring every user-poll pair for each n.
top_n_values = [10, 25, 50]

for is_filter_train_polls_out in [False, True]:
    test_predictions_by_user_all = get_test_predictions(
        algo,
        test_data,
        all_polls,
        is_filter_train_polls_out=is_filter_train_polls_out,
        n=max(top_n_values),
    )
    for n in top_n_values:
        # Same mapping type as get_test_predictions returns.
        test_predictions_by_user_topn = defaultdict(
            list,
            {
                user_code: recommendation[:n]
                for user_code, recommendation in test_predictions_by_user_all.items()
            },
        )
        # The label states whether train polls may appear among the recommendations,
        # which is the opposite of the "filter out" flag.
        model_name = f"SVD: {n} - has_train_polls {not is_filter_train_polls_out}"
        ndcg_by_user, precision_by_user, recall_by_user, results, model_results_comparison = eval_add_show(
            model_name,
            test_predictions_by_user_topn,
            test_data_dict,
            all_polls,
            train_data[["user_code", "poll_code", "event_score"]].copy(),
            with_pred_rating=True,
            model_results_comparison=model_results_comparison.copy(),
            add=True,
            show=False,
        )

# %%
# Show the accumulated comparison table with floats rendered as percentages.
with pd.option_context("display.float_format", "{:,.2%}".format):
    display(model_results_comparison)

# %%