In [1]:
import pandas as pd
import plotly.graph_objects as go

In [2]:
# def replace_csv_separator(input_csv_path: str, output_csv_path: str):
#     """
#     Replace the separator in a CSV file from ',' to ';'.

#     :param input_csv_path: Path to the input CSV file with ',' separator.
#     :param output_csv_path: Path to the output CSV file with ';' separator.
#     """
#     # Read the input CSV file
#     df = pd.read_csv(input_csv_path)

#     # Write to a new CSV file with ';' as the separator
#     df.to_csv(output_csv_path, sep=";", index=False)

In [3]:
# Example usage
# replace_csv_separator("input.csv", "output.csv")

In [4]:
def get_sessions(
    interactions_path: str, eps: int, start_time: str = None, end_time: str = None
) -> pd.DataFrame:
    """
    Get a DataFrame with per-user session IDs assigned to each interaction.

    The file is read with ``;`` as separator and must contain at least the
    columns ``user_id`` and ``time``. Rows are sorted by ``user_id`` and
    ``time``; within each user a new session starts at the user's first
    interaction and whenever the gap to the previous interaction exceeds
    ``eps`` seconds.

    :param interactions_path: Path to the CSV file containing user interactions data.
    :param eps: The time difference (in seconds) threshold to define a new session.
    :param start_time: Optional. Inclusive start of the time interval as a string.
    :param end_time: Optional. Inclusive end of the time interval as a string.
    :return: The (optionally time-filtered) interactions, sorted by user and time,
             with three extra columns: ``time_diff`` (seconds since the user's
             previous interaction, NaN for the first one), ``new_session``
             (bool) and ``session_id`` (1-based, counted per user).
             An empty DataFrame without columns if the file holds no rows.
    """
    interactions_df = pd.read_csv(interactions_path, sep=";")

    if interactions_df.empty:
        print("No interactions data available.")
        return pd.DataFrame()

    interactions_df["time"] = pd.to_datetime(interactions_df["time"])

    # Filter by time range if start_time and/or end_time are provided
    if start_time:
        start_time = pd.to_datetime(start_time)
        interactions_df = interactions_df[interactions_df["time"] >= start_time]
    if end_time:
        end_time = pd.to_datetime(end_time)
        interactions_df = interactions_df[interactions_df["time"] <= end_time]

    interactions_df = interactions_df.sort_values(by=["user_id", "time"])

    # Seconds elapsed since the same user's previous interaction
    interactions_df["time_diff"] = (
        interactions_df.groupby("user_id")["time"].diff().dt.total_seconds()
    )

    # A user's first interaction has a NaN time_diff. `NaN > eps` evaluates to
    # False (not NaN), so a trailing `.fillna(True)` would never fire; the
    # missing gap has to be tested explicitly to open the first session.
    time_diff = interactions_df["time_diff"]
    interactions_df["new_session"] = time_diff.isna() | (time_diff > eps)

    # Running count of session starts per user -> 1-based session ID
    interactions_df["session_id"] = (
        interactions_df["new_session"]
        .astype(int)
        .groupby(interactions_df["user_id"])
        .cumsum()
    )

    return interactions_df

In [5]:
def compute_mean_ratios(
    interactions_path: str, eps: int, start_time: str = None, end_time: str = None
) -> float:
    """
    Compute the global mean like ratio, averaged per session and then per user.

    Interactions are split into per-user sessions by ``get_sessions``. The
    ``interaction`` column is averaged within each (user, session) pair, those
    session values are averaged per user, and the per-user values are averaged
    into a single number.

    NOTE(review): the result equals likes / (likes + dislikes) per session only
    if ``interaction`` is 1 for a like and 0 for a dislike -- confirm against
    the data.

    :param interactions_path: Path to the CSV file containing user interactions data.
    :param eps: The time difference (in seconds) threshold to define a new session.
    :param start_time: Optional. Start of the time interval as a string.
    :param end_time: Optional. End of the time interval as a string.
    :return: The mean of the per-user mean session ratios, or 0 when there is
             no data in the requested range.
    """
    sessions_df = get_sessions(interactions_path, eps, start_time, end_time)

    # Guard clause: nothing to aggregate
    if sessions_df.empty:
        print("No data available for the specified time range.")
        return 0

    # Level 1: mean of `interaction` inside every (user, session) pair
    per_session = sessions_df.groupby(["user_id", "session_id"])["interaction"].mean()

    # Level 2: mean of the session ratios for every user
    per_user = per_session.groupby(level="user_id").mean()

    # Level 3: mean over all users
    return per_user.mean()

In [6]:
def _assign_global_session_ids(times, delta, boundaries):
    """
    Assign a 1-based global session ID to every timestamp of a time-sorted sequence.

    A new session starts when the gap to the previous timestamp exceeds
    ``delta`` seconds, or when at least one boundary lies in the interval
    (previous timestamp, current timestamp].
    """
    session_ids = []
    current_id = 1
    previous = None

    for current in times:
        if previous is not None:
            gap_exceeded = (current - previous).total_seconds() > delta
            boundary_crossed = any(previous < ts <= current for ts in boundaries)
            if gap_exceeded or boundary_crossed:
                current_id += 1
        session_ids.append(current_id)
        previous = current

    return session_ids


def plot_global_like_rate(
    interactions_path: str,
    delta: int,
    manual_timestamps: list = None,
    config: dict = None,
):
    """
    Plot the average like-rate over global sessions.

    All interactions (of all users together) are ordered by time and cut into
    "global sessions": a new one starts after a silence longer than ``delta``
    seconds or when a manual timestamp is crossed. For each global session the
    like-rate is obtained from ``compute_mean_ratios`` restricted to the
    session's first and last interaction time.

    :param interactions_path: Path to the ``;``-separated CSV file containing interactions data.
    :param delta: Time gap (in seconds) to define a new global session. It is also
                  passed to ``compute_mean_ratios`` as the per-user session threshold.
    :param manual_timestamps: List of manual timestamps (as strings) to define additional
                              session splits. Values that cannot be parsed are ignored.
    :param config: Dictionary with configuration options:
                   - start_time: Start of the time range as a string (default: None)
                   - end_time: End of the time range as a string (default: None)
                   - save_path: File path to save the generated plot (default: None);
                     when omitted the figure is shown instead
    :return: None. Prints a message and returns early if no rows remain after filtering.
    """
    # Default configuration
    config = config or {}
    start_time = config.get("start_time")
    end_time = config.get("end_time")
    save_path = config.get("save_path")

    # Load the interactions data; rows whose time cannot be parsed are discarded
    interactions = pd.read_csv(interactions_path, sep=";")
    interactions["time"] = pd.to_datetime(interactions["time"], errors="coerce")
    interactions = interactions.dropna(subset=["time"])

    # Restrict to the requested (inclusive) time range
    if start_time:
        start_time = pd.to_datetime(start_time, errors="coerce")
        interactions = interactions[interactions["time"] >= start_time]
    if end_time:
        end_time = pd.to_datetime(end_time, errors="coerce")
        interactions = interactions[interactions["time"] <= end_time]

    if interactions.empty:
        print("No valid data available for the specified time range.")
        return

    # Sort interactions by time
    interactions = interactions.sort_values(by="time")

    # Parse the manual split points; unparseable entries become NaT and are
    # dropped because they can never match a comparison anyway.
    if manual_timestamps:
        parsed = pd.to_datetime(manual_timestamps, errors="coerce")
        boundaries = [ts for ts in parsed if not pd.isna(ts)]
    else:
        boundaries = []

    # Only the time column is needed to find the session cuts, so iterate over
    # it directly instead of materialising every row with iterrows().
    interactions["session_id"] = _assign_global_session_ids(
        interactions["time"], delta, boundaries
    )

    # Compute the like-rate and the tick label of each global session
    session_like_rates = []
    session_labels = []

    for _, session_data in interactions.groupby("session_id"):
        session_start = session_data["time"].min()
        session_end = session_data["time"].max()
        interaction_count = len(session_data)

        # NOTE: compute_mean_ratios takes a file path, so the CSV is read
        # again for every global session.
        like_rate = compute_mean_ratios(
            interactions_path=interactions_path,
            eps=delta,
            start_time=str(session_start),
            end_time=str(session_end),
        )
        session_like_rates.append(like_rate)
        session_labels.append(
            f"{session_start.strftime('%b %d')}<br>{session_start.strftime('%H:%M')} - {session_end.strftime('%H:%M')}<br>{interaction_count} interactions"
        )

    session_numbers = list(range(1, len(session_like_rates) + 1))

    # Prepare the data for Plotly
    fig = go.Figure()

    # Add the line plot
    fig.add_trace(
        go.Scatter(
            x=session_numbers,
            y=session_like_rates,
            mode="lines+markers",
            marker=dict(color="blue", size=8),
            line=dict(color="blue", width=2),
            # True division + "g" format: 21600 -> "6h", 1800 -> "0.5h"
            # (floor division would label every sub-hour delta as "0h").
            name=f"Global Like Rate (delta={delta / 3600:g}h)",
        )
    )

    # Use the session start/end labels as x-axis ticks
    fig.update_layout(
        title="Like Rate Across Global Sessions",
        xaxis_title="Session ID (Start - End Times)",
        yaxis_title="Average Like Rate",
        template="plotly_white",
        legend=dict(title="Legend"),
        font=dict(size=12),
        xaxis=dict(
            tickmode="array",
            tickvals=session_numbers,
            ticktext=session_labels,
        ),
        width=800,  # Adjust width here to make the plot narrower
    )

    if save_path:
        fig.write_image(save_path)
        print(f"Graph saved at: {save_path}")
    else:
        fig.show()


# Example usage
# plot_global_like_rate("interactions.csv", delta=3600, manual_timestamps=["2024-12-01T12:00:00", "2024-12-01T18:00:00"], config={"save_path": "like_rate.png"})

### Like-rate graph including easter-egg interactions (easter eggs were not removed from the interactions). The cleaned version is plotted further below.

In [7]:
# Like-rate plot for the raw interactions file, with 6-hour global sessions
# and two manual session cuts.
plot_global_like_rate(
    interactions_path="../data/interactions5.csv",
    delta=6 * 3600,  # six hours, in seconds
    manual_timestamps=[
        "2024-12-14T7:00:00",  # boot with LLM2 + TopPop ...
        "2024-12-14T9:00:34",  # boot with LLM2 + TopPop + dislikes + ...
    ],
)

### Masking easter-egg items in the interactions because they negatively affect the like-rate

In [8]:
def merge_item_cat2(
    interactions_df: pd.DataFrame,
    items_path: str,
) -> pd.DataFrame:
    """
    Attach the ``cat2`` value of each item to the interactions.

    ``cat2`` is looked up in the items file by ``item_id``. If
    ``interactions_df`` already has a ``cat2`` column, its existing values are
    kept and only the missing ones are filled from the items file.

    :param interactions_df: DataFrame with interactions data; must contain ``item_id``.
    :param items_path: Path to the ``,``-separated items CSV file with the columns
                       ``item_id`` and ``cat2``.
    :return: A new DataFrame with exactly one row per input interaction and a
             ``cat2`` column (missing where the item is unknown).
    """
    # Load data
    items_df = pd.read_csv(items_path, sep=",")

    # Keep a single row per item_id: a left merge against repeated keys would
    # otherwise emit one output row per match and duplicate interactions.
    item_categories = items_df[["item_id", "cat2"]].drop_duplicates(subset="item_id")

    merged = interactions_df.merge(
        item_categories,
        on="item_id",
        how="left",
        suffixes=("", "_from_items"),
    )

    # The suffixed column only exists when interactions_df already had 'cat2';
    # in that case fill the gaps from the items file and drop the helper column.
    if "cat2_from_items" in merged.columns:
        merged["cat2"] = merged["cat2"].combine_first(merged["cat2_from_items"])
        merged = merged.drop(columns=["cat2_from_items"])

    return merged

In [9]:
def merge_item_cat2_without_sirius(
    interactions_df: pd.DataFrame,
    items_path: str,
) -> pd.DataFrame:
    """
    Attach ``cat2`` to the interactions and remove those where cat2 == "sirius".

    ``cat2`` is looked up in the items file by ``item_id``. If
    ``interactions_df`` already has a ``cat2`` column, its existing values are
    kept and only the missing ones are filled from the items file.
    Interactions whose ``cat2`` is missing are kept.

    :param interactions_df: DataFrame with interactions data; must contain ``item_id``.
    :param items_path: Path to the ``,``-separated items CSV file with the columns
                       ``item_id`` and ``cat2``.
    :return: Filtered and merged DataFrame with at most one row per input interaction.
    """
    # Load data
    items_df = pd.read_csv(items_path, sep=",")

    # Keep a single row per item_id: a left merge against repeated keys would
    # otherwise emit one output row per match and duplicate interactions.
    item_categories = items_df[["item_id", "cat2"]].drop_duplicates(subset="item_id")

    merged = interactions_df.merge(
        item_categories,
        on="item_id",
        how="left",
        suffixes=("", "_from_items"),
    )

    # The suffixed column only exists when interactions_df already had 'cat2';
    # in that case fill the gaps from the items file and drop the helper column.
    if "cat2_from_items" in merged.columns:
        merged["cat2"] = merged["cat2"].combine_first(merged["cat2_from_items"])
        merged = merged.drop(columns=["cat2_from_items"])

    # Remove rows where cat2 == "sirius" (a missing cat2 compares unequal and stays)
    return merged[merged["cat2"] != "sirius"]


In [10]:
# Build a copy of the interactions without "sirius" items and persist it so
# that the path-based plotting helper can read it.
all_interactions = pd.read_csv("../data/interactions5.csv", sep=";")
all_interactions_no_eggs = merge_item_cat2_without_sirius(
    all_interactions, items_path="../data/items.csv"
)
no_eggs_filepath = "../data/interactions5_no_sirius.csv"
# index=False: the filtered frame carries a non-contiguous index; writing it
# would add an unnamed leading column to the file that shows up as
# "Unnamed: 0" when the CSV is read back.
all_interactions_no_eggs.to_csv(no_eggs_filepath, sep=";", index=False)

### Like-rate graph without easter-egg interactions (easter eggs were removed from the interactions)

In [11]:
# Like-rate plot for the easter-egg-free interactions file, with the same
# 6-hour global sessions and manual session cuts as the raw plot.
plot_global_like_rate(
    interactions_path=no_eggs_filepath,
    delta=6 * 3600,  # six hours, in seconds
    manual_timestamps=[
        "2024-12-14T7:00:00",  # boot with LLM2 + TopPop ...
        "2024-12-14T9:00:34",  # boot with LLM2 + TopPop + dislikes + ...
    ],
)