In [None]:
import logging
from fastapi import FastAPI, responses
import pandas as pd
import uvicorn

import sys
import os
import plotly.express as px
import plotly.graph_objects as go

# The notebook's current working directory is treated as the project's main
# folder; the project modules imported below live in its "src" sub-directory.
main_folder = os.getcwd()
src_path = os.path.join(main_folder, "src")

# Put <cwd>/src on the module search path, skipping the append when an equal
# entry is already present (e.g. when this cell is run a second time).
if not any(entry == src_path for entry in sys.path):
    sys.path.append(src_path)

# Echo the resulting search path for a quick visual check.
print("Updated sys.path:", sys.path)

from VariableTypeEnum import VariableTypeEnum
from VarsManager import VarsManager
from src.ChannelAnalysis import ChannelAnalysis
from LoreInfo import regions, bot_users, lore_channels, human_countries
from src.Plotting import Plotting
from typing import Callable

# Configure the root logger for the whole notebook: INFO level, with timestamp,
# logger name and severity in front of every message.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# VarsManager exposes a `vars` mapping keyed by VariableTypeEnum members; the
# two lookups below are read from it.
vars_manager = VarsManager()
# Looked up with string ids by get_channel_name — presumably channel id -> name; confirm.
channel_names = vars_manager.vars[VariableTypeEnum.CHANNEL_NAMES]
# NOTE(review): presumably user id -> nickname; not referenced in the visible cells.
user_names = vars_manager.vars[VariableTypeEnum.USER_NICKNAMES]
def get_channel_name(id):
    """Return the name stored for a channel id, or ``None`` when it is unknown.

    The id is converted with ``str`` before the lookup, so integer and string
    ids are treated alike. The lookup goes against the module-level
    ``channel_names`` mapping.
    """
    channel_key = str(id)
    if channel_key not in channel_names:
        return None
    return channel_names[channel_key]

In [None]:
# Declared up front so the name is annotated; it is only bound inside the
# guard below and is then reused by every later cell.
main_channel_analysis: ChannelAnalysis

# NOTE(review): presumably __name__ is "__main__" inside the notebook kernel, so
# the guard runs on normal execution — confirm if this is ever imported as a module.
if __name__ == "__main__":
    logging.info("Preparing data")
    # NOTE(review): content_matters=True presumably makes ChannelAnalysis keep the
    # message text (later cells read a "content" column) — confirm in ChannelAnalysis.
    main_channel_analysis = ChannelAnalysis(content_matters=True)
    logging.info("Starting the analysis")

In [None]:
# Daily message volume, restricted to the lore channels.
channel_analysis: ChannelAnalysis = ChannelAnalysis(message_df=main_channel_analysis.messages_df)
channel_analysis.restrict_to_channels(lore_channels)

logging.getLogger().info("Plotting message count by day")
df: pd.DataFrame = channel_analysis.day_number_of_messages().copy()

# Number of rows per distinct `created_at` value, in chronological order.
creation_date_series_lore = df["created_at"].value_counts().sort_index()

lore_channels_fig = px.histogram(
    creation_date_series_lore,
    x=creation_date_series_lore.index,
    y="count",
    title="Messages on lore channels",
    labels={"count": "Count", "created_at": "Date"},
    barmode="group",
    color_discrete_sequence=["red"],
    # Ask for as many bins as there are distinct dates in the series.
    nbins=len(creation_date_series_lore),
)
lore_channels_fig.show()

In [None]:
# Daily message volume across all channels.
# NOTE(review): passing None presumably means "do not restrict" — confirm in
# ChannelAnalysis.restrict_to_channels.
all_channels: list[int] | None = None
channel_analysis: ChannelAnalysis = ChannelAnalysis(message_df=main_channel_analysis.messages_df)
channel_analysis.restrict_to_channels(all_channels)

logging.getLogger().info("Plotting message count by day")
df: pd.DataFrame = channel_analysis.day_number_of_messages().copy()

# Number of rows per distinct `created_at` value, in chronological order.
creation_date_series_all = df["created_at"].value_counts().sort_index()

all_channels_fig = px.histogram(
    creation_date_series_all,
    x=creation_date_series_all.index,
    y="count",
    title="Messages on all channels",
    labels={"count": "Count", "created_at": "Date"},
    barmode="group",
    # Ask for as many bins as there are distinct dates in the series.
    nbins=len(creation_date_series_all),
)
all_channels_fig.show()



In [None]:
# Overlay of the two daily series. Each series is divided by its own maximum so
# both fit on one axis whose largest value is 1.
fig = go.Figure()

creation_date_series_all_normalized = creation_date_series_all / creation_date_series_all.max()
creation_date_series_lore_normalized = creation_date_series_lore / creation_date_series_lore.max()

# The two px.bar figures are only used as trace factories; their own titles and
# barmode are not carried over to the combined figure.
all_channels_fig_probability = px.bar(
    creation_date_series_all_normalized,
    x=creation_date_series_all_normalized.index,
    y="count",
    title="Messages on all channels",
    labels={"count": "Count", "created_at": "Date"},
    barmode="group",
    color_discrete_sequence=["blue"],
)

lore_channels_fig_probability = px.bar(
    creation_date_series_lore_normalized,
    x=creation_date_series_lore_normalized.index,
    y="count",
    title="Messages on lore channels",
    labels={"count": "Count", "created_at": "Date"},
    barmode="group",
    color_discrete_sequence=["red"],
)

# All-channel bars go in first and fully opaque; the lore bars follow at half
# opacity so the bars underneath remain visible.
for source_fig, opacity in (
    (all_channels_fig_probability, 1),
    (lore_channels_fig_probability, 0.5),
):
    for trace in source_fig.data:
        trace.opacity = opacity
        fig.add_trace(trace)

fig.update_traces(marker_line_width=0)

fig.update_layout(
    title="Lore channels on top of all channels",
    xaxis_title="Date",
    yaxis_title="Messages to max ratio",
    barmode="overlay",
    template="plotly",
    bargap=0,
    bargroupgap=0,
)

fig.show()

In [None]:
# Yearly message volume across all channels.
# NOTE(review): passing None presumably means "do not restrict" — confirm in
# ChannelAnalysis.restrict_to_channels.
all_channels: list[int] | None = None
channel_analysis: ChannelAnalysis = ChannelAnalysis(message_df=main_channel_analysis.messages_df)
channel_analysis.restrict_to_channels(all_channels)

logging.getLogger().info("Plotting message count by day")
df: pd.DataFrame = channel_analysis.day_number_of_messages().copy()
creation_date_series_all_yearly = df["created_at"].value_counts().sort_index()

# Replace every index label by its calendar year. The series keeps one entry per
# original `created_at` value; the histogram below adds the counts up per bin.
creation_date_series_all_yearly.index = pd.to_datetime(creation_date_series_all_yearly.index).year

all_channels_fig = px.histogram(
    creation_date_series_all_yearly,
    x=creation_date_series_all_yearly.index,
    y="count",
    title="Messages on all channels yearly",
    labels={"count": "Count", "created_at": "Date"},
    barmode="group",
)
all_channels_fig.show()


In [None]:
# Yearly message volume, restricted to the lore channels.
channel_analysis: ChannelAnalysis = ChannelAnalysis(message_df=main_channel_analysis.messages_df)
channel_analysis.restrict_to_channels(lore_channels)

logging.getLogger().info("Plotting message count by day")
df: pd.DataFrame = channel_analysis.day_number_of_messages().copy()
creation_date_series_lore_yearly = df["created_at"].value_counts().sort_index()

# Replace every index label by its calendar year. The series keeps one entry per
# original `created_at` value; the histogram below adds the counts up per bin.
creation_date_series_lore_yearly.index = pd.to_datetime(creation_date_series_lore_yearly.index).year

# NOTE(review): the variable is still called `all_channels_fig` so that anything
# relying on the name keeps working, but it holds the LORE figure here —
# consider renaming it once all users of the name are known.
all_channels_fig = px.histogram(
    creation_date_series_lore_yearly,
    x=creation_date_series_lore_yearly.index,
    y="count",
    # The data is restricted to lore_channels, so the title must say so.
    title="Messages on lore channels yearly",
    labels={"count": "Count", "created_at": "Date"},
    color_discrete_sequence=["red"],
    barmode="group",
)
all_channels_fig.show()


In [None]:
# Combined yearly figure: lore channels drawn over all channels, each series
# scaled by its own busiest year.
fig = go.Figure()

# The *_yearly series only had their index labels replaced by the year, so they
# still hold one entry per original `created_at` value. Sum them per year first:
# otherwise every entry becomes its own bar, the bars of one year overlap
# instead of adding up under barmode='overlay', and the division below would
# use the busiest single entry rather than the busiest year.
yearly_totals_all = creation_date_series_all_yearly.groupby(level=0).sum()
yearly_totals_lore = creation_date_series_lore_yearly.groupby(level=0).sum()

creation_date_series_all_normalized = yearly_totals_all / yearly_totals_all.max()
creation_date_series_lore_normalized = yearly_totals_lore / yearly_totals_lore.max()

# The two px.bar figures are only used as trace factories; their own titles and
# barmode are not carried over to the combined figure.
all_channels_fig_probability = px.bar(
    creation_date_series_all_normalized,
    x=creation_date_series_all_normalized.index,
    y="count",
    title="Messages on all channels",
    labels={"count": "Count", "created_at": "Date"},
    barmode="group",
    color_discrete_sequence=["blue"],
)

lore_channels_fig_probability = px.bar(
    creation_date_series_lore_normalized,
    x=creation_date_series_lore_normalized.index,
    y="count",
    title="Messages on lore channels",
    labels={"count": "Count", "created_at": "Date"},
    barmode="group",
    color_discrete_sequence=["red"],
)

# All-channel bars go in first and fully opaque; the lore bars follow at half
# opacity so the bars underneath remain visible.
for trace in all_channels_fig_probability.data:
    trace.opacity = 1
    fig.add_trace(trace)
for trace in lore_channels_fig_probability.data:
    trace.opacity = 0.5
    fig.add_trace(trace)

fig.update_traces(marker_line_width=0)

fig.update_layout(
    title="Lore channels on top of all channels yearly",
    xaxis_title="Date",
    yaxis_title="Messages to max ratio",
    barmode="overlay",
    template="plotly",
    bargap=0,
    bargroupgap=0,
)

fig.show()

In [None]:
from urllib.request import urlopen
import json
# Region outlines used by every choropleth in this notebook.
# NOTE(review): absolute, machine-specific path — move the file next to the
# notebook (or behind a configurable data directory) before sharing.
with urlopen('file:///home/stefan/Downloads/features(61).geojson') as response:
    counties = json.load(response)

# Messages from every channel EXCEPT the lore channels, with bot authors removed.
region_occurence_offtop_analysis = ChannelAnalysis.from_channel_analysis(main_channel_analysis)
region_occurence_offtop_analysis.remove_channels_from_current(lore_channels)
region_occurence_messages = region_occurence_offtop_analysis.copy_messages_df_current_channels()
region_occurence_messages = region_occurence_messages[~region_occurence_messages["author_id"].isin(bot_users)]

# Lower-case each message once, rather than once per region.
lowered_messages = [content.lower() for content in region_occurence_messages["content"]]

# For every region: total number of substring hits of any of its phrases over
# all messages. Matching is done against lower-cased text, so the phrases in
# `regions` are expected to be lower-case already.
region_hits: dict[str, int] = {
    region_id: sum(text.count(region_value) for text in lowered_messages for region_value in region_values)
    for region_id, region_values in regions.items()
}

region_occurence_count = pd.DataFrame.from_dict(region_hits, orient='index', columns=["occurence"])
fig = px.choropleth(
    region_occurence_count,
    geojson=counties,
    locations=region_occurence_count.index,
    color="occurence",
    color_continuous_scale="Viridis",
    featureidkey="properties.id",
    # This cell covers the non-lore channels ("Pozostałe kanały" = remaining channels).
    title="Wystąpienia kraju (Pozostałe kanały)",
)
fig.update_geos(visible=False)
fig.update_layout(margin={"r": 0, "t": 100, "l": 0, "b": 0})
fig.show()


In [None]:
# Messages from the lore channels only, with bot authors removed.
region_occurence_lore_analysis = ChannelAnalysis.from_channel_analysis(main_channel_analysis)
region_occurence_lore_analysis.restrict_to_channels(lore_channels)
region_occurence_messages = region_occurence_lore_analysis.copy_messages_df_current_channels()
region_occurence_messages = region_occurence_messages[~region_occurence_messages["author_id"].isin(bot_users)]

# Lower-case each message once, rather than once per region.
lowered_messages = [content.lower() for content in region_occurence_messages["content"]]

# For every region: total number of substring hits of any of its phrases over
# all messages. Matching is done against lower-cased text, so the phrases in
# `regions` are expected to be lower-case already.
region_hits: dict[str, int] = {
    region_id: sum(text.count(region_value) for text in lowered_messages for region_value in region_values)
    for region_id, region_values in regions.items()
}

# Later cells read `region_occurence_count` as this lore-channel DataFrame.
region_occurence_count = pd.DataFrame.from_dict(region_hits, orient='index', columns=["occurence"])
fig = px.choropleth(
    region_occurence_count,
    geojson=counties,
    locations=region_occurence_count.index,
    color="occurence",
    color_continuous_scale="Viridis",
    featureidkey="properties.id",
    # This cell covers the lore channels ("Kanały Lorowe").
    title="Wystąpienia kraju (Kanały Lorowe)",
)
fig.update_geos(visible=False)
fig.update_layout(margin={"r": 0, "t": 150, "l": 0, "b": 0})
fig.show()

In [None]:
# Same map, keeping only the regions whose occurrence count lies strictly below
# the 70th percentile — presumably so the most-mentioned regions do not dominate
# the colour scale. Reads whichever `region_occurence_count` DataFrame was
# built last.
occurence_cutoff = region_occurence_count["occurence"].quantile(0.70)
region_occurence_count_down = region_occurence_count[region_occurence_count["occurence"] < occurence_cutoff]
fig = px.choropleth(
    region_occurence_count_down,
    geojson=counties,
    locations=region_occurence_count_down.index,
    color="occurence",
    color_continuous_scale="Viridis",
    featureidkey="properties.id",
)
fig.update_geos(visible=False)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
# Same map without the regions listed in `human_countries` (the title calls the
# remainder "NPC" countries). Reads whichever `region_occurence_count`
# DataFrame was built last.
is_human_country = region_occurence_count.index.isin(human_countries)
region_occurence_count_non_human = region_occurence_count[~is_human_country]
fig = px.choropleth(
    region_occurence_count_non_human,
    geojson=counties,
    locations=region_occurence_count_non_human.index,
    color="occurence",
    color_continuous_scale="Viridis",
    featureidkey="properties.id",
    title="Wystąpienia krajów NPC",
)
fig.update_geos(visible=False)
fig.update_layout(margin={"r": 0, "t": 150, "l": 0, "b": 0})
fig.show()

In [None]:
#channel words to lore phrases
import datetime


# Per-message working frame: a copy of every message, minus those written by bots.
words_to_lore_phrases = main_channel_analysis.messages_df.copy()
words_to_lore_phrases = words_to_lore_phrases[~words_to_lore_phrases["author_id"].isin(bot_users)]

# Timestamps are parsed with two formats, with and without fractional seconds.
# Each pass turns non-matching values into NaT, and the second pass fills the
# gaps left by the first.
raw_created_at = words_to_lore_phrases["created_at"]
created_at_with_fraction = pd.to_datetime(raw_created_at, utc=True, format="%Y-%m-%d %H:%M:%S.%f%z", errors='coerce')
created_at_without_fraction = pd.to_datetime(raw_created_at, utc=True, format="%Y-%m-%d %H:%M:%S%z", errors='coerce')
words_to_lore_phrases["created_at"] = created_at_with_fraction.fillna(created_at_without_fraction)

# Whitespace-separated token count of every message.
words_to_lore_phrases["words"] = words_to_lore_phrases["content"].apply(lambda text: len(text.split()))

def lore_occurences_in_message(message_content: str) -> int:
    result = 0
    for region_id, region_values in regions.items():
        message_content = message_content.lower()
        for region_value in region_values:
            result += message_content.count(region_value)
    return result
# Number of lore-phrase hits in every message.
words_to_lore_phrases["lore_phrases"] = words_to_lore_phrases["content"].apply(lore_occurences_in_message)
def number_of_attachments(x: list | None):
    if(x is None):
        return 0
    else:
        return len(x)
# Attachment count and character length of every message.
words_to_lore_phrases["attachments_count"] = words_to_lore_phrases["attachments"].apply(number_of_attachments)
words_to_lore_phrases["message_length"] = words_to_lore_phrases["content"].apply(len)
def test(x):
    print(x)
    print(x.rolling("30min", center=True, on="created_at")["created_at"].count())
    return x.rolling("30min", center=True, on="created_at")["created_at"].count()
# Per-channel rolling count: `test` is applied to each channel's rows and
# returns, per message, the number of messages in the 30-minute window centred on it.
b = words_to_lore_phrases.groupby('channel_id').apply(lambda group: test(group))
# Move the outermost index level of the grouped result into a column so the
# remaining index can align with words_to_lore_phrases in the assignment below.
# NOTE(review): assumes apply returned a Series indexed by (channel_id, original row label) — confirm.
b = b.reset_index(level=0)
words_to_lore_phrases["followup_messages"] = b["created_at"]
# From here on words_to_lore_phrases holds one row per channel instead of one per message.
words_to_lore_phrases = words_to_lore_phrases.groupby("channel_id").agg(
    total_words=("words", 'sum'),
    total_lore_phrases=("lore_phrases", 'sum'),
    total_attachments=("attachments_count", "sum"),
    # Despite the "avg" prefix this column is the MEDIAN message length.
    avg_message_length=("message_length", "median"),
    total_followup_messages=("followup_messages", "sum")
)
# Keep only channels with at least 200 words in total.
words_to_lore_phrases = words_to_lore_phrases[words_to_lore_phrases["total_words"] >= 200]
# The index is channel_id after the aggregation; get_channel_name returns None for unknown ids.
words_to_lore_phrases["channel_name"] = words_to_lore_phrases.index.map(get_channel_name)
# Derived scores, each built on the previous one:
#   lore hits per word -> multiplied by attachment count -> multiplied by median message length.
words_to_lore_phrases["lore_to_words_ratio"] = words_to_lore_phrases["total_lore_phrases"]/words_to_lore_phrases["total_words"]
words_to_lore_phrases["attachments_lore_to_words_ratio"] = words_to_lore_phrases["lore_to_words_ratio"]*words_to_lore_phrases["total_attachments"]
words_to_lore_phrases["word_length_altw"] = words_to_lore_phrases["attachments_lore_to_words_ratio"]*words_to_lore_phrases["avg_message_length"]
# Last expression of the cell: displays the per-channel table.
words_to_lore_phrases
