# Data preparation

In [None]:
import sys
import os
# Make the modules that live under ./src importable by their plain names.
main_folder = os.getcwd()
src_path = os.path.join(main_folder, "src")
# Register the folder only once, so re-running this cell never duplicates the entry.
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)
import logging
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
from VariableTypeEnum import VariableTypeEnum
from VarsManager import VarsManager
from src.ChannelAnalysis import ChannelAnalysis
from LoreInfo import regions, bot_users, lore_channels, human_countries, ml_channels, dig_channels, other_lore_words, special_characters, emojis, log_channels, offtop_ml_channels

# Timestamped log lines at INFO level for the rest of the notebook.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Name lookups exposed by VarsManager, addressed by VariableTypeEnum member.
vars_manager = VarsManager()
channel_names, user_names = (
    vars_manager.vars[variable_type]
    for variable_type in (VariableTypeEnum.CHANNEL_NAMES, VariableTypeEnum.USER_NICKNAMES)
)
def get_channel_name(id):
    """Return the name stored in ``channel_names`` for a channel id.

    The id is converted to ``str`` before the lookup. ``None`` is returned
    when ``channel_names`` has no entry for it.
    """
    key = str(id)
    return channel_names[key] if key in channel_names else None
def get_user_name(id):
    """Return the nickname stored in ``user_names`` for a user id.

    The id is converted to ``str`` and passed through the built-in ``hash``
    before the lookup. When ``user_names`` has no entry, the hash itself is
    returned as text.
    """
    # NOTE(review): hash() of a str is salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so this lookup only matches if the user_names
    # keys were produced under the same seed - confirm against VarsManager.
    key = hash(str(id))
    return user_names[key] if key in user_names else str(key)
    
# Declared at module level so the cells below can refer to the analysis object.
main_channel_analysis: ChannelAnalysis

# Build the analysis object only when this file runs as the main module.
if __name__ == "__main__":
    logging.info("Preparing data")
    main_channel_analysis = ChannelAnalysis(content_matters=True)
    logging.info("Starting the analysis")

In [None]:
# Initialise plotly's offline notebook mode before any figure is shown.
plotly.offline.init_notebook_mode()

# Clean data and create numerical values for analysis

In [4]:
from plotly.subplots import make_subplots

df: pd.DataFrame = main_channel_analysis.messages_df.copy()

# Keep messages that are outside the log channels, not written by bots, and
# carry either more than two characters of text or at least one attachment.
in_log_channel = df["channel_id"].isin(log_channels)
has_payload = (df["content"].str.len() > 2) | (df["attachments"].str.len() > 0)
written_by_bot = df["author_id"].isin(bot_users)
df = df[~in_log_channel & has_payload & ~written_by_bot]

# Timestamps come with or without fractional seconds: parse with the
# fractional format first and fill the failures from the plain format.
raw_created_at = df["created_at"]
parsed_with_fraction = pd.to_datetime(raw_created_at, utc=True, format="%Y-%m-%d %H:%M:%S.%f%z", errors='coerce')
parsed_without_fraction = pd.to_datetime(raw_created_at, utc=True, format="%Y-%m-%d %H:%M:%S%z", errors='coerce')
df["created_at"] = parsed_with_fraction.fillna(parsed_without_fraction)

# Whitespace-separated word count and raw character count of every message.
df["words"] = df["content"].apply(lambda text: len(text.split()))
df["message_length"] = df["content"].apply(len)
def number_of_attachments(x: list | None):
    if(x is None):
        return 0
    else:
        return len(x)
# Attachment count per message (0 when the attachments value is None).
df["attachments_count"] = df["attachments"].apply(number_of_attachments)
def special_characters_number(message_content: str) -> int:
    result = 0
    for character in special_characters:
        found = message_content.find(character)
        if(found > 0):
            result += found
    return result
# Special-character score of every message.
df["special_characters"] = df["content"].apply(special_characters_number)
def lore_occurences_in_message(message_content: str) -> int:
    result = 0
    for region_id, region_values in regions.items():
        message_content = message_content.lower()
        for region_value in region_values:
            result += message_content.count(region_value)
    for lore_word in other_lore_words:
        result += message_content.count(lore_word)
    return result
# Lore-phrase count of every message.
df["lore_phrases"] = df["content"].apply(lore_occurences_in_message)

# These raw columns are not used by the analysis below.
unused_columns = ["attachments", "reactions", "edited_at", "referenced_message"]
df = df.drop(columns=unused_columns)

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Summary statistics of the cleaned frame.
df.describe()

In [None]:
# One box plot per numeric column, laid out on a two-column grid.
num_cols = df.select_dtypes(include='number').columns
grid_cols = 2
# Size the grid from the data instead of a fixed 7 rows, which failed as soon
# as there were more than 14 numeric columns. At least one row is kept so an
# empty selection still yields a valid figure.
grid_rows = max(1, -(-len(num_cols) // grid_cols))
fig = make_subplots(rows=grid_rows, cols=grid_cols)
for position, col in enumerate(num_cols):
    # Fill the grid left to right, top to bottom (plotly rows/cols are 1-based).
    row_num, col_num = divmod(position, grid_cols)
    fig.add_trace(go.Box(y=df[col], name=col), row=row_num + 1, col=col_num + 1)
fig.update_layout(height=1300, width=600, title_text="Side By Side Subplots")
fig.show()

The data is still quite dirty even after filtering out log channels and bot messages, but at the same time it is quite interesting.

# Plot server messages
We can clearly see several boom periods. What's interesting is that messages on non-lore channels grew with messages on lore channels, pointing to a correlation between the two.

In [None]:
# Number of messages per calendar day, in chronological order.
creation_date_series = df["created_at"].dt.date.value_counts().sort_index()

# NOTE(review): y="count" relies on value_counts() naming its result "count"
# (pandas 2.x behaviour) - confirm against the pinned pandas version.
all_channels_fig = px.bar(
    creation_date_series,
    x=creation_date_series.index,
    y="count",
    title="Messages on the server",
    labels={
        "count": "Count",
        "created_at": "Date",
    },
    barmode="group",
    color_discrete_sequence=['purple'],
)

# Style the existing trace in place. The previous revision re-added every
# trace to the same figure, which drew each bar twice.
all_channels_fig.update_traces(opacity=1, marker_line_width=0)
all_channels_fig.show()

In [None]:
# Number of lore-channel messages per calendar day, in chronological order.
creation_date_series_lore: pd.Series = (
    df.loc[df["channel_id"].isin(lore_channels), "created_at"]
    .dt.date
    .value_counts()
    .sort_index()
)

# NOTE(review): y="count" relies on value_counts() naming its result "count"
# (pandas 2.x behaviour) - confirm against the pinned pandas version.
lore_channels_fig = px.bar(
    creation_date_series_lore,
    x=creation_date_series_lore.index,
    y="count",
    title="Messages on lore channels",
    labels={
        "count": "Count",
        "created_at": "Date",
    },
    barmode="group",
    color_discrete_sequence=['red'],
)

# Style the existing trace in place. The previous revision re-added every
# trace to the same figure, which drew each bar twice.
lore_channels_fig.update_traces(opacity=1, marker_line_width=0)
lore_channels_fig.show()

In [None]:
# Number of non-lore ("offtop") messages per calendar day, in chronological order.
creation_date_series_offtop: pd.Series = (
    df.loc[~df["channel_id"].isin(lore_channels), "created_at"]
    .dt.date
    .value_counts()
    .sort_index()
)

# NOTE(review): y="count" relies on value_counts() naming its result "count"
# (pandas 2.x behaviour) - confirm against the pinned pandas version.
offtop_channels_fig = px.bar(
    creation_date_series_offtop,
    x=creation_date_series_offtop.index,
    y="count",
    title="Messages on offtop channels",
    labels={
        "count": "Count",
        "created_at": "Date",
    },
    barmode="group",
    color_discrete_sequence=['blue'],
)

# Style the existing trace in place. The previous revision re-added every
# trace to the same figure, which drew each bar twice.
offtop_channels_fig.update_traces(opacity=1, marker_line_width=0)
offtop_channels_fig.show()


In [None]:
# Overlay the raw daily message counts of offtop (blue) and lore (red) channels.
fig = go.Figure()

# The series are NOT rescaled in this cell; the "_normalized" names are kept
# only because the variables are reused under these names further down.
creation_date_series_offtop_normalized = creation_date_series_offtop.copy()
creation_date_series_lore_normalized = creation_date_series_lore.copy()

# Only the traces of these helper figures are reused, so no layout options
# (title, barmode) are set on them.
all_channels_fig_probability = px.bar(
    creation_date_series_offtop_normalized,
    x=creation_date_series_offtop_normalized.index,
    y="count",
    labels={
        "count": "Count",
        "created_at": "Date",
    },
    color_discrete_sequence=['blue'],
)

lore_channels_fig_probability = px.bar(
    creation_date_series_lore_normalized,
    x=creation_date_series_lore_normalized.index,
    y="count",
    labels={
        "count": "Count",
        "created_at": "Date",
    },
    color_discrete_sequence=['red'],
)

# Offtop bars are opaque; lore bars are drawn half transparent on top of them.
for trace in all_channels_fig_probability.data:
    trace.opacity = 1
    fig.add_trace(trace)
for trace in lore_channels_fig_probability.data:
    trace.opacity = 0.5
    fig.add_trace(trace)

fig.update_traces(marker_line_width=0)
fig.update_layout(
    title="Lore channels and offtop channels",
    xaxis_title="Date",
    # Raw counts are plotted here, so the axis must not be labelled as a ratio.
    yaxis_title="Messages",
    barmode='overlay',
    template="plotly",
    bargap=0,
    bargroupgap=0
)

fig.show()

In [None]:
fig = go.Figure()

# Scale each daily series by its own busiest day so both share a 0..1 axis.
creation_date_series_offtop_normalized = creation_date_series_offtop / creation_date_series_offtop.max()
creation_date_series_lore_normalized = creation_date_series_lore / creation_date_series_lore.max()

axis_labels = {"count": "Count", "created_at": "Date"}

all_channels_fig_probability = px.bar(
    creation_date_series_offtop_normalized,
    x=creation_date_series_offtop_normalized.index,
    y="count",
    title="Messages on all channels",
    labels=dict(axis_labels),
    barmode="group",
    color_discrete_sequence=['blue'],
)
lore_channels_fig_probability = px.bar(
    creation_date_series_lore_normalized,
    x=creation_date_series_lore_normalized.index,
    y="count",
    title="Messages on lore channels",
    labels=dict(axis_labels),
    barmode="group",
    color_discrete_sequence=['red'],
)

# Offtop bars are opaque; lore bars are drawn half transparent on top of them.
layers = ((all_channels_fig_probability, 1), (lore_channels_fig_probability, 0.5))
for source_fig, layer_opacity in layers:
    for trace in source_fig.data:
        trace.opacity = layer_opacity
        fig.add_trace(trace)

fig.update_traces(marker_line_width=0)
fig.update_layout(
    barmode='overlay',
    bargap=0,
    bargroupgap=0,
    template="plotly",
    title="Lore channels and offtop channels",
    xaxis_title="Date",
    yaxis_title="Messages to max ratio",
)

fig.show()

In [None]:
# Daily counts of offtop (non-lore) messages, with each day's entry re-labelled
# by its year; the histogram below sums the entries that share a year.
creation_date_series_all_yearly: pd.Series = (
    df.loc[~df["channel_id"].isin(lore_channels), "created_at"]
    .dt.date
    .value_counts()
    .sort_index()
)
creation_date_series_all_yearly.index = pd.to_datetime(creation_date_series_all_yearly.index).year

offtop_channels_fig = px.histogram(
    creation_date_series_all_yearly,
    x=creation_date_series_all_yearly.index,
    y="count",
    title="Messages on offtop channels yearly",
    labels={"count": "Count", "created_at": "Date"},
    barmode="group",
)
offtop_channels_fig.show()


In [None]:
# Daily counts of lore-channel messages, with each day's entry re-labelled by
# its year; the histogram below sums the entries that share a year.
creation_date_series_lore_yearly: pd.Series = (
    df.loc[df["channel_id"].isin(lore_channels), "created_at"]
    .dt.date
    .value_counts()
    .sort_index()
)
creation_date_series_lore_yearly.index = pd.to_datetime(creation_date_series_lore_yearly.index).year

# Stored under its own name: the previous revision assigned this lore figure
# to `offtop_channels_fig`, overwriting the offtop figure.
lore_channels_yearly_fig = px.histogram(
    creation_date_series_lore_yearly,
    x=creation_date_series_lore_yearly.index,
    y="count",
    title="Messages on lore channels yearly",
    labels={
        "count": "Count",
        "created_at": "Date",
    },
    color_discrete_sequence=['red'],
    barmode="group",
)
lore_channels_yearly_fig.show()


In [None]:
fig = go.Figure()

# Collapse the year-labelled daily counts into one total per year, then scale
# each series by its own busiest year.
yearly_offtop_totals = creation_date_series_all_yearly.groupby(level=0).sum()
yearly_lore_totals = creation_date_series_lore_yearly.groupby(level=0).sum()
creation_date_series_offtop_normalized = yearly_offtop_totals / yearly_offtop_totals.max()
creation_date_series_lore_normalized = yearly_lore_totals / yearly_lore_totals.max()

axis_labels = {"count": "Count", "created_at": "Date"}

all_channels_fig_probability = px.bar(
    creation_date_series_offtop_normalized,
    x=creation_date_series_offtop_normalized.index,
    y="count",
    title="Messages on all channels",
    labels=dict(axis_labels),
    barmode="group",
    color_discrete_sequence=['blue'],
)
lore_channels_fig_probability = px.bar(
    creation_date_series_lore_normalized,
    x=creation_date_series_lore_normalized.index,
    y="count",
    title="Messages on lore channels",
    labels=dict(axis_labels),
    barmode="group",
    color_discrete_sequence=['red'],
)

# Offtop bars are opaque; lore bars are drawn half transparent on top of them.
layers = ((all_channels_fig_probability, 1), (lore_channels_fig_probability, 0.5))
for source_fig, layer_opacity in layers:
    for trace in source_fig.data:
        trace.opacity = layer_opacity
        fig.add_trace(trace)

fig.update_traces(marker_line_width=0)
fig.update_layout(
    barmode='overlay',
    bargap=0,
    bargroupgap=0,
    template="plotly",
    title="Lore channels on top of offtop channels yearly",
    xaxis_title="Date",
    yaxis_title="Messages to max ratio",
)

fig.show()

It is clear that the pandemic year of 2020 was the peak for both lore and non-lore participation. A trend can also be seen (ignoring 2017-2018, when a lot of lore was being written on the offtop channels), pointing to a shift of the server's profile from an RP-based server to a more community-based one where lore is a secondary endeavour.

# Server messages by weekday
What's interesting is that while channels related to real life show a decrease on weekends, the lore channels experience a decrease on Thursdays, but on a smaller scale.

In [None]:
from matplotlib import pyplot as plt

def plot_graph(weekday_df: pd.DataFrame, label: str):
    """Plot which share of the given messages was sent on each weekday.

    Changes from the previous revision: the caller's frame is no longer
    modified, weekdays without any message are plotted as 0 instead of being
    left out, and ``plt.show()`` is called without the stray positional
    ``ax`` argument.

    Args:
        weekday_df: Messages with a ``created_at`` column holding datetimes
            or strings in one of the two timestamp formats parsed below.
        label: Title of the plot.
    """
    # Timestamps come with or without fractional seconds; anything that
    # matches neither format becomes NaT and is ignored below.
    created_at = pd.to_datetime(weekday_df["created_at"], utc=True, format="%Y-%m-%d %H:%M:%S.%f%z", errors='coerce').fillna(
        pd.to_datetime(weekday_df["created_at"], utc=True, format="%Y-%m-%d %H:%M:%S%z", errors='coerce')
    )

    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Messages per weekday (0 = Monday), always covering all seven days in order.
    weekday_counts = (
        created_at.dropna().dt.weekday.value_counts().reindex(range(len(day_names)), fill_value=0)
    )
    all_messages_count = weekday_counts.sum()
    if all_messages_count == 0:
        # Avoid dividing by zero when there is nothing to plot.
        all_messages_count = 1
    shares = weekday_counts / all_messages_count

    plot_df = pd.DataFrame({"weekday_name": day_names, "created_at": shares.to_numpy()})
    plot_df.plot(x="weekday_name", xlabel="Weekday", y="created_at", ylabel="Messages", title=label, ylim=0, legend=False)
    plt.show()

# Weekday profile of the whole server, the lore channels, everything else and
# the single channel treated as real-life related.
is_lore_channel = df["channel_id"].isin(lore_channels)
weekday_views = (
    ("All channels", df),
    ("Lore channels", df[is_lore_channel]),
    ("Offtop channels", df[~is_lore_channel]),
    ("Real-life related channels", df[df["channel_id"].isin([576027933634854912])]),
)
for view_label, view_df in weekday_views:
    # plot_graph receives its own copy, as before.
    plot_graph(view_df.copy(), view_label)

# Participation by user

One very active user (me, the server owner) stands out. The others are fairly equally active, but some participate in lore more than others.

In [None]:
# Messages per author outside the lore channels, labelled with the author's name.
offtop_messages = df[~df["channel_id"].isin(lore_channels)]
user_participation_df = (
    offtop_messages.groupby("author_id")
    .count()
    .assign(name=lambda counts: counts.index.map(get_user_name))
)
fig = px.pie(user_participation_df, names='name', values='id', title='Offtop user participation on the server')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [None]:
# Messages per author inside the lore channels, labelled with the author's name.
lore_messages = df[df["channel_id"].isin(lore_channels)]
user_participation_lore_df = (
    lore_messages.groupby("author_id")
    .count()
    .assign(name=lambda counts: counts.index.map(get_user_name))
)
fig = px.pie(user_participation_lore_df, names='name', values='id', title='Lore user participation on the server')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

# Maps
Maps were the most interesting part for me personally, but analytically only two things stand out: active human-led countries have a much higher mention ratio, and NPC countries that have historically been important or funny have a higher ratio as well.

In [None]:
from urllib.request import urlopen
import json
# NOTE(review): machine-specific absolute path - the notebook only runs where
# this file exists; consider a path relative to the project.
with urlopen('file:///home/stefan/Downloads/features(61).geojson') as response:
    counties = json.load(response)

region_occurence_all_analysis: pd.DataFrame = df.copy()

# Lower-case every message once, instead of once per region as before.
lowered_messages = [content.lower() for content in region_occurence_all_analysis["content"]]

# Total number of mentions of each region across all messages.
region_mentions: dict[str, int] = {
    region_id: sum(
        message.count(region_value)
        for message in lowered_messages
        for region_value in region_values
    )
    for region_id, region_values in regions.items()
}

import plotly.express as px

region_occurence_count = pd.DataFrame.from_dict(region_mentions, orient='index', columns=["occurence"])
fig = px.choropleth(region_occurence_count, geojson=counties, locations=region_occurence_count.index, color="occurence",
                           color_continuous_scale="Viridis",
                           # Label the column that is actually plotted; the old
                           # 'unemp' entry named a column that does not exist here.
                           labels={'occurence': 'Mentions'},
                            featureidkey="properties.id",
                            title="Country mentions"
                          )
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":100,"l":0,"b":0})
fig.show()


In [None]:
region_occurence_all_analysis: pd.DataFrame = df.copy()

# Lower-case every message once, instead of once per region as before.
lowered_messages = [content.lower() for content in region_occurence_all_analysis["content"]]

# Total number of mentions of each region across all messages.
region_mentions: dict[str, int] = {
    region_id: sum(
        message.count(region_value)
        for message in lowered_messages
        for region_value in region_values
    )
    for region_id, region_values in regions.items()
}

region_occurence_count = pd.DataFrame.from_dict(region_mentions, orient='index', columns=["occurence"])
# Keep only the regions below the 70th percentile of mentions, so the most
# mentioned ones do not dominate the colour scale.
region_occurence_count_down = region_occurence_count[region_occurence_count["occurence"] < region_occurence_count["occurence"].quantile(0.70)]
fig = px.choropleth(region_occurence_count_down, geojson=counties, locations=region_occurence_count_down.index, color="occurence",
                           color_continuous_scale="Viridis",
                           # Label the column that is actually plotted; the old
                           # 'unemp' entry named a column that does not exist here.
                           labels={'occurence': 'Mentions'},
                            featureidkey="properties.id",
                            title="Country mentions with top countries removed"
                          )
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":150,"l":0,"b":0})
fig.show()

In [None]:
region_occurence_all_analysis: pd.DataFrame = df.copy()

# Lower-case every message once, instead of once per region as before.
lowered_messages = [content.lower() for content in region_occurence_all_analysis["content"]]

# Total number of mentions of each region across all messages.
region_mentions: dict[str, int] = {
    region_id: sum(
        message.count(region_value)
        for message in lowered_messages
        for region_value in region_values
    )
    for region_id, region_values in regions.items()
}

region_occurence_count = pd.DataFrame.from_dict(region_mentions, orient='index', columns=["occurence"])
# Drop the regions listed in human_countries; the rest are plotted as NPC countries.
region_occurence_count_non_human = region_occurence_count[~region_occurence_count.index.isin(human_countries)]
fig = px.choropleth(region_occurence_count_non_human, geojson=counties, locations=region_occurence_count_non_human.index, color="occurence",
                           color_continuous_scale="Viridis",
                           # Label the column that is actually plotted; the old
                           # 'unemp' entry named a column that does not exist here.
                           labels={'occurence': 'Mentions'},
                            featureidkey="properties.id",
                            title="NPC-countries mentions"
                          )
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":150,"l":0,"b":0})
fig.show()

# Machine Learning
I decided that I needed a machine learning model that would classify messages as "being lore or a lore-related discussion", in order to search through a big, old and mixed channel in which some old lore messages are buried beneath off-topic spam.

After analysing the contents of numerous messages I realized that lore messages usually differ in: mentions of lore phrases (obviously), use of special characters (when people get serious they care more about punctuation) and activity around the message (random memes about lore contain lore phrases but rarely attract any activity beyond a simple "XD").

I have tried Random Forest and XGBoost models. XGBoost achieved better accuracy and precision, but its lower recall for lore messages made it slightly worse when tested on channels outside the dataset. It is also interesting that XGBoost decided that special characters are more important than lore phrases in follow-up messages, which may explain the differences in accuracy and recall.

In [None]:
# Work on a separate copy so the feature engineering below leaves df untouched.
words_to_lore_phrases = df.copy()

def replace_emojis(message_content: str):
    import re
    emoji_char = "�"
    result = re.sub("<.+:.+>", emoji_char, message_content)
    for emoji in emojis:
        result = result.replace(emoji, emoji_char)
    return result
# Replace emojis in every message with the placeholder character.
words_to_lore_phrases["content"] = words_to_lore_phrases["content"].apply(replace_emojis)

def test(x):
    """Summarise the activity in a 3-hour window centred on each message.

    ``x`` is a frame with ``created_at``, ``lore_phrases`` and
    ``special_characters`` columns. The returned frame has one row per input
    row and three columns: ``created_at`` (number of messages in the window),
    ``lore_phrases`` and ``special_characters`` (sums over the window).
    """
    window_length = "3h"
    # One time-based rolling window on created_at, shared by all three aggregates.
    windows = x.rolling(window_length, center=True, on="created_at")
    result = windows["created_at"].count().to_frame()
    for column in ("lore_phrases", "special_characters"):
        result[column] = windows[column].sum()
    return result
# Compute the window summary separately for every channel, then move the
# channel_id group level back into a column so the rows line up with
# words_to_lore_phrases again.
b = words_to_lore_phrases.groupby('channel_id').apply(test).reset_index(level=0)

# Target column in words_to_lore_phrases -> source column in the summary.
followup_columns = {
    "followup_messages": "created_at",
    "followup_messages_lore": "lore_phrases",
    "followup_messages_special_characters": "special_characters",
}
for target_column, source_column in followup_columns.items():
    words_to_lore_phrases[target_column] = b[source_column]

In [None]:
# Per-channel totals, restricted to channels with at least 200 words, followed
# by a chain of ratios in which each one multiplies the previous by one more
# channel statistic.
words_to_lore_phrases_channels = (
    words_to_lore_phrases.groupby("channel_id")
    .agg(
        total_words=("words", 'sum'),
        total_lore_phrases=("lore_phrases", 'sum'),
        total_attachments=("attachments_count", "sum"),
        avg_message_length=("message_length", "median"),
        avg_followup_messages=("followup_messages", "mean"),
    )
    .loc[lambda stats: stats["total_words"] >= 200]
    .assign(
        channel_name=lambda stats: stats.index.map(get_channel_name),
        lore_to_words_ratio=lambda stats: stats["total_lore_phrases"] / stats["total_words"],
        attachments_lore_to_words_ratio=lambda stats: stats["lore_to_words_ratio"] * stats["total_attachments"],
        word_length_altw=lambda stats: stats["attachments_lore_to_words_ratio"] * stats["avg_message_length"],
        all_variables=lambda stats: stats["word_length_altw"] * stats["avg_followup_messages"],
    )
)
words_to_lore_phrases_channels

# Random Forest

In [None]:
# Modelling
from matplotlib import pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz, plot_tree
from IPython.display import Image
import graphviz
from xgboost import plot_importance

def is_lore_message(channel_id):
    """Return 1 when the channel is one of ``lore_channels``, otherwise 0."""
    return int(channel_id in lore_channels)

# Training data: messages from the ML channels, labelled by whether their
# channel is a lore channel.
words_to_lore_phrases_ml = words_to_lore_phrases[words_to_lore_phrases["channel_id"].isin(ml_channels)].copy()
words_to_lore_phrases_ml["lore_message"] = words_to_lore_phrases_ml["channel_id"].apply(is_lore_message)

# Columns removed before training; the remaining columns are the features.
fields = [
    "id",
    "content",
    "author_id",
    "thread_messages",
    "created_at",
    "pinned",
    "channel_id"
]
words_to_lore_phrases_ml = words_to_lore_phrases_ml.drop(fields, axis=1)

X = words_to_lore_phrases_ml.drop("lore_message", axis=1)
y = words_to_lore_phrases_ml["lore_message"]

# Fixed seeds make the split, the forest and therefore the reported metrics
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

# Draw the top levels of the first two trees of the forest.
for tree in rf.estimators_[:2]: #red is 0, blue is 1
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,
                               filled=True,
                               max_depth=2,
                               impurity=False,
                               proportion=True)
    graph = graphviz.Source(dot_data)
    display(graph)

# Display names for the features; must be in the same order as the columns of X.
feature_names = ['Words', 'Message length', 'Attachments count', 'Special characters', 'Lore phrases', 'Followup messages', 'Followup messagers lore phrases', 'Followup messages special characters']
import pandas as pd
# Importances of the whole forest. The previous revision plotted
# `tree.feature_importances_`, i.e. only the last tree drawn above.
pd.Series(rf.feature_importances_, index=feature_names).plot.bar()

print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the latest predictions.
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()

# Headline metrics for the same predictions.
accuracy, precision, recall = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score)
)
for metric_label, metric_value in (("Accuracy:", accuracy), ("Precision:", precision), ("Recall:", recall)):
    print(metric_label, metric_value)

# XGBoost

In [None]:
# Modelling
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

# Tree Visualisation
from sklearn.tree import export_graphviz, plot_tree
from IPython.display import Image
import graphviz
from xgboost import XGBClassifier

def is_lore_message(channel_id):
    """Return 1 when the channel is one of ``lore_channels``, otherwise 0."""
    return int(channel_id in lore_channels)

# Training data: messages from the ML channels, labelled by whether their
# channel is a lore channel.
words_to_lore_phrases_ml = words_to_lore_phrases[words_to_lore_phrases["channel_id"].isin(ml_channels)].copy()
words_to_lore_phrases_ml["lore_message"] = words_to_lore_phrases_ml["channel_id"].apply(is_lore_message)

# Columns removed before training; the remaining columns are the features.
fields = [
    "id",
    "content",
    "author_id",
    "thread_messages",
    "created_at",
    "pinned",
    "channel_id"
]
words_to_lore_phrases_ml = words_to_lore_phrases_ml.drop(fields, axis=1)

X = words_to_lore_phrases_ml.drop("lore_message", axis=1)
y = words_to_lore_phrases_ml["lore_message"]

# Fixed seeds make the split, the model and therefore the reported metrics
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
from xgboost import plot_importance
# Display names for the features; must be in the same order as the columns of X.
# They replace the booster's own feature names, so any frame passed to
# xgb_model.predict afterwards has to use these names for its columns.
feature_names = ['Words', 'Message length', 'Attachments count', 'Special characters', 'Lore phrases', 'Followup messages', 'Followup messagers lore phrases', 'Followup messages special characters']
xgb_model.get_booster().feature_names = feature_names
plot_importance(xgb_model, importance_type='weight')
plt.title("Feature Importance (XGBoost)")
plt.show()


In [None]:
# Confusion matrix of the latest predictions.
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()

# Headline metrics for the same predictions.
accuracy, precision, recall = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score)
)
for metric_label, metric_value in (("Accuracy:", accuracy), ("Precision:", precision), ("Recall:", recall)):
    print(metric_label, metric_value)

Test the model on a new channel

In [None]:
#scroll through mixed channel and see the results
in_dig_channel = words_to_lore_phrases["channel_id"].isin(dig_channels)

# Columns that are not model features.
fields = [
    "id",
    "content",
    "author_id",
    "thread_messages",
    "created_at",
    "pinned",
    "channel_id"
]
# Column name in the frame -> display name used for the model's features.
feature_labels = {
    'words': 'Words',
    'lore_phrases': 'Lore phrases',
    'special_characters': 'Special characters',
    'attachments_count': 'Attachments count',
    'message_length': 'Message length',
    'followup_messages': 'Followup messages',
    'followup_messages_lore': 'Followup messagers lore phrases',
    'followup_messages_special_characters': 'Followup messages special characters'
}

# Feature matrix for the dig channels, with columns named as the model expects.
words_to_lore_phrases_ml = (
    words_to_lore_phrases[in_dig_channel]
    .drop(fields, axis=1)
    .rename(columns=feature_labels)
)
words_to_lore_phrases_ml["lore_message"] = xgb_model.predict(words_to_lore_phrases_ml)

# Attach the predictions to the full message rows; values align on the index.
words_to_lore_phrases_pred = words_to_lore_phrases.copy()
words_to_lore_phrases_pred = words_to_lore_phrases_pred[in_dig_channel]
words_to_lore_phrases_pred["lore_message"] = words_to_lore_phrases_ml["lore_message"]
words_to_lore_phrases_pred