In [7]:
import os

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [8]:
# Locate the "data" folder that sits in the parent of the working directory.
cwd = os.getcwd()
# cwd is already absolute, so normalising the joined path is enough to
# collapse the trailing ".." component.
parent_dir = os.path.normpath(os.path.join(cwd, os.pardir))
data_dir = os.path.join(parent_dir, "data")

In [9]:
# Load the comment dataset from the data folder into a DataFrame.
dataset_path = os.path.join(
    data_dir, "personalities_split_3comments_10_300_upsamples.json"
)
data = pd.read_json(dataset_path)

In [10]:
# Number of rows in the dataset at this point of the notebook.
len(data)

500516

In [11]:
def group_and_clear(data, comments_threshold=7):
    """Keep only the rows of authors that have enough rows in ``data``.

    Note that duplicate rows are NOT removed: only the per-author volume
    filter is applied.

    Args:
        data: DataFrame with an "author" column. It is not modified.
        comments_threshold: Minimum number of rows (inclusive) an author must
            have for their rows to be kept.

    Returns:
        An independent copy containing the rows of the qualifying authors, in
        their original order. Rows whose author is missing are dropped.

    Side effects:
        Prints the row count before and after the filtering.
    """
    original_length = len(data)
    # value_counts() skips missing authors, so they can never qualify.
    rows_per_author = data["author"].value_counts()
    frequent_authors = rows_per_author.index[rows_per_author >= comments_threshold]
    # Return a copy so that callers can add or overwrite columns without
    # triggering SettingWithCopyWarning on a slice of the input frame.
    kept = data.loc[data["author"].isin(frequent_authors)].copy()
    print(f"Original length:{original_length}, after cleanup: {len(kept)}")
    return kept

In [12]:
# Keep only the rows of authors that have at least 7 rows in the dataset.
data = group_and_clear(data, comments_threshold=7)

Original length:500516, after cleanup: 248869


In [13]:
def sentence_length(data, threshold=5, smaller=False):
    """Filter rows by the number of whitespace-separated words in "comment".

    Args:
        data: DataFrame with a "comment" column. It is not modified.
        threshold: Word-count limit (inclusive in both modes).
        smaller: If False (default), keep comments with at least
            ``threshold`` words; if True, keep comments with at most
            ``threshold`` words.

    Returns:
        An independent copy containing only the rows that pass the filter.
        Empty-string comments and non-string comments (e.g. missing values)
        are always dropped.

    Side effects:
        Prints the row count before and after the filtering.
    """
    original_length = len(data)
    comments = data["comment"]

    is_text = comments.map(lambda text: isinstance(text, str)).astype(bool)
    word_counts = comments.map(
        lambda text: len(text.split()) if isinstance(text, str) else 0
    ).astype(int)

    if smaller:
        within_limit = word_counts <= threshold
    else:
        within_limit = word_counts >= threshold

    # Select rows with a boolean mask instead of blanking rejected comments in
    # the input frame, so the caller's DataFrame is left untouched.
    keep = is_text & within_limit & (comments != "")
    filtered = data.loc[keep].copy()
    print(f"Original length:{original_length}, after cleanup: {len(filtered)}")
    return filtered

In [14]:
# Keep only comments that contain at least 20 words.
data = sentence_length(data, threshold=20, smaller=False)

Original length:248869, after cleanup: 113702


## Personality plots ##

In [15]:
# Drop incomplete rows by rebinding the name rather than mutating in place:
# `data` comes from row filtering, and in-place edits on such frames can raise
# SettingWithCopyWarning.
# NOTE(review): dropna() inspects every column, so a row missing ANY field
# (not just the personality scores) is discarded - confirm this is intended.
data = data.dropna()


def _most_frequent(values):
    """Return the most frequent value of a Series."""
    return values.value_counts().idxmax()


rows_by_author = data.groupby("author")

# Mean score per author for each of the four personality columns.
df_author_mean_personality = rows_by_author[["I", "N", "T", "J"]].mean()

# Each author is assigned the genre / channel they have the most rows in.
df_author_genre = rows_by_author["genre"].apply(_most_frequent)
df_author_channel = rows_by_author["channel"].apply(_most_frequent)

# One row per author: mean scores plus dominant genre and channel.
df_author_mean_personality_genre = pd.concat(
    [df_author_mean_personality, df_author_genre, df_author_channel], axis=1
)

In [16]:
columns_to_scale = ["I", "N", "T", "J"]

# Min-max scale each personality column across authors, then write the scaled
# values back over the original columns.
scaler = MinMaxScaler()
scaled_scores = scaler.fit_transform(
    df_author_mean_personality_genre[columns_to_scale]
)
df_author_mean_personality_genre[columns_to_scale] = scaled_scores

In [17]:
# For each score column: the letter used when the scaled score is above 0.5.
# The column name itself is the letter used otherwise.
personality_mapping = {"I": "E", "N": "S", "T": "F", "J": "P"}

# Replace every scaled score with its letter. Scores that are not above 0.5
# (including missing ones) keep the column's own letter.
for column, high_letter in personality_mapping.items():
    df_author_mean_personality_genre[column] = df_author_mean_personality_genre[
        column
    ].apply(lambda score, low=column, high=high_letter: high if score > 0.5 else low)

# Combine the four letters into one personality string per author
df_author_mean_personality_genre["personality"] = df_author_mean_personality_genre[
    ["I", "N", "T", "J"]
].agg("".join, axis=1)

# Count the authors of each personality type within every channel
result = (
    df_author_mean_personality_genre.groupby(["channel", "personality"])
    .size()
    .reset_index(name="count")
)

# Nested mapping {channel: {personality: count}}. Iterating over the groups
# avoids groupby.apply on a frame that still contains the grouping column,
# which is deprecated in recent pandas versions.
final_result = {
    channel: channel_rows.set_index("personality")["count"].to_dict()
    for channel, channel_rows in result.groupby("channel")
}

In [18]:
# Wide table: one row per personality type, one column per channel; a
# combination that never occurs gets a count of 0.
personality_by_channel = pd.DataFrame(final_result).fillna(0)
df_result = personality_by_channel.reset_index().rename(
    columns={"index": "personality"}
)

# Long table (personality, artist_name, count), largest counts first.
df_melted = df_result.melt(
    id_vars="personality", var_name="artist_name", value_name="count"
)
df_melted = df_melted.sort_values(by="count", ascending=False)

Initial plot: personality type counts for every artist

In [21]:
import plotly.express as px

# Human-readable names for the axes and the legend.
axis_labels = {
    "count": "Count",
    "artist_name": "Artist Name",
    "personality": "Personality Type",
}

# Stacked bars: one bar per artist, coloured by personality type.
fig = px.bar(
    df_melted,
    x="artist_name",
    y="count",
    color="personality",
    labels=axis_labels,
    title="Personality Types Distribution by Artist",
    width=1500,
    height=800,
)
fig.show()

For chosen artists, shown as percentages

In [27]:
desired_arists = [
    "anthrax",
    "ed_sheeran",
    "ray_charles",
    "mozart",
    "metallica",
    "billie_eilish",
    "pinkfloyd",
    "taylor_swift",
    "kanye_west",
]

# Long table (personality, artist_name, count) rebuilt from the wide counts.
df_melted = df_result.melt(
    id_vars="personality", var_name="artist_name", value_name="count"
).sort_values(by="count", ascending=False)

# Take an explicit copy of the filtered rows: columns are added below, and
# assigning onto a .loc slice raises SettingWithCopyWarning.
df_melted = df_melted.loc[df_melted["artist_name"].isin(desired_arists)].copy()
total_counts = df_melted.groupby("artist_name")["count"].sum()

# Share of each personality type within an artist's authors, in percent.
df_melted["percentage"] = (
    df_melted["count"] / df_melted["artist_name"].map(total_counts) * 100
)

# channel -> genre lookup built from the per-author table. When a channel
# appears with several genres, the last one encountered wins.
genre_mapping = df_author_mean_personality_genre.set_index("channel")["genre"].to_dict()

df_melted["genre"] = df_melted["artist_name"].map(genre_mapping)
df_melted = df_melted.sort_values(by="percentage", ascending=False)

fig = px.bar(
    df_melted,
    x="artist_name",
    y="percentage",
    color="personality",
    labels={
        "percentage": "Percentage",
        "artist_name": "Artist Name",
        "personality": "Personality Type",
    },
    title="Personality Types Distribution by Artist",
    # Artists are laid out alphabetically on the x-axis.
    category_orders={"artist_name": sorted(df_melted["artist_name"].unique())},
    height=500,
    width=1500,
)
fig.show()

For a chosen genre (all artists), shown as percentages

In [26]:
# Long table (personality, artist_name, count), largest counts first.
df_melted = df_result.melt(
    id_vars="personality", var_name="artist_name", value_name="count"
)
df_melted = df_melted.sort_values(by="count", ascending=False)

# Share of each personality type within an artist's authors, in percent.
total_counts = df_melted.groupby("artist_name")["count"].sum()
artist_totals = df_melted["artist_name"].map(total_counts)
df_melted["percentage"] = df_melted["count"] / artist_totals * 100

# channel -> genre lookup taken from the per-author table.
genre_mapping = df_author_mean_personality_genre.set_index("channel")["genre"].to_dict()
df_melted["genre"] = df_melted["artist_name"].map(genre_mapping)
df_melted = df_melted.sort_values(by="percentage", ascending=False)

# Restrict the plot to the artists of a single genre.
chosen_genre = "pop"
genre_df = df_melted.loc[df_melted["genre"] == chosen_genre]

# Artists are laid out alphabetically on the x-axis.
artist_order = sorted(genre_df["artist_name"].unique())

# Stacked bars: one bar per artist, coloured by personality type.
fig = px.bar(
    genre_df,
    x="artist_name",
    y="percentage",
    color="personality",
    labels={
        "percentage": "Percentage",
        "artist_name": "Artist Name",
        "personality": "Personality Type",
    },
    title=f"Personality Types Distribution for {chosen_genre} Genre",
    category_orders={"artist_name": artist_order},
)

fig.show()

## Emo plots ##

In [28]:
# Preview the first rows of the frame used by the emotion plots below.
data.head()

Unnamed: 0,comment_id,author,date,comment,video_id,is_reply,parent_id,channel,genre,emoji,emotion,mbti,I,N,T,J
34,UgwdwMoZyivskemlInd4AaABAg.9w_8W7ei0Xx9wcvEWc7Pz3,Christian Manifestation Channel,2023-11-02 23:49:23,I 39 m always a die hard fan of Bon since in...,lDK9QqIzhwk,True,UgwdwMoZyivskemlInd4AaABAg,bonjovi,metal,"['😊', '😍', '🤗', '😍', '😘', '😘']",approval,"{'type': ['E', 'S', 'F', 'J'], 'preds': [0.935...",0.935273,0.860599,0.809209,0.407116
37,Ugzj3o4e0uPwdQfjSwV4AaABAg.9wVkpgboc0h9x0T8O7VEkb,Gary Carr,2023-11-12 12:36:09,One of my song of all time It ’ s just simpl...,lDK9QqIzhwk,True,Ugzj3o4e0uPwdQfjSwV4AaABAg,bonjovi,metal,['❤️'],love,"{'type': ['E', 'S', 'F', 'P'], 'preds': [0.955...",0.955575,0.858568,0.823778,0.594128
50,UgwCY7MWnT7m-fTfeNV4AaABAg.9w5GGawcewi9w5IX7pChaJ,Dante,2023-10-20 13:08:13,so far 1969 The original by Shocking Bl...,lDK9QqIzhwk,True,UgwCY7MWnT7m-fTfeNV4AaABAg,bonjovi,metal,"[': (TW:', ': The original Venus by Shocking B...",neutral,"{'type': ['I', 'N', 'T', 'P'], 'preds': [0.294...",0.294976,0.291326,0.470297,0.59786
51,UgwCY7MWnT7m-fTfeNV4AaABAg.9w5GGawcewi9w5xBJHk8gI,goober,2023-10-20 19:12:14,so far 1969 The original by Shocking Bl...,lDK9QqIzhwk,True,UgwCY7MWnT7m-fTfeNV4AaABAg,bonjovi,metal,"[': (TW:', ': The original Venus by Shocking B...",neutral,"{'type': ['I', 'N', 'T', 'P'], 'preds': [0.294...",0.294976,0.291326,0.470297,0.59786
54,Ugy1N4k_CAomaQ3VbBd4AaABAg.9w0i2Alm8cG9x0TFR6PG51,Gary Carr,2023-11-12 12:37:07,this be one of the best bon ever i wish more ...,lDK9QqIzhwk,True,Ugy1N4k_CAomaQ3VbBd4AaABAg,bonjovi,metal,"['🎸', '🎸']",admiration,"{'type': ['E', 'S', 'F', 'P'], 'preds': [0.890...",0.890215,0.773742,0.897463,0.508214


Emotions as x-axis, comparison of certain emotions between particular artists

In [29]:
import plotly.express as px

desired_emotions = ["sadness", "love", "admiration", "fear", "remorse"]
desired_arists = [
    "ed_sheeran",
    "metallica",
    "billie_eilish",
    "pinkfloyd",
    "taylor_swift",
    "kanye_west",
]

# Number of rows for every (emotion, channel) pair.
grouped = data.groupby(["emotion", "channel"]).size().reset_index(name="count")

# Apply both filters with one mask and take a copy: a column is added below,
# and assigning onto a .loc slice raises SettingWithCopyWarning.
selected = grouped["emotion"].isin(desired_emotions) & grouped["channel"].isin(
    desired_arists
)
grouped = grouped.loc[selected].copy()

# Percentages are computed per emotion: each value is one artist's share of
# that emotion's rows among the selected artists.
source_total = grouped.groupby("emotion")["count"].transform("sum")
grouped["percentage"] = (grouped["count"] / source_total) * 100
grouped = grouped.sort_values(by="percentage", ascending=False)

fig = px.histogram(
    grouped,
    x="emotion",
    y="percentage",
    color="channel",
    barmode="group",
    height=500,
    width=1400,
)
# NOTE(review): only title styling is set here and no title text is given
# anywhere in this cell - confirm whether a title was meant to be shown.
fig.update_layout(title=dict(font=dict(size=100), automargin=True, yref="paper"))
fig.show()

By genre, comparison of emotions for each artist

In [31]:
# The original note here says that some artists had NaN genres. The genre is
# filled BEFORE grouping: groupby drops rows whose key is missing, so filling
# the aggregated table afterwards could never affect them.
grouped = (
    data.assign(genre=data["genre"].fillna("hip_hop_rap"))
    .groupby(["genre", "channel", "emotion"])
    .size()
    .reset_index(name="count")
)

# Manual genre override for a fixed set of channels.
rock_channels = ["thebeatles", "systemofadown", "pinkfloyd", "bonjovi"]
grouped.loc[grouped["channel"].isin(rock_channels), "genre"] = "rock"

# Give every emotion one fixed colour so that it looks the same in all genre
# figures. Pastel2 alone has only 8 colours, so more qualitative palettes are
# appended, and the palette is cycled if there are still more emotions.
emotions = grouped["emotion"].unique().tolist()
colors = (
    px.colors.qualitative.Pastel2
    + px.colors.qualitative.Pastel1
    + px.colors.qualitative.Set3
)
color_map = {
    emo: colors[position % len(colors)] for position, emo in enumerate(emotions)
}

# NOTE(review): after the genre override two rows can share the same keys;
# drop_duplicates only removes them when their counts are also equal -
# confirm whether summing such rows was intended instead.
grouped = grouped.drop_duplicates()

# Copy after filtering: a column is added below, and assigning onto a .loc
# slice raises SettingWithCopyWarning.
grouped = grouped.loc[grouped["emotion"] != "neutral"].copy()
# in case we want to get rid of the most frequent emotions
# grouped=grouped.loc[grouped['emotion']!="neutral" ]
# grouped=grouped.loc[grouped['emotion']!="admiration" ]
# grouped=grouped.loc[grouped['emotion']!="love" ]

# Share of each emotion within a channel's remaining rows, in percent.
source_total = grouped.groupby("channel")["count"].transform("sum")

grouped["percentage"] = (grouped["count"] / source_total) * 100
grouped = grouped.sort_values(by="percentage", ascending=False)

genres = grouped["genre"].unique().tolist()

# One figure per genre: artists on the x-axis, one bar per emotion.
for genre in genres:
    grouped_genre = grouped.loc[grouped["genre"] == genre]
    fig = px.histogram(
        grouped_genre,
        x="channel",
        y="percentage",
        color="emotion",
        barmode="group",
        height=400,
        color_discrete_map=color_map,
    )
    fig.update_layout(
        title=genre, xaxis_title="Artist", yaxis_title="Percentage of comments"
    )
    fig.show()