Start

In [None]:
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
from loguru import logger

In [None]:
# Hugging Face model id of an alternative multilingual sentiment checkpoint.
# NOTE(review): `tag` is not referenced by any other cell visible in this notebook;
# the pipeline cell that follows is built from a different, hard-coded model id.
tag = "nlptown/bert-base-multilingual-uncased-sentiment"

In [None]:
from transformers import pipeline

# models are downloaded to ~/.cache/huggingface/hub.
# you might want to clean up that location after you are done with the models
#
# Build the sentiment model that the scoring cells below call as `model(text)`.
# Only a model id is given (no explicit task name).
model = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    # presumably asks for the score of every label instead of only the best one -- TODO confirm
    top_k=None,
    truncation=True,  # Truncate long inputs automatically
    max_length=512,
)

# english
# Smoke test on one English sentence; being the last expression of the cell,
# its result is what the cell displays.
model("I love this movie and i would watch it again and again!")

In [None]:
import tomllib

# Load the project configuration from the parent directory and derive the path of
# the preprocessed parquet file from its "processed" and "wife_file" entries.
configfile = Path("../config.toml").resolve()
with configfile.open("rb") as f:
    config = tomllib.load(f)
datafile = (Path("..") / Path(config["processed"]) / config["wife_file"]).resolve()
if not datafile.exists():
    logger.warning(
        "Datafile does not exist. First run src/preprocess.py, and check the timestamp!"
    )
    # Stop here with an explicit error that names the path, rather than letting
    # read_parquet fail on the missing file a line later.
    raise FileNotFoundError(f"Datafile does not exist: {datafile}")
df = pd.read_parquet(datafile)
# Replace missing values with empty strings so later string operations do not hit NaN.
df = df.fillna("")
# Keep only rows from 2023 onwards. `.copy()` makes the result an independent frame,
# so cells that add columns to `df` later do not write into a slice of the loaded data.
df = df[df['timestamp'] >= '2023-01-01'].copy()
df.head()

In [None]:
from dataclasses import dataclass


@dataclass
class Position:
    name: str
    alltext: str
    chunked: list[str]
    sentiment: dict = None


# Collect every author's messages into a single Position per author.
# Iterating the two columns directly avoids building a Series per row with iterrows().
datadict = {}
for position, message in tqdm(zip(df["author"], df["message"]), total=len(df)):
    if position not in datadict:
        datadict[position] = Position(name=position, alltext=message, chunked=[])
    else:
        # Separate messages with a space: plain concatenation would fuse the last
        # word of one message with the first word of the next, and the chunking
        # step splits on whitespace.
        datadict[position].alltext += " " + message


def split_into_chunks(text, chunk_size=512):
    """Cut `text` into consecutive pieces of at most `chunk_size` words.

    Words are whatever `str.split()` yields, so any run of whitespace counts as
    one separator and each piece is re-joined with single spaces.

    Args:
        text: The string to cut up.
        chunk_size: Maximum number of words per piece.

    Returns:
        A list of strings in original order; empty when `text` has no words.
    """
    tokens = text.split()
    pieces = []
    # Step through the word list one window at a time.
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start : start + chunk_size]
        pieces.append(" ".join(window))
    return pieces


# For every author, split their combined text into chunks the model can read in full.
# The sentiment pipeline truncates its input at 512 *tokens*, while split_into_chunks
# counts whitespace-separated *words*. A word can presumably expand into several
# tokens, so we guess about 50% of the token budget (256 words); with 512-word
# chunks the tail of a chunk could be cut off before it is scored.
for author in datadict:
    # Get the combined text for the author
    text = datadict[author].alltext
    # Split the text into chunks
    datadict[author].chunked = split_into_chunks(text, chunk_size=256)

In [None]:
def fill_sentiment(model, author: Position) -> Position:
    """Score every chunk of `author` and store the scores grouped by label.

    Args:
        model: Callable applied to one chunk at a time. The first element of its
            result must be an iterable of mappings with "label" and "score" keys.
        author: Object whose `chunked` texts are scored; its `sentiment`
            attribute is overwritten.

    Returns:
        The same `author` object, with `sentiment` set to a dict that maps each
        label to the list of its scores (rounded to 4 decimals), in chunk order.
    """
    # First pass: run the model over every chunk and keep the first result of each call.
    per_chunk = []
    for chunk in author.chunked:
        per_chunk.append(model(chunk)[0])

    # Second pass: regroup the per-chunk entries into one score list per label.
    scores_by_label = {}
    for predictions in per_chunk:
        for prediction in predictions:
            name = prediction["label"]
            value = round(prediction["score"], 4)
            scores_by_label.setdefault(name, []).append(value)

    author.sentiment = scores_by_label
    return author


# Score every author. This is deliberately best-effort: a failure for one author is
# logged and the loop continues, which leaves that author's `sentiment` unset.
for key, item in tqdm(datadict.items(), total=len(datadict)):
    try:
        # Re-assigning an existing key does not change the dict's size, so it is
        # safe to do while iterating over items().
        datadict[key] = fill_sentiment(model, item)
    except Exception as e:
        # `key` is the author name, not a message, so the log says so.
        logger.warning(f"Failed to process author {key}")
        logger.warning(f"Error: {e}")

In [None]:
# calculate collective positive average
total = []
for item in datadict.values():
    # The scoring loop tolerates failures, so an author may have no sentiment at
    # all (None) or no "positive" scores (no chunks); skip those instead of crashing.
    if not item.sentiment:
        continue
    total.extend(item.sentiment.get("positive", []))
# Mean over every positive score of every author; NaN when nothing was scored.
avg = np.mean(total) if total else float("nan")

In [None]:
# Strip plot of per-chunk scores for one sentiment label, one column per author.
mood = "negative"
# only keep authors with at least this many scored chunks
min_chunks = 5

long_format = []
all_scores = []
for key, item in datadict.items():
    # Authors whose scoring failed have no sentiment; treat them as having no scores.
    scores = (item.sentiment or {}).get(mood, [])
    all_scores.extend(scores)
    if len(scores) < min_chunks:
        continue
    long_format.extend({"name": key, mood: val} for val in scores)

# Naming the columns explicitly keeps sort_values working when no author qualifies.
long_df = pd.DataFrame(long_format, columns=["name", mood]).sort_values(
    by=mood, ascending=False
)

sns.stripplot(x="name", y=mood, data=long_df, jitter=True, alpha=0.5)
# Reference line: the collective average of the *plotted* mood over all authors,
# so the line is comparable with the points regardless of which mood is selected.
if all_scores:
    plt.axhline(np.mean(all_scores), color="red", linestyle="--")
plt.xticks(rotation=90, ha="center")
plt.title(f"Sentiment Analysis: {mood} sentiment");

In [None]:
def agg(x):
    """Summarise the positive and negative score lists of one author.

    Args:
        x: Object whose `sentiment` mapping has "positive" and "negative" lists.

    Returns:
        A 7-tuple: number of positive scores, then mean, median and standard
        deviation of the positive scores, then the same three for the negative ones.
    """
    positive = x.sentiment["positive"]
    negative = x.sentiment["negative"]

    summary = [len(positive)]
    # Same three statistics for each label, positive first.
    for scores in (positive, negative):
        summary.append(np.mean(scores))
        summary.append(np.median(scores))
        summary.append(np.std(scores))
    return tuple(summary)


# One summary tuple per author: the name followed by the statistics from agg().
# Authors without any scores (scoring failed, or they had no chunks) are skipped,
# because agg() needs both score lists to exist.
aggregated = [(item.name, *agg(item)) for item in datadict.values() if item.sentiment]

In [None]:
# Tabulate the per-author summary tuples: identification first, then the three
# statistics of the positive scores, then the three of the negative scores.
df_sentiment = pd.DataFrame(
    data=aggregated,
    columns=[
        "author", "count",
        "mean_pos", "median_pos", "std_pos",
        "mean_neg", "median_neg", "std_neg",
    ],
)
df_sentiment.head()

In [None]:
# Column suffix selecting which statistics are plotted ("mean" + key, "std" + key).
key = "_pos"
# Authors need strictly more than this many scored chunks to be shown.
min_count = 2
# Filter by count, then order authors by the selected mean, highest first.
df_filtered = df_sentiment[df_sentiment["count"] > min_count].sort_values(
    "mean" + key, ascending=False
)
plt.figure(figsize=(10, 6))
sns.set(style="whitegrid")

# One marker per author at the selected mean; seaborn's own error bars are
# switched off (errorbar=None) because they are drawn manually below.
sns.pointplot(
    data=df_filtered,
    x="author",
    y="mean" + key,
    linestyle="none",
    capsize=0.2,
    err_kws={"linewidth": 1},
    errorbar=None,
    color="blue",
)

# Overlay error bars of two standard deviations around each mean.
# NOTE(review): a label is set here but this cell never calls plt.legend(),
# so the label is not shown.
plt.errorbar(
    x=df_filtered["author"],
    y=df_filtered["mean" + key],
    yerr=df_filtered["std" + key] * 2,
    fmt="o",
    color="blue",
    ecolor="blue",
    elinewidth=1,
    capsize=4,
    label=f"{key} Mean with Error Bars",
)
# Horizontal reference line at a score of 0.5.
plt.axhline(y=0.5, color="red", linestyle="--", linewidth=1)
plt.xticks(rotation=45, ha="right");  # 'ha' stands for horizontal alignment

In [None]:
from sentence_transformers import SentenceTransformer

# Two toy sentences used to check that the embedding model loads and runs.
sentences = ["This is an example sentence", "Each sentence is converted"]

# NOTE(review): this rebinds `model`, which until here held the sentiment pipeline.
# Re-running the sentiment scoring cells after this cell would hand them the
# embedding model instead.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(sentences)
print(f"Embedding shape: {embeddings.shape}")

In [None]:
# Character length of every message. `assign` returns a new frame instead of
# writing a column into `df`, which was produced by filtering the loaded data.
df = df.assign(message_length=df["message"].str.len())
# Missing messages were filled with "" when the data was loaded, so some lengths
# are 0 and log(0) is -inf, which is not a usable histogram value. Plot the log
# of the strictly positive lengths only.
positive_lengths = df.loc[df["message_length"] > 0, "message_length"]
sns.histplot(x=np.log(positive_lengths));

In [None]:
# Keep only messages with log(length) > 3, i.e. longer than e**3 (about 20)
# characters. Comparing the length against exp(3) selects the same rows without
# taking the logarithm of zero-length messages.
subset = df[df["message_length"] > np.exp(3)].reset_index(drop=True)
# Show a preview rather than the whole frame.
subset.head()

In [None]:
from dataclasses import dataclass
import numpy as np


@dataclass
class Embedding:
    """Embedding vectors together with the metadata of the items they belong to.

    Both members are addressed with the same index: `emb[i]` pairs
    `vectors[i]` with `metadata[i]`.
    """

    # Per-item metadata, looked up with `metadata[idx]`.
    # NOTE(review): annotated as a list, but anything supporting len() and
    # indexing works with the methods below.
    metadata: list
    # Embedding vectors, looked up with `vectors[idx]`.
    vectors: np.ndarray

    def __len__(self) -> int:
        """Return the number of metadata entries."""
        return len(self.metadata)

    def __getitem__(self, idx: int) -> tuple:
        """Return the `(vector, metadata)` pair stored under `idx`."""
        vector = self.vectors[idx]
        info = self.metadata[idx]
        return vector, info

    def __repr__(self) -> str:
        """Short description showing only the shape of the vector array."""
        return f"Embedding, dims={self.vectors.shape}"

In [None]:
from tqdm import tqdm

# Walk over the filtered messages once, producing two aligned collections:
# `text` (the messages, in row order) and `metadata` (row index -> details of that row).
text = []
metadata = {}
for idx, row in tqdm(subset.iterrows(), total=len(subset)):
    record = {
        "author": row["author"],
        "message": row["message"],
        "timestamp": row["timestamp"],
    }
    metadata[idx] = record
    text.append(record["message"])

In [None]:
# Embed every collected message. `model` has to be the SentenceTransformer at this
# point; the sentiment pipeline created earlier in the notebook uses the same name.
vectors = model.encode(text)
# Display the shape of the result as the cell output.
vectors.shape

In [None]:
# Bundle the vectors with their metadata.
# NOTE(review): `metadata` is built as a dict keyed by row index, while the
# Embedding field is annotated as `list`; lookups by integer index work for both.
emb = Embedding(metadata, vectors)
emb

In [None]:
# Indexing an Embedding yields (vector, metadata); inspect the entry at index 1.
X, y = emb[1]
X.shape, y

In [None]:
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components and colour
# each point by author. The seed is fixed so a re-run gives the same picture
# even if scikit-learn picks a randomized solver for this input size.
pca = PCA(n_components=2, random_state=42)
X = pca.fit_transform(emb.vectors)
plt.figure(figsize=(10, 10))
# Author of every embedded message, in the same order as the vectors.
labels = [emb.metadata[i]["author"] for i in range(len(emb.metadata))]
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=labels)
plt.title("PCA projection of message embeddings")
# Put the legend outside the axes so it does not cover the points.
plt.legend(title="Author", bbox_to_anchor=(1.05, 1), loc="upper left");

In [None]:
from sklearn.manifold import TSNE

# Non-linear 2-D projection of the embeddings with t-SNE, coloured by author.
# t-SNE is stochastic: without a fixed seed every run produces a different
# layout, so the figure could not be reproduced.
tsne = TSNE(n_components=2, random_state=42)
X = tsne.fit_transform(emb.vectors)
plt.figure(figsize=(10, 10))
# Author of every embedded message, in the same order as the vectors.
labels = [emb.metadata[i]["author"] for i in range(len(emb.metadata))]
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=labels)
plt.title("t-SNE projection of message embeddings")
# Put the legend outside the axes so it does not cover the points.
plt.legend(title="Author", bbox_to_anchor=(1.05, 1), loc="upper left");