In [1]:
import pandas as pd
import numpy as np

import re

import emoji
import nltk
import string

nltk.download("words")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

words = set(nltk.corpus.words.words())

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from sklearn import decomposition
from sklearn.manifold import Isomap, TSNE
from umap import UMAP

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from pathlib import Path

DIRECTORY = os.path.abspath("..")
DATA_DIR = os.path.join(DIRECTORY, "data")
DATA_DIR

'c:\\Users\\julia\\Desktop\\sem2\\analiza_mediów\\2023-zadanie-opis-projektu-ziemniaki_rosomaki\\data'

In [5]:
combined = pd.DataFrame()
for filename in os.listdir(DATA_DIR):
    if filename != ".gitkeep" and 'combined' not in filename and 'embeddings' not in filename and 'folder' not in filename:
        print(filename)
        f = os.path.join(DATA_DIR, filename)
        df = pd.read_csv(f)
        combined = pd.concat([combined, df], ignore_index=True)

BonJovi.csv
Kaleo.csv
Kirk Fletcher.csv
MarcusKing.csv
Megadeth.csv
Megadeth2.csv
Megadeth3.csv
Megadeth4.csv
Megadeth5.csv
Megadeth6.csv
PinkFloyd.csv
RayCharles.csv
SystemOfADown.csv
TheBeatles.csv
TylerBryant&theShakedown.csv


In [18]:
grouped_channel=combined.groupby(['genre','channel']).size()
grouped_channel

genre  channel                 
blues  kaleo                       36050
       kirk fletcher                8580
       marcusking                   5591
       raycharles                   2858
       tylerbryant&theshakedown     2471
metal  bonjovi                      7444
       megadeth                    24627
       pinkfloyd                   37842
       systemofadown               36475
       thebeatles                  49334
dtype: int64

In [9]:
combined["channel"]=combined["channel"].map(lambda x: (re.sub(r'[0-9]+', '', x)).lower()) 
combined.drop_duplicates(inplace=True)

In [10]:
combined.info()


<class 'pandas.core.frame.DataFrame'>
Index: 367132 entries, 0 to 372752
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   comment_id  367132 non-null  object
 1   author      367114 non-null  object
 2   date        367132 non-null  object
 3   comment     367107 non-null  object
 4   video_id    367132 non-null  object
 5   is_reply    367132 non-null  object
 6   parent_id   66098 non-null   object
 7   channel     367132 non-null  object
 8   genre       367132 non-null  object
dtypes: object(9)
memory usage: 28.0+ MB


In [28]:
def group_and_clear(data, comments_threshold=4):
    """Clears data from duplicates and authors with less than threshold comments"""
    data.drop_duplicates(inplace=True)
    original_length = len(data)
    data_grouped = (
        data.groupby("author")
        .size()
        .reset_index(name="count")
        .sort_values(by="count", ascending=False)
    )
    data_grouped = data_grouped.loc[data_grouped["count"] >= comments_threshold]
    authors = data_grouped["author"].tolist()
    data = data.loc[data["author"].isin(authors)]
    print(f"Original length:{original_length}, after cleanup: {len(data)}")
    return data

In [29]:
# combined = group_and_clear(combined)

Original length:5220547, after cleanup: 2177322


In [11]:
def save_emojis(text):
    # TODO: save emojis in a separate column
    text = emoji.demojize(text)
    text = re.findall(r"(:[^:]*:)", text)
    list_emoji = [emoji.emojize(x) for x in text]
    return list_emoji

In [12]:
combined["emoji"] = combined["comment"].map(lambda x: save_emojis(str(x)))

In [13]:
def cleaner(post):
    post = re.sub("@[A-Za-z0-9]+", "", post)  # Remove @ sign
    post = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", post)
    post = re.sub(r"'<.*?>'", "", post)  # Remove html tags
    post = emoji.replace_emoji(post, replace="")
    post = post.replace("#", "").replace(
        "_", " "
    )  # Remove hashtag sign but keep the text
    post = " ".join(
        w
        for w in nltk.wordpunct_tokenize(post)
        if w.lower() in words or not w.isalpha()
    )
    regex = re.compile("[%s]" % re.escape(string.punctuation))
    post = regex.sub("", post)
    return post

In [14]:
combined["comment"] = combined["comment"].map(lambda x: cleaner(str(x)))

In [15]:
def sentence_length(data, threshold=5):
    """Removes posts with less than threshold of words"""
    original_length = len(data)
    data["comment"] = data["comment"].map(
        lambda x: x if len(x.split()) >= threshold else ""
    )
    data = data.loc[data["comment"] != ""]
    print(f"Original length:{original_length}, after cleanup: {len(data)}")
    return data

In [16]:
combined = sentence_length(combined)

Original length:367132, after cleanup: 211272


In [17]:
combined.to_csv(os.path.join(DATA_DIR, "combined.csv"), index=False)
combined.to_feather(os.path.join(DATA_DIR, "combined.feather"))

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(
    "papluca/xlm-roberta-base-language-detection", max_length=512
)
model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(
    "papluca/xlm-roberta-base-language-detection", max_length=512
)


def lang_detector(text):
    """Detects language of the post"""
    # TO DO : FIX ERROR WITH TOKENIZER LENGTH (too long sequences)
    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    inputs.to("cuda")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    lang = model.config.id2label[predicted_class_id]
    return lang

In [20]:
combined["language"] = combined["comment"].map(lambda x: lang_detector(str(x)))

In [21]:
def clear_lang(data):
    """Deletes posts that are not in English"""
    original_length = len(data)
    data = data.loc[data["language"] == "en"]
    data.drop(columns=["language"], inplace=True)
    print(f"Original length:{original_length}, after cleanup: {len(data)}")
    return data

In [22]:
combined = clear_lang(combined)

Original length:211272, after cleanup: 191347


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=["language"], inplace=True)


In [23]:
grouped_genre=combined.groupby(['genre']).size().sort_values(ascending=False)
grouped_genre

genre
metal    139584
blues     51763
dtype: int64

In [24]:
# le = LabelEncoder()
# combined["author_label"] = le.fit_transform(combined["channel"])
# combined["genre_label"] = le.fit_transform(combined["genre"])
grouped_channel=combined.groupby(['genre','channel']).size()
grouped_channel

genre  channel                 
blues  kaleo                       32700
       kirk fletcher                8480
       marcusking                   5497
       raycharles                   2657
       tylerbryant&theshakedown     2429
metal  bonjovi                      6164
       megadeth                    22720
       pinkfloyd                   35602
       systemofadown               31358
       thebeatles                  43740
dtype: int64

In [35]:
combined.to_csv(os.path.join(DATA_DIR, "combined_final.csv"), index=False, encoding='utf-8')
# combined.to_feather(os.path.join(DATA_DIR, "combined_final.feather"))

In [36]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
sentences = combined["comment"].values.tolist()
embeddings = model.encode(sentences)

In [37]:
with open(os.path.join(DATA_DIR,'embeddings_final.npy'), 'wb') as f:
    np.save(f, embeddings)

In [32]:
grouped_channel=combined.groupby(['genre','channel']).size()
grouped_channel


2320602
2300677


In [3]:
# with open(os.path.join(DATA_DIR,'embeddings_final.npy'), 'rb') as f:
#     embeddings = np.load(f)
# combined=pd.read_csv(os.path.join(DATA_DIR, "combined_final.csv"), encoding='utf-8')