In [1]:
import polars as pl

In [2]:
df = pl.read_csv("HateSpeechDatasetBalanced.csv")
df = df.unique()
df.head()

Content,Label
str,i64
"""still shove that little barn r…",1
"""the guy next to me just said t…",0
"""tell everyone i have zero hatr…",0
"""okay thanks for writing an awe…",0
"""so happy as to not be born hom…",1


In [3]:
text = df.with_columns(pl.col('Content').str.to_lowercase())

type(text)

polars.dataframe.frame.DataFrame

In [5]:
import polars as pl
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from typing import List
import time

stop_words = set(stopwords.words('english'))
exception_words = {'no', 'not', 'never'}
filtered_stopwords = stop_words - exception_words
lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
    return [lemmatizer.lemmatize(word, pos="v") for word in text]

def remove_stopwords(text: str):
    return [word for word in text if word not in filtered_stopwords]


def preprocess(df: pl.DataFrame) -> pl.DataFrame:
    start = time.time()
    xdf = df.lazy().with_columns(pl.col("Content").str.to_lowercase().alias("Content").str.split(by=' ').alias("Tokens_Content"))

    xdf = xdf.with_columns(
        pl.col("Tokens_Content").map_elements(
            lambda batch: [
                lemmatizer.lemmatize(word, pos="v") 
                for word in batch 
                if word not in filtered_stopwords and not word.isdigit()
            ],
            return_dtype=pl.List(pl.Utf8)
        ).alias("Processed_Content")).collect()
    end = time.time()
    print(f"Time taken : {(end-start):.3f}")
    return xdf

In [6]:
processed_df = preprocess(df)

Time taken : 163.409


In [7]:
sentences = processed_df['Processed_Content'].to_list()
sentences

[['still',
  'shove',
  'little',
  'barn',
  'rock',
  'star',
  'straight',
  'ass',
  'faggot'],
 ['guy', 'next', 'say', 'jews', 'turn', 'soap', 'wtf'],
 ['tell',
  'everyone',
  'zero',
  'hatred',
  'toward',
  'muslims',
  'shame',
  'think',
  'lie'],
 ['okay',
  'thank',
  'write',
  'awesome',
  'article',
  'really',
  'impress',
  'do',
  'finish',
  'read',
  'change',
  'couple',
  'minor',
  'grammatical',
  'word',
  'things',
  'leave',
  'detail',
  'edit',
  'summaries',
  'check',
  'edit',
  'make',
  'sure',
  'not',
  'wreck',
  'anything',
  'happy',
  'promote',
  'article',
  'congratulations',
  'advance'],
 ['happy', 'not', 'bear', 'homosexual'],
 ['jako',
  'normal',
  'political',
  'medicine',
  'overdose',
  'nas',
  'svi',
  'make',
  'idiot',
  'niece'],
 ['young', 'buck', 'want', 'eat', 'nigguh', 'like', 'not', 'fuck'],
 ['thetrapdoor',
  'org',
  'revert',
  'link',
  'slut',
  'remove',
  'another',
  'editor',
  'recently',
  'little',
  'spam',
  '

In [8]:
# from gensim.models import Word2Vec

# model = Word2Vec(sentences, vector_size=300, window=8, min_count=3, workers=4)
# model.save("word2vec.model")

In [None]:
from gensim.models import Word2Vec
import numpy as np
model = Word2Vec.load("word2vec.model")

def embed_word(text):
    vector_size = model.vector_size
    embeddings = [
        (model.wv[word].astype(np.float32) if word in model.wv else np.zeros(vector_size, dtype=np.float32))
        for word in text
    ]
    
    return embeddings

vs = time.time()
vector_df = processed_df.lazy().with_columns(pl.col("Processed_Content").map_elements(embed_word, return_dtype=pl.List(pl.Array(pl.Float32, 300))).alias("Vector_Content")).collect()
ve = time.time()
print(f"Time taken to train Word2Vec : {(ve-vs):.3f}")

In [23]:
df_sub = processed_df.slice(1,5)
df_sub 

Content,Label,Tokens_Content,Processed_Content
str,i64,list[str],list[str]
"""the guy next to me just said t…",0,"[""the"", ""guy"", … ""wtf""]","[""guy"", ""next"", … ""wtf""]"
"""tell everyone i have zero hatr…",0,"[""tell"", ""everyone"", … ""lying""]","[""tell"", ""everyone"", … ""lie""]"
"""okay thanks for writing an awe…",0,"[""okay"", ""thanks"", … ""advance""]","[""okay"", ""thank"", … ""advance""]"
"""so happy as to not be born hom…",1,"[""so"", ""happy"", … ""homosexual""]","[""happy"", ""not"", … ""homosexual""]"
"""jako who so normal had politic…",1,"[""jako"", ""who"", … ""niece""]","[""jako"", ""normal"", … ""niece""]"


In [39]:
vector_df_sub = df_sub.with_columns(pl.col("Processed_Content").map_elements(embed_word, return_dtype=pl.List(pl.Array(pl.Float32, 300))).alias("Vector Content"))

In [40]:
vector_df_sub

Content,Label,Tokens_Content,Processed_Content,Vector Content
str,i64,list[str],list[str],"list[array[f32, 300]]"
"""the guy next to me just said t…",0,"[""the"", ""guy"", … ""wtf""]","[""guy"", ""next"", … ""wtf""]","[[-1.667088, 0.478226, … -1.366715], [0.385363, 1.672415, … 1.485797], … [-0.456617, -0.013698, … -0.12784]]"
"""tell everyone i have zero hatr…",0,"[""tell"", ""everyone"", … ""lying""]","[""tell"", ""everyone"", … ""lie""]","[[-0.398947, 1.735775, … -0.196077], [-0.212752, 0.054329, … -1.450807], … [-0.702043, -0.777539, … -0.717452]]"
"""okay thanks for writing an awe…",0,"[""okay"", ""thanks"", … ""advance""]","[""okay"", ""thank"", … ""advance""]","[[-0.149784, 0.012655, … -0.11267], [-1.848946, 1.665252, … -0.108775], … [0.220925, 0.291135, … 0.435067]]"
"""so happy as to not be born hom…",1,"[""so"", ""happy"", … ""homosexual""]","[""happy"", ""not"", … ""homosexual""]","[[-0.241004, 1.16751, … -0.413106], [-0.718526, -0.147325, … -0.672965], … [-0.508362, 0.267781, … -0.942411]]"
"""jako who so normal had politic…",1,"[""jako"", ""who"", … ""niece""]","[""jako"", ""normal"", … ""niece""]","[[0.084353, 0.120467, … 0.073375], [-2.171593, 1.819738, … 1.855632], … [0.192221, 0.124845, … 0.332259]]"


In [32]:
model.wv["shit"].shape

(300,)