In [1]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import polars as pl
from gensim.models import Word2Vec
import json

with open(r"F:\LPU\Data\Datasets\Hate-Speech Classification\hpc_batch\keys.json", "r") as f:
    data = json.load(f)

connection_string = data.get("Connection_string")
container_name = "containerforregular"
csv_blob_name = "HPC_NEW/HateSpeechDatasetBalanced.csv"
model_blob_name = "HPC_NEW/word2vec.model"
vectors_blob_name = "HPC_NEW/word2vec.model.wv.vectors.npy"
syn1_blob_name = "HPC_NEW/word2vec.model.syn1neg.npy"
local_model_file = "downloaded_model.model"
local_vectors_file = "downloaded_model.model.wv.vectors.npy"
local_syn1_file = "downloaded_model.model.syn1neg.npy"

blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

def download_blob_to_dataframe(blob_name):
    blob_client = container_client.get_blob_client(blob_name)
    download_stream = blob_client.download_blob()
    df = pd.read_csv(download_stream)
    return df

df = download_blob_to_dataframe(csv_blob_name)
print("CSV DataFrame:")
print(df.head(3))

def download_blob_to_file(blob_name, local_file_path):
    blob_client = container_client.get_blob_client(blob_name)
    with open(local_file_path, "wb") as f:
        download_stream = blob_client.download_blob()
        f.write(download_stream.readall())

download_blob_to_file(model_blob_name, local_model_file)
download_blob_to_file(vectors_blob_name, local_vectors_file)
download_blob_to_file(syn1_blob_name, local_syn1_file)

model = Word2Vec.load(local_model_file)
print("Model loaded successfully!")

df = pl.from_pandas(df)

CSV DataFrame:
                                             Content  Label
0  denial of normal the con be asked to comment o...      1
1  just by being able to tweet this insufferable ...      1
2  that is retarded you too cute to be single tha...      1
Model loaded successfully!


In [2]:
text = df.with_columns(pl.col('Content').str.to_lowercase())

type(text)

polars.dataframe.frame.DataFrame

In [3]:
text

Content,Label
str,i64
"""denial of normal the con be as…",1
"""just by being able to tweet th…",1
"""that is retarded you too cute …",1
"""thought of a real badass mongo…",1
"""afro american basho""",1
…,…
"""i mute this telecasting and pl…",1
"""but hell yeah he s not a bache…",1
"""great video musician but s not…",1
"""not great pop video yeah he s …",1


In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from typing import List
import time
import nltk

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


stop_words = set(stopwords.words('english'))
exception_words = {'no', 'not', 'never'}
filtered_stopwords = stop_words - exception_words
lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
    words = text.split()
    for x in range(len(words)):
        words[x] = lemmatizer.lemmatize(words[x], pos='v')
    return " ".join(words)

def remove_stopwords(text: str):
    words = text.lower().split()
    new_lst = []
    for x in range(len(words)):
        if words[x] not in filtered_stopwords:
            new_lst.append(words[x])
    del text
    del words
    return " ".join(new_lst)


def preprocess(df: pl.DataFrame) -> pl.DataFrame:
    start = time.time()
    # xdf = df.lazy().with_columns(pl.col("Content").str.to_lowercase().alias("Content").str.split(by=' ').alias("Tokens_Content"))

    xdf = df.lazy().with_columns(
        pl.col("Content").map_elements(
            remove_stopwords, return_dtype=pl.Utf8
        ).alias("Stopwords_Content")).with_columns(
        pl.col("Stopwords_Content").map_elements(
            lemmatize_word, return_dtype=pl.Utf8
        ).alias("Lemmatized_Content")).collect()
    end = time.time()
    print(f"Time taken : {(end-start):.3f}")
    xdf.drop_in_place("Stopwords_Content")
    return xdf

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\SIDDHARTH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SIDDHARTH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SIDDHARTH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
processed_df = preprocess(df)

Time taken : 124.827


In [6]:
processed_df

Content,Label,Lemmatized_Content
str,i64,str
"""denial of normal the con be as…",1,"""denial normal con ask comment …"
"""just by being able to tweet th…",1,"""able tweet insufferable bullsh…"
"""that is retarded you too cute …",1,"""retard cute single life"""
"""thought of a real badass mongo…",1,"""think real badass mongol style…"
"""afro american basho""",1,"""afro american basho"""
…,…,…
"""i mute this telecasting and pl…",1,"""mute telecast play kanye west …"
"""but hell yeah he s not a bache…",1,"""hell yeah not bachelor loooooo…"
"""great video musician but s not…",1,"""great video musician not music…"
"""not great pop video yeah he s …",1,"""not great pop video yeah not p…"


In [7]:
import numpy as np

vector_size = model.vector_size
vocab = set(model.wv.index_to_key)

def embed_word(text, vector_size=vector_size, vocab=vocab):
    words = text.split()
    embeddings = [
        (model.wv[word].astype(np.float32) if word in vocab else np.zeros(vector_size, dtype=np.float32))
        for word in words
    ]
    del vector_size
    del vocab
    
    return np.mean(embeddings, axis=0)

In [8]:
import polars as pl

schema = {
    'Content': pl.Utf8,
    'Label': pl.Int64,
    'Lemmatized_Content' : pl.Utf8,
    'Vector_Content': (pl.List(pl.Float32))
}

initial_data = {
    'Content': ["sample text"],
    'Label': [1],
    'Lemmatized_Content' : ["Lemmatized this"],
    'Vector_Content': [embed_word("This is text")]
}

df2 = pl.DataFrame(initial_data)
print(df2.schema)

def r_arr(lst):
    arr = np.array(lst, dtype=np.float32)
    if arr.shape != (300,):
        raise ValueError(f"Expected shape (300,), got {arr.shape}")
    return arr.ravel().tolist()

struct_schema = pl.List(pl.Float32)
df2 = df2.with_columns(
    pl.col("Vector_Content").cast(struct_schema)
)

empty_df = df2.slice(0,0)
print(empty_df.schema)

Schema({'Content': String, 'Label': Int64, 'Lemmatized_Content': String, 'Vector_Content': Array(Float32, shape=(300,))})
Schema({'Content': String, 'Label': Int64, 'Lemmatized_Content': String, 'Vector_Content': List(Float32)})


In [110]:
import gc
gc.collect()

5549

In [12]:
def embed_word_fixed(text):
    embedded = embed_word(text)  # Your original embedding function
    # Convert to numpy array with explicit float32 type
    arr = np.array(embedded, dtype=np.float32)
    # Ensure it's exactly 300 dimensions
    if arr.shape != (300,):
        raise ValueError(f"Expected shape (300,), got {arr.shape}")
    # Return the raw array values as a fixed-size list
    return arr.ravel().tolist()

asd = processed_df.slice(0,10)
print(asd)
asd = (asd
    .with_columns(
        pl.col("Lemmatized_Content")
        .map_elements(embed_word_fixed, return_dtype=pl.List(pl.Float32))
        .alias("Vector_Content")
    )
)

asd

shape: (10, 3)
┌─────────────────────────────────┬───────┬─────────────────────────────────┐
│ Content                         ┆ Label ┆ Lemmatized_Content              │
│ ---                             ┆ ---   ┆ ---                             │
│ str                             ┆ i64   ┆ str                             │
╞═════════════════════════════════╪═══════╪═════════════════════════════════╡
│ denial of normal the con be as… ┆ 1     ┆ denial normal con ask comment … │
│ just by being able to tweet th… ┆ 1     ┆ able tweet insufferable bullsh… │
│ that is retarded you too cute … ┆ 1     ┆ retard cute single life         │
│ thought of a real badass mongo… ┆ 1     ┆ think real badass mongol style… │
│ afro american basho             ┆ 1     ┆ afro american basho             │
│ yeah retard haha                ┆ 1     ┆ yeah retard haha                │
│ the ching chong chung stuff     ┆ 1     ┆ ching chong chung stuff         │
│ the dead what a slut still war… ┆ 1     ┆ dead 

Content,Label,Lemmatized_Content,Vector_Content
str,i64,str,list[f32]
"""denial of normal the con be as…",1,"""denial normal con ask comment …","[-0.599104, 0.147318, … 0.10057]"
"""just by being able to tweet th…",1,"""able tweet insufferable bullsh…","[0.103355, -0.24982, … -0.522497]"
"""that is retarded you too cute …",1,"""retard cute single life""","[-0.758037, 0.818828, … -0.800236]"
"""thought of a real badass mongo…",1,"""think real badass mongol style…","[-0.223098, -0.119148, … -0.776917]"
"""afro american basho""",1,"""afro american basho""","[-0.515654, -0.600841, … -1.346117]"
"""yeah retard haha""",1,"""yeah retard haha""","[-1.344133, 1.072287, … -0.099435]"
"""the ching chong chung stuff""",1,"""ching chong chung stuff""","[-0.016628, 0.911098, … -0.302072]"
"""the dead what a slut still war…",1,"""dead slut still warm tweet slu…","[-0.091936, -0.212315, … -0.171015]"
"""let your tweets be harmless it…",1,"""let tweet harmless not affect …","[-0.487004, 0.735953, … -0.426335]"
"""these latinos who have a probl…",1,"""latinos problem immigration en…","[-0.129183, -0.144467, … -0.255399]"


In [13]:
import gc
import psutil
import polars as pl

def print_memory_usage(message=""):
    memory = psutil.virtual_memory().percent
    print(f"{message} - Memory Usage: {memory}%")

batch_size = len(processed_df) // 5

print_memory_usage("Before batch processing")

for i in range(5):
    print_memory_usage(f"Before batch {i+1}")

    cdf = processed_df.slice(batch_size*i, batch_size)
    x = (cdf.lazy()
         .with_columns(pl.col("Lemmatized_Content")
                       .map_elements(embed_word_fixed, return_dtype=(pl.List(pl.Float32)))
                       .alias("Vector_Content"))
         .collect())
    empty_df.vstack(x, in_place=True)
    print_memory_usage(f"After processing batch {i+1}")
    del cdf
    del x
    gc.collect()

    print_memory_usage(f"After garbage collection for batch {i+1}")
    print(f'Round {i+1} processed')
gc.collect()
print_memory_usage("After all batches processed")


if len(empty_df) < len(processed_df):
    cdf = processed_df.slice(batch_size * 5, len(processed_df) % 5)
    x = (cdf.lazy()
         .with_columns(pl.col("Lemmatized_Content")
                       .map_elements(embed_word_fixed, return_dtype=(pl.List(pl.Float32)))
                       .alias("Vector_Content"))
         .collect())

    empty_df.vstack(x, in_place=True) 

Before batch processing - Memory Usage: 50.9%
Before batch 1 - Memory Usage: 50.9%


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


After processing batch 1 - Memory Usage: 51.7%
After garbage collection for batch 1 - Memory Usage: 51.7%
Round 1 processed
Before batch 2 - Memory Usage: 51.7%


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


After processing batch 2 - Memory Usage: 52.4%
After garbage collection for batch 2 - Memory Usage: 52.5%
Round 2 processed
Before batch 3 - Memory Usage: 52.5%


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


After processing batch 3 - Memory Usage: 53.5%
After garbage collection for batch 3 - Memory Usage: 53.6%
Round 3 processed
Before batch 4 - Memory Usage: 53.6%


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


After processing batch 4 - Memory Usage: 55.1%
After garbage collection for batch 4 - Memory Usage: 55.1%
Round 4 processed
Before batch 5 - Memory Usage: 55.1%


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


After processing batch 5 - Memory Usage: 56.2%
After garbage collection for batch 5 - Memory Usage: 56.2%
Round 5 processed
After all batches processed - Memory Usage: 56.2%


In [19]:
empty_df["Vector_Content"][0].shape

(300,)

In [65]:
print(type(1))

<class 'int'>


In [104]:
def check_null(lst):
    lst = lst.to_list()
    return 1 if np.sum(lst, axis=0) == 0 else 0

empty_df = empty_df.with_columns(pl.col("Vector_Content").map_elements(check_null, return_dtype=pl.Int32).alias("is_null"))
empty_df = empty_df.filter(pl.col("is_null") == 0)

In [105]:
empty_df

Label,Vector_Content,is_null
i64,list[f32],i32
1,"[-0.599104, 0.147318, … 0.10057]",0
1,"[0.103355, -0.24982, … -0.522497]",0
1,"[-0.758037, 0.818828, … -0.800236]",0
1,"[-0.223098, -0.119148, … -0.776917]",0
1,"[-0.515654, -0.600841, … -1.346117]",0
…,…,…
1,"[-0.132157, 0.292088, … -0.136933]",0
1,"[-0.178919, 0.448879, … -0.381659]",0
1,"[0.014921, 0.239474, … -0.011825]",0
1,"[-0.496958, 0.37257, … -0.330948]",0


In [106]:
final_df = empty_df.select(pl.col(["Vector_Content", "Label"]))

In [109]:
import pathlib

path : pathlib.Path = r"F:\LPU\Data\Datasets\Hate-Speech Classification\hpc_batch\final_df\vectorized_df_v1.parquet"

final_df.write_parquet(path)    

In [None]:
from azure.storage.blob import BlobServiceClient
import json
import pathlib

with open("keys.json", "r") as f:
    data = json.load(f)
    
connection_string = data.get("Connection_string")
container_name = "containerforregular/Parquet Files"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

import os
local_file_path : pathlib.Path = "embeddings_v1.parquet"
blob_name = os.path.basename(local_file_path)

blob_client = container_client.get_blob_client(blob_name)

with open(local_file_path, "rb") as data:
    blob_client.upload_blob(data, overwrite=False)

print(f"File {blob_name} uploaded to container {container_name}.")

File vectorized_df.parquet uploaded to container containerforregular/Parquet Files.
