In [42]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModel
import torch
from pinecone import Pinecone, ServerlessSpec
from nltk.corpus import stopwords
import nltk

### 1. Load Datasets

In [43]:
# Load the dataset
file_path = 'data/hate_speech_labeled_data.csv'
df = pd.read_csv(file_path)

print(df.head())
print(df.info())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24

### 2. Preprocessing

In [44]:
def preprocess_text(text):
    # Remove special characters, extra spaces, and lowercase the text
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

df['processed_text'] = df['tweet'].apply(preprocess_text)

# Check processed data
print(df[['tweet', 'processed_text']].head())


                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                                      processed_text  
0  rt mayasolovely as a woman you shouldnt compla...  
1  rt mleew17 boy dats coldtyga dwn bad for cuffi...  
2  rt urkindofbrand dawg rt 80sbaby4life you ever...  
3  rt c_g_anderson viva_based she look like a tranny  
4  rt shenikaroberts the shit you hear about me m...  


In [45]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Function to clean text deeply
def deep_preprocess_text(text):
    # Remove retweet markers (RT)
    text = re.sub(r'\brt\b', '', text)
    # Remove user mentions (@username)
    text = re.sub(r'@\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert text to lowercase
    text = text.lower()
    # Remove stopwords (optional)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
df['cleaned_text'] = df['processed_text'].apply(deep_preprocess_text)

# Preview the cleaned text
print(df[['processed_text', 'cleaned_text']].head())

                                      processed_text  \
0  rt mayasolovely as a woman you shouldnt compla...   
1  rt mleew17 boy dats coldtyga dwn bad for cuffi...   
2  rt urkindofbrand dawg rt 80sbaby4life you ever...   
3  rt c_g_anderson viva_based she look like a tranny   
4  rt shenikaroberts the shit you hear about me m...   

                                        cleaned_text  
0  mayasolovely woman shouldnt complain cleaning ...  
1  mleew17 boy dats coldtyga dwn bad cuffin dat h...  
2  urkindofbrand dawg 80sbaby4life ever fuck bitc...  
3           c_g_anderson viva_based look like tranny  
4  shenikaroberts shit hear might true might fake...  


### 3. Chunking

Since the texts in this dataset are all short sentences with clear boundaries, chunking is not necessary.

### 4. Apply Embedding Model

In [48]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Generate embeddings
def get_embedding(text):
    inputs = tokenizer(text, return_tensors = "pt", truncation = True, max_length = 512)
    with torch.no_grad():
        outputs = model(**inputs)
    return torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy()

# Generate embeddings for all text
df['embeddings'] = df['cleaned_text'].apply(get_embedding)

# Save embeddings for reference
df.to_pickle('data/hate_speech_embeddings.pkl')


In [49]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,processed_text,cleaned_text,embeddings
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,rt mayasolovely as a woman you shouldnt compla...,mayasolovely woman shouldnt complain cleaning ...,"[0.09181488, 0.13629913, 0.33594856, 0.0092703..."
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt mleew17 boy dats coldtyga dwn bad for cuffi...,mleew17 boy dats coldtyga dwn bad cuffin dat h...,"[-0.5987987, 0.2269165, 0.027160339, -0.192366..."
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt urkindofbrand dawg rt 80sbaby4life you ever...,urkindofbrand dawg 80sbaby4life ever fuck bitc...,"[-0.26544628, -0.0700567, 0.13491961, -0.18819..."
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt c_g_anderson viva_based she look like a tranny,c_g_anderson viva_based look like tranny,"[-0.26662692, 0.36353925, -0.3158098, 0.163906..."
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt shenikaroberts the shit you hear about me m...,shenikaroberts shit hear might true might fake...,"[-0.29505986, -0.123553015, -0.050794054, -0.0..."


In [59]:
# Sample 1% from the df to upsert due to the limit of Pinecone
split_index = int(len(df) * 0.01)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

upsert_df = df[:split_index]

### 5. Upsert to Pinecone

In [39]:
pc = Pinecone(api_key="XXXXX") # Blurred credentials

index_name = "hate-speech-embeddings"

# Create Index
pc.create_index(
    name=index_name,
    dimension=384, 
    metric="cosine",  
    spec=ServerlessSpec(
        cloud="aws",  
        region="us-east-1" 
    )
)

In [None]:
# Connect to the created index
index = pc.Index(index_name)

# Prepare vectors
vectors = [ 
    (str(i), upsert_df['embeddings'][i], {"label": int(upsert_df['hate_speech'][i])})
    for i in range(len(upsert_df))
]

# Upsert vectors
index.upsert(vectors)
print(f"Upserted {len(vectors)} vectors into the '{index_name}' index.")


Upserted 247 vectors into the 'hate-speech-embeddings' index.


### 6. Query Test

In [72]:
def find_similar_matches(df, query_text, index, top_k=5):
    """
    Function to query Pinecone for similar matches and return corresponding texts from the DataFrame.
    
    Parameters:
    - df: DataFrame containing the original data.
    - query_text: The input text to query for similarity.
    - index: Pinecone index object.
    - top_k: Number of top matches to retrieve (default is 5).
    
    Returns:
    - matched_texts: DataFrame of matched texts.
    """
    # Preprocess the query text
    processed_query = deep_preprocess_text(query_text)
    query_embedding = get_embedding(processed_query).tolist()
    
    # Query Pinecone for similarity
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    
    # Extract matched IDs from the query results
    matched_ids = [int(match['id']) for match in results['matches']]
    
    # Retrieve the corresponding texts from the DataFrame
    matched_texts = df.loc[matched_ids, ['cleaned_text']]
    
    return matched_texts

In [75]:
random_row = df.sample(1, random_state=111)
random_query_text = random_row['cleaned_text'].values[0]

# Call the function with the random query text
matched_texts = find_similar_matches(df, random_query_text, index)

# Display the matched texts
print("Query Text:", random_query_text)
print("Matched Texts:")
print(matched_texts)

Query Text: panda express trash wanted chinese didnt feel like gettin car
Matched Texts:
                                          cleaned_text
186                             midwest us white trash
222  baskgod main bitches saying hate send pic ass ...
196  consprcy_carrot blow shit get fatter white tra...
10             drake needs make mind rapper bitch haha
95   hate pulling stores niggas staring like never ...


This matches makes sense because they are all associated with racial discrimination.