# Project Deliverable 2,3

# Social Media Post Recommender System

# 1. Preprocessing of Dataset

**1.1 Reading Dataset From CSV file**

In [3]:
import html
import re

import numpy as np
import pandas as pd

# Load the raw Reddit comment dataset from the CSV file in the working directory.
df = pd.read_csv("G2FinalDatasetReddit.csv")
# Leave the preview as the cell's last expression so Jupyter renders it as a
# rich table (consistent with the other cells) instead of printing plain text.
df.head()


       subreddit                                               body  \
0  gameofthrones  Your submission has been automatically removed...   
1            aww  Dont squeeze her with you massive hand, you me...   
2         gaming  It's pretty well known and it was a paid produ...   
3           news  You know we have laws against that currently c...   
4       politics  Yes, there is a difference between gentle supp...   

   controversiality  score  
0                 0      1  
1                 0     19  
2                 0      3  
3                 0     10  
4                 0      1  


**1.2 Checking Null or Missing Values in Dataset** 

In [2]:
# Per-column count of missing values; display only the columns that have any.
missingdf = df.isna().sum()
missingdf.loc[missingdf.gt(0)]


Series([], dtype: int64)

**1.3 Cleaning the `body` Column, Which Contains the Comment Text**

In [4]:
def clean_text(text):
    """Normalize one raw comment into lowercase, letters-only text.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs, drop
    ``<...>`` tags, decode HTML entities, drop every character that is not
    ``a-z`` or whitespace, then collapse whitespace runs and trim.

    Parameters
    ----------
    text : object
        Raw comment. Missing values (``None``, ``NaN``, ``pd.NA``) yield an
        empty string; other non-string values are converted with ``str()``
        instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Tolerate stray numeric/bool cells in a mixed-type column.
        text = str(text)
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Decode entities such as "&amp;" / "&gt;" / "&nbsp;" so they do not survive
    # as junk tokens ("amp", "gt", "nbsp"). This runs after tag removal so that a
    # decoded "&lt; ... &gt;" pair is not mistaken for a tag and deleted together
    # with the text between it.
    text = html.unescape(text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner over every raw comment and keep the result in a new column.
df['clean_body'] = df['body'].map(clean_text)
df['clean_body']

0         your submission has been automatically removed...
1         dont squeeze her with you massive hand you mea...
2         its pretty well known and it was a paid produc...
3         you know we have laws against that currently c...
4         yes there is a difference between gentle suppr...
                                ...                        
160394    lol im a spanish teacher and a kid spoiled it ...
160395                                    yup glad hes good
160396    i can literally see your shit eating grin afte...
160397    if this is a dank meme upvote this comment if ...
160398    the persecution of italianamericans persists e...
Name: clean_body, Length: 160399, dtype: object

In [6]:
# Keep only comments whose cleaned text has at least 10 characters, then
# renumber the rows from zero.
df = df.loc[lambda frame: frame['clean_body'].str.len() >= 10].reset_index(drop=True)
df

Unnamed: 0,subreddit,body,controversiality,score,clean_body
0,gameofthrones,Your submission has been automatically removed...,0,1,your submission has been automatically removed...
1,aww,"Dont squeeze her with you massive hand, you me...",0,19,dont squeeze her with you massive hand you mea...
2,gaming,It's pretty well known and it was a paid produ...,0,3,its pretty well known and it was a paid produc...
3,news,You know we have laws against that currently c...,0,10,you know we have laws against that currently c...
4,politics,"Yes, there is a difference between gentle supp...",0,1,yes there is a difference between gentle suppr...
...,...,...,...,...,...
159650,memes,lol i’m a spanish teacher and a kid spoiled it...,0,68,lol im a spanish teacher and a kid spoiled it ...
159651,nba,Yup. Glad he's good.,0,1,yup glad hes good
159652,wallstreetbets,I can literally see your shit eating grin afte...,0,2,i can literally see your shit eating grin afte...
159653,dankmemes,"If this is a dank meme, **Upvote** this commen...",0,1,if this is a dank meme upvote this comment if ...


**1.4 Saving Cleaned Dataset into New CSV File**

In [None]:
# Persist the cleaned frame without writing the row index as a column.
df.to_csv(path_or_buf="Cleaned_Reddit_Comments.csv", index=False)

**1.5 Check for empty strings in 'body' and 'clean_body'**

In [None]:
# Count rows whose text is empty once surrounding whitespace is stripped,
# for both the raw and the cleaned column.
empty_strings = df['body'].str.strip().eq('').sum()
empty_clean_strings = df['clean_body'].str.strip().eq('').sum()

print("Empty 'body' entries:", empty_strings)
print("Empty 'clean_body' entries:", empty_clean_strings)


**1.6 Rows with special characters**

In [None]:
import re

# Sanity check: select cleaned comments that still contain anything other than
# lowercase letters or whitespace, and report how many there are.
special_char_rows = df.loc[df['clean_body'].str.contains(r'[^a-z\s]', regex=True)]
print(" TotalRows with special characters:", special_char_rows.shape[0])


**1.7 Look for 'http' or 'www' in clean_body**

In [None]:
# Sanity check: count cleaned comments that still contain URL remnants.
# The alternation uses a non-capturing group because pandas emits a UserWarning
# ("pattern ... has match groups") when str.contains receives a capturing group.
# NOTE(review): clean_text removes every character outside a-z/whitespace, so a
# literal '.' cannot occur in clean_body and only the `http` branch can match
# here — confirm whether plain `www` should be checked instead.
url_rows = df[df['clean_body'].str.contains(r'(?:http|www\.)', regex=True)]
print("Total Rows with URLs:", len(url_rows))


# 2. Embedding Using an LLM

**2.1 Installing the transformers and torch libraries**

In [None]:
# One-time setup, left commented out so a normal run does not reinstall packages.
# Uncomment on a fresh environment (consider `%pip` with pinned versions so the
# install targets the running kernel's environment).
#!pip install transformers
#!pip install torch

**2.2 Importing torch and transformers and Collecting the Cleaned Texts**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Plain Python list of the cleaned comments, each coerced to str, as input for
# the embedding step.
texts = [str(comment) for comment in df['clean_body']]



**2.3 Initializing the Tokenizer and Choosing the Model for Embeddings**

In [None]:
# Checkpoint identifier passed to both from_pretrained calls below, so the
# tokenizer and the model are loaded from the same checkpoint.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# The tokenizer turns raw text into model inputs; the model produces the
# per-token hidden states that get_embeddings pools into one vector per text.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)



**2.4 Checking Whether CUDA Is Available**

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, and move
# the model's weights onto the chosen device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)



**2.5 Method For Embeddings**

In [None]:
def get_embeddings(text_list, batch_size=32):
    """Embed texts with the module-level ``tokenizer`` / ``model`` on ``device``.

    Texts are tokenized and run through the model in batches instead of one
    at a time, so ``padding=True`` actually pads each batch to a common length
    and the model is invoked once per batch. Each embedding is the
    attention-mask-weighted mean of the last hidden states, so padding tokens
    do not contribute to the average.

    Parameters
    ----------
    text_list : iterable of str
        Texts to embed.
    batch_size : int, optional
        Number of texts per forward pass (default 32). Pass 1 to process the
        texts one by one.

    Returns
    -------
    list of numpy.ndarray
        One 2-D array with a single row per input text, in input order, so
        ``np.vstack`` on the result yields one row per text.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize once so any iterable can be sliced into batches.
    all_texts = list(text_list)
    embeddings = []
    for start in range(0, len(all_texts), batch_size):
        batch = all_texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            last_hidden = outputs.last_hidden_state
            # Broadcast the attention mask over the hidden dimension so that
            # padded positions are zeroed out before summing.
            mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden.size()).float()
            summed = torch.sum(last_hidden * mask, 1)
            # Clamp guards against division by zero for an all-padding row.
            summed_mask = torch.clamp(mask.sum(1), min=1e-9)
            pooled = (summed / summed_mask).cpu().numpy()
        # Keep the original return shape: one single-row array per text.
        embeddings.extend(pooled[i:i + 1] for i in range(len(pooled)))
    return embeddings



**2.6 Calling Embeddings Method and Storing Embeddings**

In [None]:
import numpy as np

# Embed every cleaned comment, stack the per-text rows into a single 2-D array,
# and persist it to disk for later use.
embeddings = np.vstack(get_embeddings(texts))
np.save('reddit_embeddings.npy', embeddings)

print("Embeddings shape:", embeddings.shape)
