# **Step 1: Data Collection (Reddit Scraping)**
We'll start by scraping a balanced dataset (1000 spoiler, 1000 non-spoiler comments) using your provided Reddit credentials via asyncpraw.

Goals:
* Target subreddits: r/movies, r/television, r/marvelstudios, r/MovieDetails
* Filter posts using keyword "spoiler"
* Identify spoilers via Reddit's `>!spoiler!<` markdown
* Save structured CSV with columns: Movie, Comment, Comment Type


### **Step 1.1: Install Required Libraries**

In [None]:
!pip install asyncpraw nest_asyncio pandas

### **Step 1.2: Reddit Scraper Code**

In [None]:
import asyncpraw
import pandas as pd
import nest_asyncio
import asyncio

import os

#patch asyncio so coroutines can be awaited from inside the notebook's own event loop
nest_asyncio.apply()


def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set. "
            "Export your Reddit API credentials before running this cell."
        )
    return value


#reddit credentials
#SECURITY: API secrets must never live in the notebook source. They are read
#from the environment instead; any credentials that were previously committed
#here should be treated as leaked and revoked in the Reddit app settings.
client_id = _require_env("REDDIT_CLIENT_ID")
client_secret = _require_env("REDDIT_CLIENT_SECRET")
user_agent = os.environ.get("REDDIT_USER_AGENT", "SpoilerShieldBot/0.1 by Temporary_Reason5148")

#asyncPRAW reddit instance
reddit = asyncpraw.Reddit(client_id=client_id,
                          client_secret=client_secret,
                          user_agent=user_agent)

#subreddits and keywords
subreddits = ["movies", "television", "marvelstudios", "MovieDetails"]
keyword = "spoiler"

#storage (filled in place by fetch_comments)
spoilers, non_spoilers = [], []

def _has_spoiler_markup(body):
    """Return True when *body* contains a non-empty ``>!...!<`` spoiler span."""
    start = body.find(">!")
    #the closing tag must appear after the opening tag with at least one
    #character in between, so stray "!<" text before ">!" does not count
    return start != -1 and body.find("!<", start + 3) != -1


async def fetch_comments(target_per_class=1000, search_limit=300):
    """Scrape comments into the module-level ``spoilers`` / ``non_spoilers`` lists.

    For every subreddit in ``subreddits`` the function searches for ``keyword``,
    walks the comments of each matching submission and appends one record per
    comment (keys ``Movie``, ``Comment``, ``Comment Type``) to the list that
    matches its class, until both lists hold ``target_per_class`` records.

    Args:
        target_per_class: Maximum number of comments collected per class.
        search_limit: Maximum number of search results requested per subreddit.
    """
    def quotas_met():
        return (len(spoilers) >= target_per_class
                and len(non_spoilers) >= target_per_class)

    for subreddit in subreddits:
        #stop visiting further subreddits once both classes are full
        if quotas_met():
            break

        print(f"Scraping subreddit: r/{subreddit}")
        subreddit_obj = await reddit.subreddit(subreddit)

        async for submission in subreddit_obj.search(keyword, limit=search_limit):
            title = submission.title
            await submission.load()
            #limit=0 drops the "load more comments" placeholders instead of fetching them
            await submission.comments.replace_more(limit=0)
            #NOTE(review): some asyncpraw releases may expose comments.list() as a
            #coroutine that needs `await` — confirm against the installed version
            for comment in submission.comments.list():
                #strip first so padding whitespace does not count towards the minimum length
                body = (comment.body or "").strip()
                if len(body) < 10:
                    continue

                if _has_spoiler_markup(body):
                    if len(spoilers) < target_per_class:
                        spoilers.append({"Movie": title, "Comment": body, "Comment Type": "Spoiler"})
                elif len(non_spoilers) < target_per_class:
                    non_spoilers.append({"Movie": title, "Comment": body, "Comment Type": "Non-Spoiler"})

                if quotas_met():
                    break

            if quotas_met():
                break

    print(f"Scraping complete: {len(spoilers)} spoilers, {len(non_spoilers)} non-spoilers")

#run the async task
await fetch_comments()

#combine and save to CSV
data = pd.DataFrame(spoilers + non_spoilers)
data.to_csv("spoiler_shield_dataset.csv", index=False)
print("Saved to spoiler_shield_dataset.csv")

# **Step 2: Data Preprocessing**
The goal is to clean and normalize the comment text for model training:
*   Remove spoiler markdown (`>!spoiler!<` → `spoiler`)
*   Remove unwanted characters, links, special symbols
*   Lowercase text

### **Step 2.1: Install Preprocessing Libraries**

In [None]:
!pip install nltk

### **Step 2.2: Clean and Normalize Text**

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

#download the NLTK stopword corpus and keep the English list as a set,
#which clean_comment uses for membership tests
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

#load dataset
#this is the raw comment CSV written by the Step 1 scraper cell
df = pd.read_csv("spoiler_shield_dataset.csv")

#preprocessing function
def clean_comment(text, stopword_set=None):
    """Normalize a raw Reddit comment into lowercase, stopword-free words.

    Args:
        text: Raw comment text. Non-string values (e.g. NaN coming from
            pandas) produce an empty string instead of raising.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned comment as a single space-separated string.
    """
    if not isinstance(text, str):
        return ''
    active_stopwords = stop_words if stopword_set is None else stopword_set

    #remove spoiler markdown >!text!< → text (DOTALL so multi-line spoilers are unwrapped too)
    text = re.sub(r'>!(.*?)!<', r'\1', text, flags=re.DOTALL)

    #remove links and markdown punctuation
    #line breaks are intentionally NOT deleted here: deleting them glued the last
    #word of one line to the first word of the next; they are collapsed to a
    #single space by the whitespace normalization below
    text = re.sub(r'http\S+|www\S+|[\*\[\]\(\)\{\}]', '', text)

    #remove special characters, keep only words
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    #lowercase and remove extra spaces
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)

    #remove stopwords
    words = [word for word in text.split() if word not in active_stopwords]
    return ' '.join(words)

#apply cleaning
df["Cleaned Comment"] = df["Comment"].apply(clean_comment)

#drop comments that cleaned down to nothing (only stopwords, links or symbols):
#an empty string is written to the CSV as a blank field and read back as NaN,
#which the later cells cannot feed to the sentence encoder
df = df[df["Cleaned Comment"].str.strip() != ""].reset_index(drop=True)

#check balance
print(df["Comment Type"].value_counts())

#save preprocessed dataset
df.to_csv("spoiler_shield_cleaned.csv", index=False)
print("Preprocessing complete. Saved as spoiler_shield_cleaned.csv")

# **Step 3: Contrastive Text Embedding**
We'll now create positive and negative text pairs from the cleaned dataset to train a contrastive model.
The goal is:

*   Positive pairs: same label (e.g., two spoiler comments)
*   Negative pairs: different labels (e.g., spoiler vs non-spoiler)
*   We'll use SentenceTransformer to train a model with a CosineSimilarityLoss.


### **Step 3.1: Install Required Libraries**

In [None]:
!pip install -U sentence-transformers

### **Step 3.2: Generate Contrastive Pairs**

In [None]:
from sentence_transformers import InputExample
import random

#load the cleaned dataset
df = pd.read_csv("spoiler_shield_cleaned.csv")


#convert rows into InputExample format
def generate_contrastive_pairs(df, max_pairs=1000):
    """Build labelled sentence pairs for contrastive training.

    For every index ``i`` three examples are produced: spoiler/spoiler and
    non-spoiler/non-spoiler (label 1.0), and spoiler/non-spoiler (label 0.0).

    Args:
        df: DataFrame with ``Comment Type`` ('Spoiler' / 'Non-Spoiler') and
            ``Cleaned Comment`` columns. It is not modified.
        max_pairs: Upper bound on the number of rows used per class.

    Returns:
        A list of ``InputExample`` objects, or an empty list when fewer than
        two usable comments exist for either class.
    """
    #rows whose cleaned text is missing or blank cannot be tokenized, so skip them
    usable = df.dropna(subset=['Cleaned Comment'])
    usable = usable[usable['Cleaned Comment'].astype(str).str.strip() != '']

    def shuffled_texts(label):
        #shuffle to ensure variety; plain lists avoid a per-row iloc lookup below
        texts = usable.loc[usable['Comment Type'] == label, 'Cleaned Comment']
        return texts.astype(str).sample(frac=1).tolist()

    spoiler_texts = shuffled_texts('Spoiler')
    non_spoiler_texts = shuffled_texts('Non-Spoiler')

    num_pairs = min(len(spoiler_texts), len(non_spoiler_texts), max_pairs)

    #with a single sample the "positive pair" would be a comment paired with itself
    if num_pairs < 2:
        print(f"Not enough data ({num_pairs} of each type) to create meaningful pairs.")
        return []

    examples = []
    for i in range(num_pairs):
        #neighbour index wraps around so the last row is paired with the first
        j = (i + 1) % num_pairs

        #positive pair (same class)
        examples.append(InputExample(texts=[spoiler_texts[i], spoiler_texts[j]], label=1.0))
        examples.append(InputExample(texts=[non_spoiler_texts[i], non_spoiler_texts[j]], label=1.0))

        #negative pair (different class)
        examples.append(InputExample(texts=[spoiler_texts[i], non_spoiler_texts[i]], label=0.0))

    print(f"Total pairs created: {len(examples)}")
    return examples

#generate and store pairs
train_examples = generate_contrastive_pairs(df)

In [None]:
from sentence_transformers import InputExample
import random
import pandas as pd
import numpy as np

#load the cleaned dataset
df = pd.read_csv("spoiler_shield_cleaned.csv")

#fill NaNs BEFORE converting to str: calling astype(str) first turns NaN into
#the literal text "nan", which then survives the empty-string filter below
df['Cleaned Comment'] = df['Cleaned Comment'].fillna('').astype(str).str.strip()

#filter out rows where 'Cleaned Comment' is an empty string after conversion
df = df[df['Cleaned Comment'] != ''].reset_index(drop=True)


#convert rows into InputExample format
def generate_contrastive_pairs(df, max_pairs=1000):
    """Build labelled sentence pairs for contrastive training.

    For every index ``i`` three examples are produced: spoiler/spoiler and
    non-spoiler/non-spoiler (label 1.0), and spoiler/non-spoiler (label 0.0).

    Args:
        df: DataFrame with ``Comment Type`` and ``Cleaned Comment`` columns.
            Labels are matched case-insensitively; the frame is not modified.
        max_pairs: Upper bound on the number of rows used per class.

    Returns:
        A list of ``InputExample`` objects, or an empty list when fewer than
        two comments exist for either class.
    """
    #normalize labels in a local Series so the caller's DataFrame is left untouched
    comment_type = df['Comment Type'].astype(str).str.lower().str.strip()

    def shuffled_texts(label):
        #shuffle to ensure variety; plain lists avoid a per-row iloc lookup below
        texts = df.loc[comment_type == label, 'Cleaned Comment']
        return texts.astype(str).sample(frac=1).tolist()

    spoiler_texts = shuffled_texts('spoiler')
    non_spoiler_texts = shuffled_texts('non-spoiler')

    num_pairs = min(len(spoiler_texts), len(non_spoiler_texts), max_pairs)

    #check if there are enough samples to create pairs
    if num_pairs < 2:
        print(f"Not enough data ({num_pairs} of each type) to create meaningful pairs.")
        return []

    examples = []
    for i in range(num_pairs):
        #use modulo to wrap around and ensure we always have a second example
        j = (i + 1) % num_pairs

        #positive pair (same class)
        examples.append(InputExample(texts=[spoiler_texts[i], spoiler_texts[j]], label=1.0))
        examples.append(InputExample(texts=[non_spoiler_texts[i], non_spoiler_texts[j]], label=1.0))

        #negative pair (different class)
        examples.append(InputExample(texts=[spoiler_texts[i], non_spoiler_texts[i]], label=0.0))

    print(f"Total pairs created: {len(examples)}")
    return examples

#generate and store pairs
train_examples = generate_contrastive_pairs(df)

# **Step 4: Model Training with Contrastive Loss**
We’ll now train a Sentence-BERT model using CosineSimilarityLoss, a perfect fit for contrastive learning. The goal is to embed similar comments close together in vector space and dissimilar ones far apart.

### **Step 4.1: Load Pretrained SentenceTransformer Model**
We’ll use distilbert-base-uncased as the backbone.

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader
import os

#disable WANDB for safety
os.environ["WANDB_DISABLED"] = "true"

#fail early with a clear message instead of an obscure error inside fit():
#pair generation yields an empty list when there is too little data
if not train_examples:
    raise ValueError("train_examples is empty; generate contrastive pairs before training.")

#limiting pairs (keeps training time bounded)
train_examples = train_examples[:2000]

#lightweight transformer base
word_embedding_model = models.Transformer('distilbert-base-uncased', max_seq_length=128)
#pooling layer sized to the transformer's token-embedding dimension
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

#dataLoader with moderate batch size
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

#train
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=100,
    show_progress_bar=True
)

#save model
model.save("spoiler-shield-contrastive-model")
print("Model trained and saved as 'spoiler-shield-contrastive-model'")

In [None]:
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd

#read the preprocessed comments
df = pd.read_csv("spoiler_shield_cleaned.csv")

#lowercase and trim the headers so the lookups below use one spelling
df.columns = df.columns.str.lower().str.strip()

#same normalization for the class labels
df['comment type'] = df['comment type'].str.lower().str.strip()

#row masks: one per class, plus a minimum cleaned-text length of more than 50 characters
is_long_enough = df['cleaned comment'].str.len() > 50
is_spoiler = df['comment type'] == 'spoiler'
is_non_spoiler = df['comment type'] == 'non-spoiler'

spoiler_texts = df.loc[is_spoiler & is_long_enough, 'cleaned comment'].tolist()
non_spoiler_texts = df.loc[is_non_spoiler & is_long_enough, 'cleaned comment'].tolist()

#restore the fine-tuned encoder from disk
model = SentenceTransformer("spoiler-shield-contrastive-model")

#embed every selected comment of each class
spoiler_embs = model.encode(spoiler_texts, convert_to_tensor=True, show_progress_bar=True)
non_spoiler_embs = model.encode(non_spoiler_texts, convert_to_tensor=True, show_progress_bar=True)

#one anchor per class: the mean embedding, kept 2-D via keepdim
spoiler_anchor = spoiler_embs.mean(dim=0, keepdim=True)
non_spoiler_anchor = non_spoiler_embs.mean(dim=0, keepdim=True)

#persist both anchors
torch.save(spoiler_anchor, "/content/spoiler_anchor.pt")
torch.save(non_spoiler_anchor, "/content/non_spoiler_anchor.pt")

print("Spoiler and Non-Spoiler anchors saved successfully.")


# **Step 5: Model Evaluation**

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import random

#load model
model = SentenceTransformer("spoiler-shield-contrastive-model")

#load preprocessed data
#NOTE(review): this is the same CSV the training pairs were built from, so the
#metrics below are measured on data the model may have seen — treat them as optimistic
df = pd.read_csv("spoiler_shield_cleaned.csv")

#rows with a missing cleaned comment cannot be encoded
df = df.dropna(subset=['Cleaned Comment'])

#create balanced test set
spoiler_pool = df[df['Comment Type'] == 'Spoiler']
non_spoiler_pool = df[df['Comment Type'] == 'Non-Spoiler']

#DataFrame.sample raises when asked for more rows than exist, so cap the size
n_test = min(300, len(spoiler_pool), len(non_spoiler_pool))
if n_test < 2:
    raise ValueError(f"Not enough data ({n_test} of each type) to build evaluation pairs.")

spoiler_samples = spoiler_pool.sample(n=n_test, random_state=42)
non_spoiler_samples = non_spoiler_pool.sample(n=n_test, random_state=42)

spoiler_eval_texts = spoiler_samples['Cleaned Comment'].astype(str).tolist()
non_spoiler_eval_texts = non_spoiler_samples['Cleaned Comment'].astype(str).tolist()

#construct test pairs
test_texts1, test_texts2, labels = [], [], []

for i in range(n_test):
    #neighbour index wraps around so the last row is paired with the first
    j = (i + 1) % n_test

    #positive: spoiler-spoiler
    test_texts1.append(spoiler_eval_texts[i])
    test_texts2.append(spoiler_eval_texts[j])
    labels.append(1)

    #positive: non-spoiler–non-spoiler
    test_texts1.append(non_spoiler_eval_texts[i])
    test_texts2.append(non_spoiler_eval_texts[j])
    labels.append(1)

    #negative: spoiler–non-spoiler
    test_texts1.append(spoiler_eval_texts[i])
    test_texts2.append(non_spoiler_eval_texts[i])
    labels.append(0)

#encode pairs
embeddings1 = model.encode(test_texts1, convert_to_tensor=True, batch_size=32)
embeddings2 = model.encode(test_texts2, convert_to_tensor=True, batch_size=32)

#compute cosine similarities row by row; this avoids building the full
#N x N similarity matrix only to read its diagonal
cosine_scores = util.pairwise_cos_sim(embeddings1, embeddings2).cpu().numpy()

#convert similarity to binary predictions (threshold = 0.5)
preds = [1 if score >= 0.5 else 0 for score in cosine_scores]

#print metrics
print("Evaluation Metrics")
print("Accuracy :", accuracy_score(labels, preds))
print("Precision:", precision_score(labels, preds))
print("Recall   :", recall_score(labels, preds))
print("F1-score :", f1_score(labels, preds))


# **Step 6: Real-Time Spoiler Detector (Python Function)**

In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

#ensure NLTK stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

#load trained model
model = SentenceTransformer('spoiler-shield-contrastive-model')

#load cleaned dataset for anchor embeddings
df = pd.read_csv("spoiler_shield_cleaned.csv")

#rows with a missing cleaned comment cannot be encoded
df = df.dropna(subset=["Cleaned Comment"])

#get balanced anchors
spoiler_pool = df.loc[df["Comment Type"] == "Spoiler", "Cleaned Comment"].astype(str)
non_spoiler_pool = df.loc[df["Comment Type"] == "Non-Spoiler", "Cleaned Comment"].astype(str)

#sample() raises when asked for more rows than exist, so cap at the smaller class
n_anchors = min(100, len(spoiler_pool), len(non_spoiler_pool))
if n_anchors == 0:
    raise ValueError("Need at least one spoiler and one non-spoiler comment to build anchors.")

spoiler_anchors = spoiler_pool.sample(n=n_anchors, random_state=42).tolist()
non_spoiler_anchors = non_spoiler_pool.sample(n=n_anchors, random_state=42).tolist()

#embed the anchors once; predict_spoiler compares new inputs against them
spoiler_embeddings = model.encode(spoiler_anchors, convert_to_tensor=True)
non_spoiler_embeddings = model.encode(non_spoiler_anchors, convert_to_tensor=True)

#preprocessing function
def clean_comment(text, stopword_set=None):
    """Normalize a raw comment into lowercase, stopword-free words.

    Args:
        text: Raw comment text. Non-string values produce an empty string
            instead of raising.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned comment as a single space-separated string.
    """
    if not isinstance(text, str):
        return ''
    active_stopwords = stop_words if stopword_set is None else stopword_set

    #unwrap >!spoiler!< markup (DOTALL so multi-line spoilers are unwrapped too)
    text = re.sub(r'>!(.*?)!<', r'\1', text, flags=re.DOTALL)
    #drop links and markdown punctuation; line breaks are kept so that words on
    #adjacent lines are not glued together (they collapse to one space below)
    text = re.sub(r'http\S+|www\S+|[\*\[\]\(\)\{\}]', '', text)
    #keep letters and whitespace only
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    words = [word for word in text.split() if word not in active_stopwords]
    return ' '.join(words)

#spoiler prediction function
def predict_spoiler(user_input):
    """Classify a raw comment as spoiler or non-spoiler.

    The comment is cleaned with ``clean_comment``, embedded with ``model`` and
    compared against the spoiler and non-spoiler anchor embeddings. Both mean
    cosine similarities are printed.

    Args:
        user_input: Raw comment text.

    Returns:
        "Predicted: **Spoiler**" when the spoiler score is strictly higher,
        otherwise "Predicted: **Non-Spoiler**".
    """
    query_embedding = model.encode(clean_comment(user_input), convert_to_tensor=True)

    #mean cosine similarity of the input against every anchor of each class
    spoiler_score = util.cos_sim(query_embedding, spoiler_embeddings).mean()
    non_spoiler_score = util.cos_sim(query_embedding, non_spoiler_embeddings).mean()

    print(f"Spoiler score: {spoiler_score:.4f}")
    print(f"Non-spoiler score: {non_spoiler_score:.4f}")

    #a tie falls through to the non-spoiler label
    looks_like_spoiler = spoiler_score > non_spoiler_score
    return "Predicted: **Spoiler**" if looks_like_spoiler else "Predicted: **Non-Spoiler**"


In [None]:
#quick manual check of the detector on a sample sentence
comment = "I like the movie director"
print(predict_spoiler(comment))

# **Step 7: Deployment**

### **Step 7.1: Prepare Your Colab Environment**

Install Streamlit & pyngrok

In [None]:
!pip install streamlit pyngrok --quiet

### **Step 7.2: Save Your Streamlit App Script**

Create a new file called app.py:

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import re
import string
import nltk
from nltk.corpus import stopwords

#download stopwords
#the English list is kept as a set, which clean_text uses for membership tests
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

#helper function for safe sampling
def safe_sample(df, n):
    """Return up to *n* randomly sampled rows of *df*.

    The request is capped at the number of rows available. When the capped
    size is not positive, a Streamlit warning is shown and an empty DataFrame
    is returned.
    """
    sample_size = min(n, len(df))
    if sample_size <= 0:
        st.warning("Cannot sample from empty dataframe")
        return pd.DataFrame()
    return df.sample(n=sample_size)

#Streamlit re-executes this script from the top on every widget interaction,
#so the expensive steps (CSV read, model load, anchor encoding) are wrapped in
#cached helpers; without caching they would repeat on each button click
@st.cache_data
def load_dataset(path):
    """Read the cleaned comments CSV; the result is cached across reruns."""
    return pd.read_csv(path)


@st.cache_resource
def load_model(path):
    """Load the fine-tuned SentenceTransformer once per server process."""
    return SentenceTransformer(path)


@st.cache_resource
def encode_anchors(texts):
    """Embed a tuple of anchor comments; cached per distinct tuple of texts."""
    encoder = load_model("spoiler-shield-contrastive-model")
    return encoder.encode(list(texts), convert_to_tensor=True)


#load dataset
df = load_dataset("spoiler_shield_cleaned.csv")

#debug prints for dataset info
st.write("Dataset loaded with shape:", df.shape)

#filter spoiler/non-spoiler DataFrames
non_spoiler_df = df[df["Comment Type"].str.lower() == "non-spoiler"]
spoiler_df = df[df["Comment Type"].str.lower() == "spoiler"]

st.write("Spoiler comments count:", len(spoiler_df))
st.write("Non-spoiler comments count:", len(non_spoiler_df))

#check empty
if spoiler_df.empty:
    st.error("No spoiler comments found in the dataset.")
    st.stop()

if non_spoiler_df.empty:
    st.error("No non-spoiler comments found in the dataset.")
    st.stop()

#Optionally safely sample some anchors if needed (otherwise just get all)
#spoiler_sample_df = safe_sample(spoiler_df, 1000)
#non_spoiler_sample_df = safe_sample(non_spoiler_df, 1000)

#but if you want all data as anchors:
spoiler_anchors = spoiler_df["Cleaned Comment"].dropna().tolist()
non_spoiler_anchors = non_spoiler_df["Cleaned Comment"].dropna().tolist()

if not spoiler_anchors:
    st.error("Cleaned spoiler comments list is empty.")
    st.stop()

if not non_spoiler_anchors:
    st.error("Cleaned non-spoiler comments list is empty.")
    st.stop()

#load model once (served from the resource cache on reruns)
model = load_model("spoiler-shield-contrastive-model")

#encode embeddings once (tuples are passed so the cached helper receives an immutable argument)
spoiler_embeddings = encode_anchors(tuple(spoiler_anchors))
non_spoiler_embeddings = encode_anchors(tuple(non_spoiler_anchors))

#text cleaning function
def clean_text(text, stopword_set=None):
    """Normalize a comment exactly as the training data was normalized.

    The steps mirror the notebook's ``clean_comment`` preprocessing (unwrap
    spoiler markup, drop links, keep letters only, lowercase, collapse
    whitespace, drop stopwords) so that user input is embedded from the same
    text distribution as the anchor comments.

    Args:
        text: Raw comment text. Non-string values produce an empty string.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned comment as a single space-separated string.
    """
    if not isinstance(text, str):
        return ''
    active_stopwords = stop_words if stopword_set is None else stopword_set

    #unwrap >!spoiler!< markup (DOTALL so multi-line spoilers are unwrapped too)
    text = re.sub(r'>!(.*?)!<', r'\1', text, flags=re.DOTALL)
    #drop links and markdown punctuation
    text = re.sub(r'http\S+|www\S+|[\*\[\]\(\)\{\}]', '', text)
    #keep letters and whitespace only (digits and symbols were removed from the training text too)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    return ' '.join(word for word in text.split() if word not in active_stopwords)

#spoiler prediction function
def predict_spoiler(comment):
    """Score a comment against the spoiler and non-spoiler anchors.

    Args:
        comment: Raw comment text entered by the user.

    Returns:
        A ``(label, spoiler_sim, non_spoiler_sim)`` tuple where the
        similarities are mean cosine similarities as Python floats and the
        label is "Spoiler" only when the spoiler similarity is strictly higher.
    """
    query = model.encode(clean_text(comment), convert_to_tensor=True)

    def mean_similarity(anchor_embeddings):
        #average cosine similarity of the query against one anchor set
        return util.cos_sim(query, anchor_embeddings).mean().item()

    spoiler_sim = mean_similarity(spoiler_embeddings)
    non_spoiler_sim = mean_similarity(non_spoiler_embeddings)

    if spoiler_sim > non_spoiler_sim:
        label = "Spoiler"
    else:
        label = "Non-Spoiler"
    return label, spoiler_sim, non_spoiler_sim

#streamlit UI: title, instructions, input box and a predict button
st.title("Spoiler Shield with NLP")
st.markdown("Enter a comment below to detect if it contains spoilers.")

user_input = st.text_area("Enter a comment:")

predict_clicked = st.button("Predict")
if predict_clicked and not user_input.strip():
    #nothing but whitespace was entered
    st.warning("Please enter a comment first.")
elif predict_clicked:
    label, sim_spoiler, sim_nonspoiler = predict_spoiler(user_input)
    st.markdown(f"### Prediction: `{label}`")
    st.write(f"**Spoiler Similarity:** {sim_spoiler:.4f}")
    st.write(f"**Non-Spoiler Similarity:** {sim_nonspoiler:.4f}")


In [None]:
#open Colab's file-upload dialog; the result is kept in `uploaded`
#NOTE(review): presumably used to bring the dataset/model files into the runtime — confirm
from google.colab import files
uploaded = files.upload()


### **Step 7.3: Launch the Streamlit App with pyngrok**

Now let's create a public demo link:

In [None]:
import os
import time
from pyngrok import ngrok

#kill any existing tunnels
ngrok.kill()

#SECURITY: the ngrok auth token must never live in the notebook source. It is
#read from the environment instead; a token that was previously committed here
#should be treated as leaked and rotated in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTH_TOKEN")
if not ngrok_token:
    raise RuntimeError("Set the NGROK_AUTH_TOKEN environment variable before launching the tunnel.")
ngrok.set_auth_token(ngrok_token)

#start Streamlit in the background
os.system("streamlit run app.py &")

#give Streamlit time to start
time.sleep(5)

#connect ngrok to the default Streamlit port (8501)
public_url = ngrok.connect("http://localhost:8501", bind_tls=True)
print(f"Your Streamlit app is live at: {public_url}")

In [None]:
#!streamlit run app.py &

In [None]:
#mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')


In [None]:
#!zip -r spoiler-shield-nlp.zip /content/spoiler-shield-nlp/spoiler-shield-nlp

In [None]:
#from google.colab import files
#files.download('spoiler-shield-nlp.zip')
