In [94]:
# explicit_content_classifier.py

import pandas as pd
import re
import string
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Download required resources
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [120]:
from datasets import load_dataset

ds = load_dataset("valurank/Explicit_content")

In [126]:
df = ds['train'].to_pandas()

In [138]:
df.loc[df['Category'] == 'Explicit', 'Category'] = 1
df.loc[df['Category'] == 'Not_Explicit', 'Category'] = 0
df['Category'] = df['Category'].astype(int)

df

Unnamed: 0,Article,Category,clean_text
0,"MADISON, Wis. -- A judge sentenced a former Wi...",1,madison wis judge sentence former wisconsin ...
1,Quick-thinking bystandders in southern Califor...,1,quickthinke bystandder southern california hel...
2,Tulsi Gabbard pushes back against Jack White’s...,1,tulsi gabbard push back jack white antitrump c...
3,WASHINGTON — President Joe Biden will establis...,1,washington president joe biden establish nat...
4,"Ecuador’s president, Guillermo Lasso, has decl...",1,ecuadors president guillermo lasso declare day...
...,...,...,...
1184,"ANAHEIM, Calif. -- Miguel Rojas doubled and sc...",0,anaheim calif miguel rojas double score mich...
1185,"Last year, it might have been Mitch Haninger, ...",0,last year might mitch haninger mariner long te...
1186,At least two of the fake Republican electors w...,0,least two fake republican elector instal forme...
1187,IPFS is a decentralized network that makes it ...,0,ipfs decentralized network make possible effic...


In [128]:
# Preprocessing function
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and token.text not in string.punctuation]
    return ' '.join(tokens)

In [139]:
# Load your dataset (replace with your actual file)
#df = pd.read_csv("sample_1000.csv")
print(df.size)
df.drop_duplicates(subset=['Article'], inplace=True)
print(df.size)
df['clean_text'] = df['Article'].apply(preprocess)

3216
3216


In [140]:
df.size

3216

In [146]:
print(df['Category'].value_counts())

Category
0    757
1    315
Name: count, dtype: int64


In [147]:
# Feature extraction with TF-IDF (removed min_df/max_df to avoid conflicts)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # default min_df=1, max_df=1.0
X = vectorizer.fit_transform(df['clean_text'])
y = df['Category']

In [148]:
y

0       1
1       1
2       1
3       1
4       1
       ..
1184    0
1185    0
1186    0
1187    0
1188    0
Name: Category, Length: 1072, dtype: int64

In [149]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM model
clf = SVC(kernel='linear', C=100)
clf.fit(X_train, y_train)

In [162]:
# Evaluate the model
y_pred = clf.predict(X_test)
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=['positive', 'negative']))

# Prediction function for new text
def detect_explicit(text):
    processed = preprocess(text)
    vect = vectorizer.transform([processed])
    prediction = clf.predict(vect)[0]

    return "\U0001F6A9 Positive" if prediction == 1 else "\u2705 Negative"

# Sample predictions
print("\n--- Sample Predictions ---")
sample1 = "She trembled as he gently ran his fingers across her skin under the candlelight."
sample2 = "IT was a fun fun night"
print(f"Text: {sample1}\nPrediction: {detect_explicit(sample1)}\n")
print(f"Text: {sample2}\nPrediction: {detect_explicit(sample2)}")




--- Classification Report ---
              precision    recall  f1-score   support

    positive       0.86      0.96      0.91       151
    negative       0.87      0.62      0.73        64

    accuracy                           0.86       215
   macro avg       0.86      0.79      0.82       215
weighted avg       0.86      0.86      0.85       215


--- Sample Predictions ---
Text: She trembled as he gently ran his fingers across her skin under the candlelight.
Prediction: ✅ Negative

Text: IT was a fun fun night
Prediction: ✅ Negative


In [171]:
# Prediction function for new text
def detect_explicit(text):
    if not text.strip():
        return "⚠️ Empty input"

    processed = preprocess(text)
    vect = vectorizer.transform([processed])
    prediction = clf.predict(vect)[0]

    return "🚩 Explicit" if prediction == 1 else "✅ Not Explicit"

# Sample predictions
print("\n--- Sample Predictions ---")
samples = [
    "She trembled as he gently ran his fingers across her skin under the candlelight.",
    "IT was a fun fun night because we were all having sex with each other",
    "I love reading books at the beachand having sex.",
    ""
]

for idx, text in enumerate(samples, 1):
    print(f"Sample {idx}:")
    print(f"Text      : {text}")
    print(f"Prediction: {detect_explicit(text)}")
    print("-" * 60)



--- Sample Predictions ---
Sample 1:
Text      : She trembled as he gently ran his fingers across her skin under the candlelight.
Prediction: ✅ Not Explicit
------------------------------------------------------------
Sample 2:
Text      : IT was a fun fun night because we were all having sex with each other
Prediction: 🚩 Explicit
------------------------------------------------------------
Sample 3:
Text      : I love reading books at the beachand having sex.
Prediction: 🚩 Explicit
------------------------------------------------------------
Sample 4:
Text      : 
Prediction: ⚠️ Empty input
------------------------------------------------------------


In [172]:
df.size

3216

In [158]:
# Recreate the test texts (since you only have vectorized versions in X_test)
# Get corresponding raw articles for X_test
X_texts = df['Article'].tolist()
clean_texts = df['clean_text'].tolist()
y_all = df['Category'].tolist()

# First split raw clean text and labels just like X, y
X_train_texts, X_test_texts, y_train_list, y_test_list = train_test_split(
    X_texts, y_all, test_size=0.2, random_state=42
)

# Print predictions one by one
for i in range(len(X_test_texts)):
    print(f"Sample {i+1}")
    print("Original Article :", X_test_texts[i])
    print("True Label       :", y_test_list[i])
    print("Predicted Label  :", y_pred[i])
    print("-" * 60)


Sample 1
Original Article : 2023 Women's World Cup odds: Bettors cash in on New Zealand upsetting Norway
It didn’t take long for an upset to register in the Women’s World Cup odds market. In fact, it took only two matches to get a surprise result, coming on the 32-team tournament’s first day.
Adam Pullen, assistant director of trading for Caesars Sports, and BetMGM sports trader Seamus Magee, discuss how the Day 1 upset impacted World Cup betting.
Underdog day
Norway entered the Group A match as a -250 favorite (bet $10 to win $14 total) on BetMGM’s three-way moneyline, on which bettors can wager on either team to win in 90 minutes plus injury time or wager on the game ending in a draw. On the flip side, New Zealand was a sizable +650 underdog (bet $10 to win $75 total).
"It was quite a shock to start the tournament," Magee said. "It feels like upsets of this magnitude in the tournament are few and far between."
The first half brought no scoring, but it didn’t take long into the second