In [5]:
import requests
import pandas as pd
import time
import random

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
]

scrape_configs = [
    {'sort': 'top', 't': 'all'},      
    {'sort': 'top', 't': 'year'},
    {'sort': 'controversial', 't': 'all'}
]

posts_collected = {}


for config in scrape_configs:
    sort_type = config['sort']
    time_filter = config['t']
    url = f"https://old.reddit.com/r/AmItheAsshole/{sort_type}.json"
    
    after_token = None
    
    for i in range(15): 
        current_agent = random.choice(user_agents)
        headers = {'User-Agent': current_agent}
        params = {'limit': 100, 't': time_filter, 'after': after_token}
        
        try:
            response = requests.get(url, headers=headers, params=params)
            
            if response.status_code != 200:
                time.sleep(5)
                continue
            
            data = response.json()
            children = data['data']['children']
            
            if not children:
                break
                
            for post in children:
                p = post['data']
                pid = p.get('id')
                flair = p.get('link_flair_text')
                
                valid_flairs = ["Not the A-hole", "Asshole", "Everyone Sucks", "No A-holes here"]
                
                if flair in valid_flairs:
                    full_text = f"{p.get('title')} {p.get('selftext')}"
                    
                    if "[removed]" not in full_text and "[deleted]" not in full_text and len(full_text) > 50:
                        posts_collected[pid] = {
                            "text": full_text,
                            "label": flair,
                            "id": pid
                        }
            
            after_token = data['data']['after']
            if not after_token:
                break
            
            time.sleep(random.uniform(2, 5))
            
        except Exception:
            break

df = pd.DataFrame(list(posts_collected.values()))
df = df.dropna(subset=['text', 'label'])
df.to_csv("aita_clean_dataset.csv", index=False)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv("aita_clean_dataset.csv")
df = df.dropna(subset=['text', 'label'])
df = df[df['label'] != "No A-holes here"]

binary_map = {
    "Not the A-hole": 0,
    "Asshole": 1,
    "Everyone Sucks": 1 
}
df['label_id'] = df['label'].map(binary_map)
df = df.dropna(subset=['label_id'])

X_train, X_test, y_train, y_test = train_test_split(
    df['text'], 
    df['label_id'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label_id']
)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Naive Bayes")
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
nb_pred = nb.predict(X_test_vec)
print(f"Accuracy: {accuracy_score(y_test, nb_pred):.4f}")
print(classification_report(y_test, nb_pred))

print("\nLogistic Regression")
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
lr.fit(X_train_vec, y_train)
lr_pred = lr.predict(X_test_vec)
print(f"Accuracy: {accuracy_score(y_test, lr_pred):.4f}")
print(classification_report(y_test, lr_pred))

Naive Bayes
Accuracy: 0.5694
              precision    recall  f1-score   support

           0       0.56      0.99      0.72        79
           1       0.80      0.06      0.11        65

    accuracy                           0.57       144
   macro avg       0.68      0.52      0.41       144
weighted avg       0.67      0.57      0.44       144


Logistic Regression
Accuracy: 0.6111
              precision    recall  f1-score   support

           0       0.64      0.67      0.65        79
           1       0.57      0.54      0.56        65

    accuracy                           0.61       144
   macro avg       0.61      0.60      0.60       144
weighted avg       0.61      0.61      0.61       144



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

filename = "aita_clean_dataset.csv"
df = pd.read_csv(filename)
df = df.dropna(subset=['text', 'label'])
df = df[~df['text'].str.contains(r"\[removed\]", na=False)]
df = df[~df['text'].str.contains(r"\[deleted\]", na=False)]
df = df[df['text'].str.len() > 50]

df = df[df['label'] != "No A-holes here"]

binary_map = {
    "Not the A-hole": 0,
    "Asshole": 1,
    "Everyone Sucks": 1 
}
df['label_id'] = df['label'].map(binary_map)

df = df.dropna(subset=['label_id'])
df['label_id'] = df['label_id'].astype(int)


train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label_id'], random_state=42)

train_dataset = Dataset.from_pandas(train_df[['text', 'label_id']].rename(columns={'label_id': 'label'}))
val_dataset = Dataset.from_pandas(val_df[['text', 'label_id']].rename(columns={'label_id': 'label'}))

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

train_tokenized = train_dataset.map(tokenize_function, batched=True)
val_tokenized = val_dataset.map(tokenize_function, batched=True)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

training_args = TrainingArguments(
    output_dir='./results_final',
    num_train_epochs=4,              
    per_device_train_batch_size=8,   
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
)

trainer.train()


results = trainer.evaluate()
print(f"Final Accuracy: {results['eval_accuracy']:.4f}")

predictions = trainer.predict(val_tokenized)
preds = np.argmax(predictions.predictions, axis=-1)
target_names = ["Not Asshole (0)", "Asshole (1)"]
print(classification_report(val_tokenized['label'], preds, target_names=target_names))

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.67522,0.548611
2,No log,0.613498,0.680556
3,No log,0.609571,0.673611
4,No log,0.631327,0.680556




Final Accuracy: 0.6806




                 precision    recall  f1-score   support

Not Asshole (0)       0.73      0.67      0.70        79
    Asshole (1)       0.63      0.69      0.66        65

       accuracy                           0.68       144
      macro avg       0.68      0.68      0.68       144
   weighted avg       0.68      0.68      0.68       144

