In [2]:
# Imports
import requests
import json
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
import multiprocessing
import numpy as np
import torch
from torch_geometric.utils.convert import from_networkx
import networkx as nx
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device:", device)

# Pretrained uncased BERT-base: the tokenizer, and the encoder placed on `device`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)

# Function to get BERT embeddings
def get_bert_embedding(text, model, tokenizer):
    """Return a mean-pooled embedding of ``text`` from the model's last layer.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``model`` without gradient tracking, and the last hidden states are
    averaged over the real tokens only.

    Args:
        text: The string to embed. A list of strings is also accepted by the
            tokenizer; it is then padded to the longest item of the batch.
        model: The encoder; its output must expose ``last_hidden_state``.
        tokenizer: Callable producing the model inputs, including an
            ``attention_mask``, and supporting ``.to(device)``.

    Returns:
        A NumPy array. For a single string this is a 1-D vector whose length
        is the model's hidden size.
    """
    # Use the device the model actually lives on instead of a module global,
    # so inputs and weights can never end up on different devices.
    model_device = next(model.parameters()).device

    # padding=True pads only up to the longest sequence in the batch, i.e. a
    # single text is not padded at all. Padding everything to 512 tokens made
    # each forward pass as expensive as possible for no benefit.
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True,
    ).to(model_device)

    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        # Weight every position by its attention mask so that padding
        # positions do not leak into the average.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled = token_sum / token_count

    embeddings = pooled.squeeze().cpu().numpy()
    return embeddings

# Input files, one per class: pos_path is read with pos=True, neg_path with pos=False.
pos_path, neg_path = '/path/to/incel_users.txt', '/path/to/non_incel_users.txt'

# Module-level accumulators that read_users() appends user texts to.
incel_texts, non_incel_texts = [], []

# Function to read users from files
def read_users(filepath, pos=True):
    """Parse a user text file and append one string per user to a global list.

    A line containing the marker ``'~: '`` starts a new user record. Every
    following non-marker line is joined onto that record, separated by a
    single space. Blank lines are skipped, so they can neither create a
    whitespace-only record nor add stray spaces to one.

    Args:
        filepath: Path of the text file to read; it is decoded as UTF-8.
        pos: When true, records are appended to the module-level
            ``incel_texts``; otherwise to ``non_incel_texts``.

    Returns:
        None. The result is the appends made to the selected global list.
    """
    target = incel_texts if pos else non_incel_texts
    parts = []  # stripped lines of the record currently being assembled

    # Explicit encoding: without it the decoding depends on the platform's
    # locale (e.g. a legacy code page on Windows) and can garble the text.
    with open(filepath, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if '~: ' in line:
                # A marker line closes the previous record, if any, and
                # becomes the first line of the next one.
                if parts:
                    target.append(" ".join(parts))
                parts = [line]
            else:
                parts.append(line)

    # Flush the last record; no marker line follows it.
    if parts:
        target.append(" ".join(parts))

# Fill the global text lists: the positive-class file first, then the negative one
read_users(pos_path, pos=True)
read_users(neg_path, pos=False)

# Embed every user text with BERT, one text per call
incel_embeddings = [
    get_bert_embedding(user_text, bert_model, tokenizer)
    for user_text in incel_texts
]
non_incel_embeddings = [
    get_bert_embedding(user_text, bert_model, tokenizer)
    for user_text in non_incel_texts
]

# One label per embedding: 1 for the positive class, 0 for the negative class
incel_labels = [1 for _ in incel_embeddings]
non_incel_labels = [0 for _ in non_incel_embeddings]

# Stack features and labels in the same order (positives first)
X = np.vstack((incel_embeddings, non_incel_embeddings))
y = np.array([*incel_labels, *non_incel_labels])

# Hold out 20% of the samples for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features using statistics estimated on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define ML models
def ML_model(model='Regression'):
    """Build the classifier named by ``model`` and fit it on the training split.

    Args:
        model: Name of the classifier to build. One of ``'Regression'``,
            ``'SVM'``, ``'DecisionTree'``, ``'RandomForest'`` or ``'KNN'``.

    Returns:
        The estimator after fitting on the module-level ``X_train_scaled``
        and ``y_train``.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # Factories rather than instances, so only the requested estimator is
    # ever constructed.
    factories = {
        'Regression': lambda: LogisticRegression(max_iter=1000),
        # NOTE(review): per the scikit-learn docs gamma only applies to the
        # rbf/poly/sigmoid kernels; kept so the hyper-parameters are unchanged.
        'SVM': lambda: SVC(kernel='linear', C=0.1, gamma=1, probability=True),
        'DecisionTree': lambda: DecisionTreeClassifier(),
        'RandomForest': lambda: RandomForestClassifier(
            n_estimators=150,
            max_depth=20,
            min_samples_split=2,
            min_samples_leaf=2,
        ),
        'KNN': lambda: KNeighborsClassifier(n_neighbors=5),
    }

    # Fail early with a clear message. Previously an unknown name fell through
    # every branch and crashed later with "'str' object has no attribute 'fit'".
    if model not in factories:
        supported = ", ".join(sorted(factories))
        raise ValueError(f"Unknown model {model!r}; expected one of: {supported}")

    estimator = factories[model]()
    estimator.fit(X_train_scaled, y_train)
    return estimator

# Fit each classifier in turn and print its metrics on the held-out test split
all_models = ['Regression', 'SVM', 'DecisionTree', 'KNN', 'RandomForest']
for model_name in all_models:
    model = ML_model(model=model_name)
    y_pred = model.predict(X_test_scaled)

    # Headline numbers: plain accuracy and the macro-averaged F1 score
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"Model: {model_name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

    # Per-class breakdown, computed right before it is printed
    report = classification_report(y_test, y_pred)
    print(f"Classification Report for {model_name}:\n{report}")

    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for {model_name}:\n{matrix}")


  from .autonotebook import tqdm as notebook_tqdm


Device: cpu


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 