In [3]:
import pandas as pd
import ast
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Parameters
CHUNKSIZE = 100000  # Tune this if needed
FILE = "cleaned_file.csv"

# Step 1: Prepare TF-IDF vocabulary on a small sample
sample_df = pd.read_csv(FILE, nrows=50000)
sample_df['content'] = sample_df['content'].apply(ast.literal_eval)
sample_df['text'] = sample_df['content'].apply(lambda x: " ".join(x))

# Filter and label
accepted_types = ['fake', 'satire', 'bias', 'conspiracy', 'clickbait', 'reliable', 'political']
sample_df = sample_df[sample_df['type'].notna()]
sample_df = sample_df[sample_df['type'].str.lower().isin(accepted_types)]

def map_type(x):
    x = x.lower()
    return 0 if x in ['fake', 'satire', 'conspiracy', 'bias'] else 1

sample_df['label'] = sample_df['type'].apply(map_type)

# Fit vectorizer on sample
vectorizer = TfidfVectorizer(
    max_features=250000,
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.6
)
vectorizer.fit(sample_df['text'])

# Step 2: Train SVM (SGDClassifier) in chunks
model = SGDClassifier(loss='hinge', max_iter=5)

first_batch = True

reader = pd.read_csv(FILE, chunksize=CHUNKSIZE)
for i, chunk in enumerate(reader):
    print(f"Processing chunk {i + 1}")
    chunk = chunk[chunk['type'].notna()]
    chunk = chunk[chunk['type'].str.lower().isin(accepted_types)]
    chunk['content'] = chunk['content'].apply(ast.literal_eval)
    chunk['text'] = chunk['content'].apply(lambda x: " ".join(x))
    chunk['label'] = chunk['type'].apply(map_type)

    X = vectorizer.transform(chunk['text'])
    y = chunk['label']

    if first_batch:
        model.partial_fit(X, y, classes=np.array([0, 1]))
        first_batch = False
    else:
        model.partial_fit(X, y)

print("✅ Finished training on all chunks")

# Step 3: Final evaluation on a held-out test set
# Sample again for final test
eval_df = pd.read_csv(FILE, skiprows=range(1, 800001), nrows=50000)
eval_df = eval_df[eval_df['type'].notna()]
eval_df = eval_df[eval_df['type'].str.lower().isin(accepted_types)]
eval_df['content'] = eval_df['content'].apply(ast.literal_eval)
eval_df['text'] = eval_df['content'].apply(lambda x: " ".join(x))
eval_df['label'] = eval_df['type'].apply(map_type)

X_test = vectorizer.transform(eval_df['text'])
y_test = eval_df['label']
y_pred = model.predict(X_test)

print("=== Chunkwise SVM Results ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


  sample_df = pd.read_csv(FILE, nrows=50000)
  for i, chunk in enumerate(reader):


Processing chunk 1


  for i, chunk in enumerate(reader):


Processing chunk 2
Processing chunk 3


  for i, chunk in enumerate(reader):


Processing chunk 4


  for i, chunk in enumerate(reader):


Processing chunk 5


  for i, chunk in enumerate(reader):


Processing chunk 6


  for i, chunk in enumerate(reader):


Processing chunk 7
Processing chunk 8


  for i, chunk in enumerate(reader):


Processing chunk 9


  for i, chunk in enumerate(reader):


Processing chunk 10
✅ Finished training on all chunks


  eval_df = pd.read_csv(FILE, skiprows=range(1, 800001), nrows=50000)


=== Chunkwise SVM Results ===
              precision    recall  f1-score   support

           0       0.88      0.79      0.84     15551
           1       0.86      0.92      0.89     21022

    accuracy                           0.87     36573
   macro avg       0.87      0.86      0.86     36573
weighted avg       0.87      0.87      0.87     36573

Accuracy: 0.868482213654882
Recall: 0.9228427361811435
F1 Score: 0.8897041962852557


In [None]:
import pandas as pd
import ast
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score

# Filnavne
TRAIN_FILE = "X_train.csv"
TEST_FILE = "X_test.csv"
# VAL_FILE = "val.csv"  # Ikke brugt i dette script

CHUNKSIZE = 100000  # Tilpas hvis nødvendigt

# Hjælpefunktion
def map_type(x):
    x = x.lower()
    return 0 if x in ['fake', 'satire', 'conspiracy', 'bias'] else 1

accepted_types = ['fake', 'satire', 'bias', 'conspiracy', 'clickbait', 'reliable', 'political']

# --- 1. Fit TF-IDF på et lille sample ---
sample_df = pd.read_csv(TRAIN_FILE, nrows=100000)
sample_df = sample_df[sample_df['type'].notna()]
sample_df = sample_df[sample_df['type'].str.lower().isin(accepted_types)]
sample_df['content'] = sample_df['content'].apply(ast.literal_eval)
sample_df['text'] = sample_df['content'].apply(lambda x: " ".join(x))

vectorizer = TfidfVectorizer(
    max_features=250000,
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.6
)
vectorizer.fit(sample_df['text'])

# --- 2. Træn SVM (SGD) i chunks ---
model = SGDClassifier(loss='hinge', max_iter=5, random_state=42)
first_batch = True

reader = pd.read_csv(TRAIN_FILE, chunksize=CHUNKSIZE)

for i, chunk in enumerate(reader):
    print(f"✅ Træner på chunk {i + 1}")
    chunk = chunk[chunk['type'].notna()]
    chunk = chunk[chunk['type'].str.lower().isin(accepted_types)]
    chunk['content'] = chunk['content'].apply(ast.literal_eval)
    chunk['text'] = chunk['content'].apply(lambda x: " ".join(x))
    chunk['label'] = chunk['type'].apply(map_type)

    X = vectorizer.transform(chunk['text'])
    y = chunk['label']

    if first_batch:
        model.partial_fit(X, y, classes=np.array([0, 1]))
        first_batch = False
    else:
        model.partial_fit(X, y)

print("Træning færdig!")

# --- 3. Test modellen på hele test.csv ---
test_df = pd.read_csv(TEST_FILE)
test_df = test_df[test_df['type'].notna()]
test_df = test_df[test_df['type'].str.lower().isin(accepted_types)]
test_df['content'] = test_df['content'].apply(ast.literal_eval)
test_df['text'] = test_df['content'].apply(lambda x: " ".join(x))
test_df['label'] = test_df['type'].apply(map_type)

X_test = vectorizer.transform(test_df['text'])
y_test = test_df['label']
y_pred = model.predict(X_test)

# --- 4. Evaluering ---
print("=== Evaluering på test.csv ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))



  sample_df = pd.read_csv(TRAIN_FILE, nrows=100000)
