In [8]:
import pandas as pd
import ast
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score

# Filnavne
TRAIN_FILE = "X_train.csv"
TEST_FILE = "X_test.csv"
# VAL_FILE = "val.csv"  # Ikke brugt i dette script

CHUNKSIZE = 100000  # Tilpas hvis nødvendigt

# Hjælpefunktion
def map_type(x):
    x = x.lower()
    return 0 if x in ['fake', 'satire', 'conspiracy', 'bias'] else 1

accepted_types = ['fake', 'satire', 'bias', 'conspiracy', 'clickbait', 'reliable', 'political']

# --- 1. Fit TF-IDF på et lille sample ---
sample_df = pd.read_csv(TRAIN_FILE, nrows=100000)
sample_df = sample_df[sample_df['type'].notna()]
sample_df = sample_df[sample_df['type'].str.lower().isin(accepted_types)]
sample_df['content'] = sample_df['content'].apply(ast.literal_eval)
sample_df['text'] = sample_df['content'].apply(lambda x: " ".join(x))

vectorizer = TfidfVectorizer(
    max_features=1000000,
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.6
)
vectorizer.fit(sample_df['text'])

# --- 2. Træn SVM (SGD) i chunks ---
model = SGDClassifier(loss='hinge', max_iter=5, random_state=42)
first_batch = True

reader = pd.read_csv(TRAIN_FILE, chunksize=CHUNKSIZE)

for i, chunk in enumerate(reader):
    print(f"✅ Træner på chunk {i + 1}")
    chunk = chunk[chunk['type'].notna()]
    chunk = chunk[chunk['type'].str.lower().isin(accepted_types)]
    chunk['content'] = chunk['content'].apply(ast.literal_eval)
    chunk['text'] = chunk['content'].apply(lambda x: " ".join(x))
    chunk['label'] = chunk['type'].apply(map_type)

    X = vectorizer.transform(chunk['text'])
    y = chunk['label']

    if first_batch:
        model.partial_fit(X, y, classes=np.array([0, 1]))
        first_batch = False
    else:
        model.partial_fit(X, y)

print("Træning færdig!")

# --- 3. Test modellen på hele test.csv ---
test_df = pd.read_csv(TEST_FILE)
test_df = test_df[test_df['type'].notna()]
test_df = test_df[test_df['type'].str.lower().isin(accepted_types)]
test_df['content'] = test_df['content'].apply(ast.literal_eval)
test_df['text'] = test_df['content'].apply(lambda x: " ".join(x))
test_df['label'] = test_df['type'].apply(map_type)

X_test = vectorizer.transform(test_df['text'])
y_test = test_df['label']
y_pred = model.predict(X_test)

# --- 4. Evaluering ---
print("=== Evaluering på test.csv ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))



  sample_df = pd.read_csv(TRAIN_FILE, nrows=100000)
  for i, chunk in enumerate(reader):


✅ Træner på chunk 1


  for i, chunk in enumerate(reader):


✅ Træner på chunk 2


  for i, chunk in enumerate(reader):


✅ Træner på chunk 3


  for i, chunk in enumerate(reader):


✅ Træner på chunk 4


  for i, chunk in enumerate(reader):


✅ Træner på chunk 5
✅ Træner på chunk 6


  for i, chunk in enumerate(reader):


✅ Træner på chunk 7


  for i, chunk in enumerate(reader):


✅ Træner på chunk 8
Træning færdig!


  test_df = pd.read_csv(TEST_FILE)


=== Evaluering på test.csv ===
              precision    recall  f1-score   support

           0       0.89      0.79      0.84     35012
           1       0.85      0.92      0.88     44010

    accuracy                           0.86     79022
   macro avg       0.87      0.86      0.86     79022
weighted avg       0.87      0.86      0.86     79022

Accuracy: 0.863114069499633
Recall: 0.9200636219041127
F1 Score: 0.882169039552946
