In [None]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score


# Load cleaned data
df = pd.read_csv("test_cleaned_file.csv")

# Filter rows BEFORE parsing the token lists: rows that are about to be
# discarded are never parsed, and a missing 'content' cell (NaN) can no longer
# make ast.literal_eval raise.
accepted_types = ['fake', 'satire',  'bias', 'conspiracy', 'clickbait','reliable', 'political']
type_is_accepted = df['type'].notna() & df['type'].str.lower().isin(accepted_types)
content_is_present = df['content'].notna()

# .copy() yields an independent frame, so columns added to `df` further down
# are written to a real DataFrame rather than to a slice of the raw data.
df = df[type_is_accepted & content_is_present].copy()

# Convert stringified lists back to lists
df['content'] = df['content'].apply(ast.literal_eval)

def map_type(x):
    """Map a source 'type' value to a binary class label.

    Matching is case-insensitive.

    Parameters
    ----------
    x : str
        Category name taken from the 'type' column.

    Returns
    -------
    int or None
        0 for 'fake', 'satire', 'conspiracy' and 'bias';
        1 for 'reliable', 'political' and 'clickbait';
        None for any other value, including non-string input such as
        None or NaN (which previously raised AttributeError on .lower()).
    """
    # Non-string cells (e.g. NaN from a missing CSV field) are "unknown".
    if not isinstance(x, str):
        return None

    x = x.lower()
    if x in ('fake', 'satire', 'conspiracy', 'bias'):
        return 0
    if x in ('reliable', 'political', 'clickbait'):
        return 1
    return None

# Attach the binary target derived from the 'type' column
df = df.assign(label=df['type'].map(map_type))

# Discard documents whose token list is empty
df = df.loc[df['content'].map(len) > 0]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X, y = df['content'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def _identity(tokens):
    """Return the input unchanged; documents arrive as ready-made token lists."""
    return tokens


# Define TF-IDF vectorizer for tokenized input.
# A named function is used instead of a lambda because lambdas cannot be
# serialised by the standard pickle module, which would prevent saving a
# fitted pipeline. (To load such a pickle, `_identity` must be defined in the
# loading session as well.)
tfidf = TfidfVectorizer(
    tokenizer=_identity,      # each document is already a list of tokens
    preprocessor=_identity,   # overrides built-in preprocessing (incl. lowercasing)
    token_pattern=None,       # unused with a custom tokenizer; None avoids the warning
    max_df=0.60,              # ignore terms present in more than 60% of documents
    min_df=5,                 # ignore terms present in fewer than 5 documents
    ngram_range=(1, 2)        # unigrams and bigrams
)

# === SVM Pipeline ===
# Baseline: TF-IDF features fed into a linear-kernel SVM.
# probability=True enables predict_proba but slows down fit, because SVC runs an
# internal 5-fold cross-validation to calibrate the probabilities. That
# calibration shuffles the data, so random_state is fixed to keep the
# probability estimates reproducible between runs.
svm_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('clf', SVC(kernel='linear', C=1.0, probability=True, random_state=42))
])

# Fit on the training split only; the vectorizer vocabulary is learned here too
svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)

# Report hold-out metrics (recall and F1 are computed for the positive class, label 1)
print("=== SVM Model Results ===")
print(classification_report(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))


=== SVM Model Results ===
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      1578
           1       0.86      0.88      0.87      1680

    accuracy                           0.86      3258
   macro avg       0.86      0.86      0.86      3258
weighted avg       0.86      0.86      0.86      3258

Recall: 0.875
Accuracy: 0.8612645794966237
F1 Score: 0.8667452830188679


In [5]:
from sklearn.model_selection import GridSearchCV

# === SVM Pipeline ===
# Same structure as the baseline pipeline; C is left unset here because it is
# supplied by the grid below. `tfidf` is the vectorizer defined in the earlier cell.
# random_state seeds the internal shuffling SVC performs for its probability
# calibration (probability=True), keeping predict_proba reproducible.
svm_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('clf', SVC(kernel='linear', probability=True, random_state=42))
])

# === Grid search parameters ===
# 4 values of C x 3 n-gram ranges = 12 candidates; class_weight and min_df are
# held fixed (single-element lists).
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__class_weight': ['balanced'],
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidf__min_df': [5]
}

# === GridSearchCV ===
# Candidates are ranked by F1 over 5-fold cross-validation on the training
# split; n_jobs=-1 uses all available CPU cores.
grid = GridSearchCV(
    svm_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit grid search
grid.fit(X_train, y_train)

# Use the best model (refit on the full training split by GridSearchCV)
best_model = grid.best_estimator_
y_pred_svm = best_model.predict(X_test)

# Evaluate: cross-validated score of the winning candidate, then hold-out metrics
print("=== SVM Model with GridSearchCV ===")
print("Best parameters:", grid.best_params_)
print("F1 Score (CV-best):", grid.best_score_)
print(classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
=== SVM Model with GridSearchCV ===
Best parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 3)}
F1 Score (CV-best): 0.8536424625395853
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      1578
           1       0.86      0.87      0.87      1680

    accuracy                           0.86      3258
   macro avg       0.86      0.86      0.86      3258
weighted avg       0.86      0.86      0.86      3258

Accuracy: 0.8634131368937998
F1 Score: 0.8682262363044122
