In [None]:
import kagglehub
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Datensatz herunterladen

In [None]:
# Fetch the Kaggle dataset through kagglehub; the returned value is used below
# as the directory that contains the CSV files, so print it for reference.
path_to_dataset = kagglehub.dataset_download(
    "urbanbricks/wikipedia-promotional-articles"
)
print(path_to_dataset)

## Dateien lesen

In [None]:
# Sampling configuration: fraction of each CSV to keep and the seed that makes
# the sample reproducible.
frac = 0.05
random_state = 42

# Load both classes and immediately down-sample them with the same fraction/seed.
good_df = pd.read_csv(os.path.join(path_to_dataset, 'good.csv')).sample(frac=frac, random_state=random_state)
promo_df = pd.read_csv(os.path.join(path_to_dataset, 'promotional.csv')).sample(frac=frac, random_state=random_state)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Good:\n")
good_df.info()
print("\nPromo:\n")
promo_df.info()

## Daten kombinieren

In [None]:
# Tag every sampled row with its class; these string labels become the target.
good_df['label'] = 'good'
promo_df['label'] = 'promotional'

# ignore_index=True: each sampled frame keeps the row labels of its own CSV, so
# a plain concat may leave duplicate index values in the combined frame.
df = pd.concat([good_df, promo_df], ignore_index=True)

# Keep only the model input and the target column.
df = df[['text', 'label']]

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; print(df.info()) would emit an extra "None" line.
df.info()

## Textdaten in numerische Merkmale umwandeln

In [None]:
# Convert the article text into TF-IDF features: English stop words are removed
# and terms occurring in more than 70% of the documents are ignored.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary and IDF weights also see the test
# documents - consider fitting on the training portion only.
tfidf = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)
y = df['label']
X = tfidf.fit_transform(df['text'])

## Train-Test-Split

In [None]:
# Hold out 20% of the samples for the final evaluation.
# stratify=y keeps the good/promotional ratio identical in both partitions, and
# the seed comes from the `random_state` constant defined with the sampling
# configuration instead of a second hard-coded 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
    stratify=y,
)

## Hyperparameter-Tuning (Grid Search)

In [None]:
def tune_grid_search(X_train, y_train, model):
    """Tune the ``C`` parameter of ``model`` with a cross-validated grid search.

    Six values of ``C`` (0.01 up to 1000, in powers of ten) are compared using
    5-fold cross-validation scored by accuracy, with all available cores
    (``n_jobs=-1``). The winning ``C`` and its cross-validation score are
    printed.

    Args:
        X_train: Training feature matrix passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.
        model: Estimator exposing a ``C`` parameter (the callers pass ``SVC``
            instances).

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000]},
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    chosen_c, cv_score = search.best_params_['C'], search.best_score_
    print(f"Best param: {chosen_c}")
    print(f"Best score: {cv_score}")
    return search.best_estimator_

## Evaluations-Methode

In [None]:
def evaluate_model(X_test, y_test, model):
    """Print hold-out metrics for a fitted classifier.

    Predicts labels for ``X_test`` and prints, in this order, the confusion
    matrix, the classification report and the accuracy, each under its own
    heading.

    Args:
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True labels the predictions are compared against.
        model: Fitted estimator exposing ``predict``.

    Returns:
        None; the results are only printed.
    """
    predictions = model.predict(X_test)

    # Heading / metric pairs, printed in this order.
    sections = (
        ("Confusion Matrix:", confusion_matrix),
        ("\nClassification Report:", classification_report),
        ("\nAccuracy:", accuracy_score),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

## Linearer Kernel

In [None]:
# Untuned alternative, kept disabled for reference:
#   svm_model_linear = SVC(kernel='linear', C=1.0)
#   svm_model_linear.fit(X_train, y_train)
#
# Notes from earlier runs (presumably the best C per sampling fraction):
#   frac=0.0  => C=1    NOTE(review): probably frac=0.01, as in the other kernel cells - confirm
#   frac=0.05 => C=10

# Grid-search C for a linear-kernel SVC and keep the best estimator.
svm_model_linear = tune_grid_search(
    X_train=X_train,
    y_train=y_train,
    model=SVC(kernel='linear'),
)

In [None]:
# Print confusion matrix, classification report and accuracy of the tuned
# linear-kernel SVC on the held-out test split.
evaluate_model(X_test=X_test, y_test=y_test, model=svm_model_linear)

## RBF-Kernel

In [None]:
# Untuned alternative, kept disabled for reference:
#   svm_model_rbf = SVC(kernel='rbf', C=1.0, gamma='scale')
#   svm_model_rbf.fit(X_train, y_train)
#
# Notes from earlier runs (presumably the best C per sampling fraction):
#   frac=0.01 => C=1
#   frac=0.05 => C=10

# Grid-search C for an RBF-kernel SVC (gamma='scale') and keep the best estimator.
svm_model_rbf = tune_grid_search(
    X_train=X_train,
    y_train=y_train,
    model=SVC(kernel='rbf', gamma='scale'),
)

In [None]:
# Print confusion matrix, classification report and accuracy of the tuned
# RBF-kernel SVC on the held-out test split.
evaluate_model(X_test=X_test, y_test=y_test, model=svm_model_rbf)

## Poly-Kernel

In [None]:
# Untuned alternative, kept disabled for reference:
#   svm_model_poly = SVC(kernel='poly', C=1.0, gamma='scale')
#   svm_model_poly.fit(X_train, y_train)
#
# Notes from earlier runs (presumably the best C per sampling fraction):
#   frac=0.01 => C=1
#   frac=0.05 => C=10

# Grid-search C for a polynomial-kernel SVC (gamma='scale') and keep the best estimator.
svm_model_poly = tune_grid_search(
    X_train=X_train,
    y_train=y_train,
    model=SVC(kernel='poly', gamma='scale'),
)

In [None]:
# Print confusion matrix, classification report and accuracy of the tuned
# polynomial-kernel SVC on the held-out test split.
evaluate_model(X_test=X_test, y_test=y_test, model=svm_model_poly)

## Sigmoid-Kernel

In [None]:
# Untuned alternative, kept disabled for reference:
#   svm_model_sigmoid = SVC(kernel='sigmoid', C=1.0, gamma='scale')
#   svm_model_sigmoid.fit(X_train, y_train)
#
# Notes from earlier runs (presumably the best C per sampling fraction):
#   frac=0.01 => C=1
#   frac=0.05 => C=1

# Grid-search C for a sigmoid-kernel SVC (gamma='scale') and keep the best estimator.
svm_model_sigmoid = tune_grid_search(
    X_train=X_train,
    y_train=y_train,
    model=SVC(kernel='sigmoid', gamma='scale'),
)

In [None]:
# Print confusion matrix, classification report and accuracy of the tuned
# sigmoid-kernel SVC on the held-out test split.
evaluate_model(X_test=X_test, y_test=y_test, model=svm_model_sigmoid)