In [1]:
# Standard libraries
import re
import sys
import warnings
import pickle

# Data manipulation and visualization
import pandas as pd
import numpy as np

# Text processing
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine learning models and metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Hyperparameter tuning
import optuna

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
sys.path.append("..")

# Local modules
from utilities.download_dataset import download_dataset
from utilities.update_model_metrics import update_model_metrics 
from utilities.test_process import text_process


In [2]:
# Toggle for the Optuna search: True re-runs the hyperparameter study,
# False reuses the hard-coded best parameters defined further down.
BAYESIAN_OPTIMIZATION = False

In [3]:
# Fetch the dataset splits; per the log output below, train/valid/test JSONL
# files are written to the ../dataset folder.
download_dataset()

Downloading ../dataset/train.jsonl...


Downloading...
From (original): https://drive.google.com/uc?id=1atKYBZ9Pq3-zT0_BC_nZqMx0W-wjEW01
From (redirected): https://drive.google.com/uc?id=1atKYBZ9Pq3-zT0_BC_nZqMx0W-wjEW01&confirm=t&uuid=f4ceac3f-70ac-4a5b-80e2-94c81448180f
To: /Users/wojciech.neuman/Documents/ai-content-detectors/dataset/train.jsonl
100%|██████████| 292M/292M [00:03<00:00, 81.9MB/s] 


Downloading ../dataset/valid.jsonl...


Downloading...
From: https://drive.google.com/uc?id=1FhT3m_ApKzX615JzshB5-d-j6S91-6oz
To: /Users/wojciech.neuman/Documents/ai-content-detectors/dataset/valid.jsonl
100%|██████████| 55.1M/55.1M [00:02<00:00, 27.0MB/s]


Downloading ../dataset/test.jsonl...


Downloading...
From: https://drive.google.com/uc?id=16p0td9GgJRb9AP8i4HlX-xZGI2u849uA
To: /Users/wojciech.neuman/Documents/ai-content-detectors/dataset/test.jsonl
100%|██████████| 39.1M/39.1M [00:02<00:00, 17.9MB/s]

All files have been downloaded and saved in the '../dataset' folder.





In [4]:
# Paths to the train and test splits (JSON Lines files under ../dataset)
train_file_path = '../dataset/train.jsonl'
test_file_path = '../dataset/test.jsonl'

In [5]:
# Both splits are JSON Lines files (one record per line), hence lines=True.
train_df, test_df = (
    pd.read_json(split_path, lines=True)
    for split_path in (train_file_path, test_file_path)
)

In [6]:
# Column dtypes and non-null counts of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101284 entries, 0 to 101283
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   uid     101284 non-null  object
 1   text    101284 non-null  object
 2   extra   101284 non-null  object
 3   source  101284 non-null  object
 4   label   101284 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 3.9+ MB


In [7]:
# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,uid,text,extra,source,label
0,[urlsf_subset00]-[15],The dangers of Illinois as a ‘right to work’ s...,"{'source': 'openweb', 'variant': 'original'}",openweb,0
1,[urlsf_subset00]-[15],"The governor of Illinois, Gov. Rauner, has req...","{'source': 'chatgpt', 'variant': 'original'}",chatgpt,1
2,[urlsf_subset00]-[83],Check current weather conditions\n\nIt’s going...,"{'source': 'openweb', 'variant': 'original'}",openweb,0
3,[urlsf_subset00]-[83],Check current weather conditions It’s going to...,"{'variant': 'original', 'source': 'llama'}",llama,1
4,[urlsf_subset00]-[89],"On Thursday, the president of the United State...","{'source': 'openweb', 'variant': 'original'}",openweb,0


In [8]:
# Run every document of both splits through the project's text_process helper.
# NOTE(review): the 'text' column is overwritten in place, so re-running this
# cell feeds already-processed text back into text_process — confirm the helper
# is idempotent, or reload the data before re-running.
train_df['text'] = train_df['text'].apply(text_process)
test_df['text'] = test_df['text'].apply(text_process)

In [9]:
# Features are the processed documents, targets are the integer labels.
X_train, y_train = train_df['text'], train_df['label']
X_test, y_test = test_df['text'], test_df['label']

In [10]:
# instantiate the TF-IDF vectorizer (default settings)
vect = TfidfVectorizer()

# fit on the training text only, then transform X_train to create the
# document-term matrix; the same fitted vectorizer is reused for the test set
X_train_dtm = vect.fit_transform(X_train)

In [11]:
# Sanity check: container type and shape of the training document-term matrix
print(type(X_train_dtm), X_train_dtm.shape)

<class 'scipy.sparse._csr.csr_matrix'> (101284, 547607)


## Train model

#### Hyperparameter Optimization with Optuna

In [12]:
# Create sample dataset for hyperparameter tuning
X_train_sample = X_train_dtm[:5000]
y_train_sample = y_train[:5000]

In [13]:
def objective(trial):
    """Optuna objective: mean 5-fold F1 of a LinearSVC on the tuning sample.

    The candidate estimator deliberately matches the final model of this
    notebook (a bare ``LinearSVC`` fitted on the unscaled TF-IDF matrix).
    Tuning a different estimator (``SVC(kernel='linear')`` behind a
    ``StandardScaler``) would search over a different solver, loss and
    feature scale, so the selected ``C``/``tol`` would not transfer, and
    ``shrinking`` is not a ``LinearSVC`` parameter at all.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C``, ``tol`` and ``max_iter``.

    Returns
    -------
    float
        Mean F1 score across the 5 cross-validation folds.
    """
    # Hyperparameter search space (only parameters LinearSVC actually accepts)
    C = trial.suggest_float('C', 1e-1, 100, log=True)
    tol = trial.suggest_float('tol', 1e-5, 1e-1, log=True)
    max_iter = trial.suggest_int('max_iter', 100, 10000, step=100)

    # Same estimator class, preprocessing and seed as the final fit
    candidate = LinearSVC(C=C, tol=tol, max_iter=max_iter, random_state=42)

    # Cross-validation to evaluate performance
    scores = cross_val_score(candidate, X_train_sample, y_train_sample, cv=5, scoring='f1')

    return scores.mean()

# Seeded TPE sampler so repeated studies propose the same trial sequence
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(study_name='SVM-Linear-Bayesian-Optimization', direction='maximize', sampler=sampler)

[I 2025-01-23 12:36:49,527] A new study created in memory with name: SVM-Linear-Bayesian-Optimization


In [14]:
if not BAYESIAN_OPTIMIZATION:
    # Reuse the best parameters recorded from an earlier Bayesian optimization run
    best_params = {
        'C': 0.433,
        'tol': 1.736e-5,
        'max_iter': 6000,
        'shrinking': True,
    }
else:
    # Run the Bayesian optimization (50 trials) and take the study's best trial
    study.optimize(objective, n_trials=50)
    best_params = study.best_params

print("Best parameters:", best_params)

Best parameters: {'C': 0.433, 'tol': 1.736e-05, 'max_iter': 6000, 'shrinking': True}


In [15]:
svm_linear = LinearSVC(C=best_params['C'], tol=best_params['tol'], max_iter=best_params['max_iter'], random_state=42)

%time svm_linear.fit(X_train_dtm, y_train)

CPU times: user 1.33 s, sys: 71.1 ms, total: 1.4 s
Wall time: 1.48 s


## Test model

In [16]:
# transform X_test into a document-term matrix, reusing the vectorizer
# fitted on the training text (transform only, no refit)
X_test_dtm = vect.transform(X_test)

# sanity check: container type and shape of the test matrix
print(type(X_test_dtm), X_test_dtm.shape)

# make class predictions for X_test_dtm
y_pred = svm_linear.predict(X_test_dtm)

<class 'scipy.sparse._csr.csr_matrix'> (13952, 547607)


## Calculate metrics and update results file

In [17]:
# Report each test-set metric as "<label> <value>", in a fixed order
for _metric_label, _metric_fn in (
    ('Accuracy: ', accuracy_score),
    ('F1 score: ', f1_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
):
    print(_metric_label, _metric_fn(y_test, y_pred))

Accuracy:  0.8640338302752294
F1 score:  0.8615631613515288
Precision:  0.8775085476438234
Recall:  0.8461869266055045


In [18]:
# Store the predictions next to the true labels and flag the rows that match
test_df['pred_label'] = y_pred
test_df['correct_prediction'] = test_df['label'].eq(test_df['pred_label'])

In [19]:
# Write the per-row test predictions to CSV (row index omitted)
test_df.to_csv('../results/svm_test_results.csv', index=False)

In [20]:
# Load the shared metrics table and pass it through update_model_metrics
# for the 'SVM' entry
all_results = update_model_metrics(
    pd.read_csv('../results/metrics_results.csv'),
    'SVM',
    y_test,
    y_pred,
)

# Write the updated table back to the same CSV file
all_results.to_csv('../results/metrics_results.csv', index=False)

# Display the updated table
all_results

Unnamed: 0,model,accuracy,precision,recall,f1_score
0,Logistic Regression,0.885,0.872,0.904,0.887
1,SVM,0.864,0.878,0.846,0.862
2,CNN,0.902,0.885,0.924,0.904
3,BiLSTM (RNN),0.935,0.954,0.914,0.934
4,BERT,0.963,0.952,0.974,0.963


## Save model

In [21]:
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails (the bare open() was never closed).
filename = '../saved_models/svm_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(svm_linear, model_file)

# The classifier only accepts vectors produced by the fitted TF-IDF vectorizer
# (same vocabulary and IDF weights), so store it next to the model; without it
# the saved model cannot be applied to new text.
vectorizer_filename = '../saved_models/svm_vectorizer.pkl'
with open(vectorizer_filename, 'wb') as vectorizer_file:
    pickle.dump(vect, vectorizer_file)