In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import polars as pl
from datasets import load_dataset
import re
import nltk
from nltk.corpus import stopwords

In [None]:
# Make sure the NLTK stop-word corpus is available locally, then keep the
# English list as a set so the cleaning step can do fast membership checks.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def fast_clean(text: str, stopword_set: "set[str] | None" = None) -> str:
    """Normalise a document into space-separated lowercase word tokens.

    Steps: lowercase the text, blank out URLs (anything starting with
    ``http``) and every character that is not an ASCII letter or whitespace,
    split on whitespace, and drop stop words.

    Args:
        text: Raw document. Empty or ``None`` input yields ``""``.
        stopword_set: Tokens to discard. When omitted, the notebook-level
            ``stop_words`` set is used.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not text:
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    lowered = text.lower()
    # Substitute a space (not the empty string) so that words separated only
    # by punctuation stay separate: "hello,world" -> "hello world", and
    # "don't" -> "don t", whose pieces can then be matched as stop words.
    scrubbed = re.sub(r"http\S+|[^a-z\s]", " ", lowered)
    # Only letters and whitespace remain, so a plain split yields the tokens.
    return " ".join(tok for tok in scrubbed.split() if tok not in stopword_set)

In [None]:
# Download (or load from the local cache) the corpus from the Hugging Face Hub
# and keep only its "train" split.
DATASET_ID = "artem9k/ai-text-detection-pile"
dataset = load_dataset(DATASET_ID)["train"]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Single pass over the split: keep the first occurrence of every non-blank
# text and encode its origin as a binary label (1 when source == "ai", else 0).
seen, texts, labels = set(), [], []
for ex in dataset:
    body = ex["text"]
    # Skip missing/blank texts and exact duplicates of something already kept.
    if not body or not body.strip() or body in seen:
        continue
    seen.add(body)
    texts.append(body)
    labels.append(int(ex["source"] == "ai"))

In [None]:
# Gather the de-duplicated texts and their labels into one polars frame.
df = pl.DataFrame(
    {
        "text": texts,
        "label": labels,
    }
)

In [None]:
# Run the cleaner over every raw text and attach the result as a new column.
cleaned = list(map(fast_clean, df["text"]))
df = df.with_columns(pl.Series("cleaned_text", cleaned))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
corpus = df["cleaned_text"].to_list()
targets = df["label"].to_list()
X_train, X_val, y_train, y_val = train_test_split(
    corpus, targets, test_size=0.2, random_state=42
)

# Unigram TF-IDF with a vocabulary capped at 10,000 terms, sublinear term
# frequency scaling, and float32 output.
tfidf_options = dict(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 1),
    sublinear_tf=True,
    dtype='float32',
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vocabulary and IDF weights on the training texts only, then apply
# the same fitted transform to the validation texts.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Linear baseline on the TF-IDF features, scored on the validation split.
lr = LogisticRegression(max_iter=1000, n_jobs=-1)
lr.fit(X_train_tfidf, y_train)
lr_val_pred = lr.predict(X_val_tfidf)
lr_report = classification_report(y_val, lr_val_pred)
print("Logistic Regression\n", lr_report)


Logistic Regression
               precision    recall  f1-score   support

           0       0.93      0.94      0.94    204218
           1       0.83      0.81      0.82     72951

    accuracy                           0.91    277169
   macro avg       0.88      0.88      0.88    277169
weighted avg       0.91      0.91      0.91    277169



In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters on the same TF-IDF features;
# seeded so the fitted trees are repeatable, using all available cores.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf.fit(X_train_tfidf, y_train)
rf_val_pred = rf.predict(X_val_tfidf)
rf_report = classification_report(y_val, rf_val_pred)
print("RF\n", rf_report)

RF
               precision    recall  f1-score   support

           0       0.89      0.94      0.92    204218
           1       0.81      0.68      0.74     72951

    accuracy                           0.88    277169
   macro avg       0.85      0.81      0.83    277169
weighted avg       0.87      0.88      0.87    277169



In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees on the TF-IDF features, scored on the validation split.
# `use_label_encoder` was dropped from the constructor call: the installed
# XGBoost ignores it and only emits a "Parameters ... are not used" warning.
xgb = XGBClassifier(eval_metric='logloss', n_jobs=-1)
xgb.fit(X_train_tfidf, y_train)
print("XGBoost\n", classification_report(y_val, xgb.predict(X_val_tfidf)))

Parameters: { "use_label_encoder" } are not used.



XGBoost
               precision    recall  f1-score   support

           0       0.89      0.93      0.91    204218
           1       0.79      0.69      0.74     72951

    accuracy                           0.87    277169
   macro avg       0.84      0.81      0.83    277169
weighted avg       0.87      0.87      0.87    277169



In [None]:
! pip install scikit-learn-intelex polars datasets scikit-learn xgboost

Collecting scikit-learn-intelex
  Downloading scikit_learn_intelex-2025.4.0-py311-none-manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting daal==2025.4.0 (from scikit-learn-intelex)
  Downloading daal-2025.4.0-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Downloading scikit_learn_intelex-2025.4.0-py311-none-manylinux_2_28_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading daal-2025.4.0-py2.py3-none-manylinux_2_28_x86_64.whl (112.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.1/112.1 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: daal, scikit-learn-intelex
Successfully installed daal-2025.4.0 scikit-learn-intelex-2025.4.0


In [None]:
# Enable the Intel Extension for Scikit-learn so that supported estimators
# use its accelerated implementations.
# NOTE(review): patching presumably only affects scikit-learn names imported
# after this call (such as the SVC import further down), not the estimators
# imported in earlier cells — confirm against the sklearnex documentation.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [None]:
from sklearn.preprocessing import Normalizer

# Rescale every sample (row) to unit L2 norm before fitting the SVM.
# NOTE(review): the TfidfVectorizer above does not override `norm`, whose
# default is presumably already 'l2', so this step may be a no-op — verify.
scaler = Normalizer(norm='l2')
scaler.fit(X_train_tfidf)
X_train_norm, X_val_norm = (
    scaler.transform(X_train_tfidf),
    scaler.transform(X_val_tfidf),
)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# RBF-kernel SVM on the L2-normalised TF-IDF features.
# probability=True makes the estimator fit an additional, randomised
# calibration step so that predict_proba is available; random_state pins that
# randomness so repeated runs give the same model, matching the seed used for
# the train/validation split and the random forest.
# NOTE(review): if predict_proba is never called later in the notebook,
# probability=True can be dropped to cut training time considerably.
svm = SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
svm.fit(X_train_norm, y_train)

y_pred_svm = svm.predict(X_val_norm)
print("Intel-Optimized SVM\n", classification_report(y_val, y_pred_svm))




Intel-Optimized SVM
               precision    recall  f1-score   support

           0       0.95      0.96      0.96    204218
           1       0.88      0.87      0.88     72951

    accuracy                           0.94    277169
   macro avg       0.92      0.92      0.92    277169
weighted avg       0.94      0.94      0.94    277169

