**Data preparation**
--

In [None]:
from datasets import load_dataset

In [None]:
# Load the "20231101.en" configuration of the wikimedia/wikipedia dataset.
ds = load_dataset(path="wikimedia/wikipedia", name="20231101.en")

In [None]:
# Materialise the "train" split as a pandas DataFrame so the texts can be
# handed to the classifier as one list (faster, but only if you have enough
# RAM to hold the whole split).
# Note: the selection loop further down still reads rows from ds["train"];
# it would need some modifications to read from df_ds instead.
train_split = ds["train"]
df_ds = train_split.to_pandas()

In [None]:
from transformers import pipeline

# Sentiment classifier whose predictions are later used as bias labels.
# truncation=True automatically clips each input to the maximum token
# length, so the maximum amount of text is parsed for every article.
bias_pipeline = pipeline(
    task="text-classification",
    model="finiteautomata/bertweet-base-sentiment-analysis",
    truncation=True,
)

In [None]:
# Classify all article texts with a single pipeline call, which is more
# efficient than running the pipeline sequentially on one text at a time.
# If RAM-constrained, loop over the texts and classify them one by one instead.
article_texts = df_ds["text"].tolist()
bias_results = bias_pipeline(article_texts)

In [None]:
# Build a 10,000-row sample with fixed per-class quotas: walk the classifier
# results in dataset order and keep every article whose predicted class
# still has room.
# Maps the pipeline's label -> (label written to the output, quota).
quotas = {
  "NEU": ("NEUTRAL", 5000),
  "POS": ("POSITIVE", 2500),
  "NEG": ("NEGATIVE", 2500),
}
counts = {label: 0 for label in quotas}

out_ds = {"text": [], "bias_label": []}

i = 0
# Stop when every quota is filled OR the results are exhausted. The bound on
# i prevents an IndexError when the data cannot satisfy all three quotas.
while i < len(bias_results) and any(counts[k] < quotas[k][1] for k in quotas):
  bias = bias_results[i]["label"]

  # Labels outside the quota table, or classes that are already full, are skipped.
  if bias in quotas and counts[bias] < quotas[bias][1]:
    out_label = quotas[bias][0]
    out_ds["text"].append(ds["train"][i]["text"])
    out_ds["bias_label"].append(out_label)
    counts[bias] += 1
  i += 1

# Keep the per-class counters available under their original names.
neutral, positive, negative = counts["NEU"], counts["POS"], counts["NEG"]

# Make a short sample visible instead of failing silently.
unfilled = {k: (counts[k], quotas[k][1]) for k in quotas if counts[k] < quotas[k][1]}
if unfilled:
  print("WARNING: ran out of classified articles before filling quotas (have, wanted):", unfilled)

In [None]:
# openpyxl is necessary to output an Excel spreadsheet
!pip install openpyxl
import pandas as pd
df = pd.DataFrame(out_ds)
df.to_excel("wikipedia_10000_bias_dataset.xlsx")

**EDA & Preprocessing**
--

In [None]:
import pandas as pd

In [None]:
import mimetypes

# Sanity check: which MIME type does the spreadsheet's file name map to?
xlsx_path = "/content/wikipedia_10000_bias_dataset.xlsx"
mimetypes.guess_type(xlsx_path)

('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', None)

In [None]:
import os

# List the directory to confirm the spreadsheet is present.
content_dir = "/content"
os.listdir(content_dir)


['.config',
 '.ipynb_checkpoints',
 'wikipedia_10000_bias_dataset.xlsx',
 'sample_data']

In [None]:
# Load the labelled spreadsheet into a DataFrame.
data = pd.read_excel(io="/content/wikipedia_10000_bias_dataset.xlsx")

In [None]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,bias_label,political_bias
0,0,Anarchism,Anarchism is a political philosophy and moveme...,NEUTRAL,
1,1,Albedo,Albedo (; ) is the fraction of sunlight that i...,NEUTRAL,
2,2,A,"A, or a, is the first letter and the first vow...",NEUTRAL,
3,3,Alabama,Alabama () is a state in the Southeastern regi...,NEUTRAL,
4,4,Achilles,"In Greek mythology, Achilles ( ) or Achilleus ...",POSITIVE,REPUBLICAN


In [None]:
# Distribution of the political_bias column; rows without a value are not counted.
data["political_bias"].value_counts(dropna=True)

Unnamed: 0_level_0,count
political_bias,Unnamed: 1_level_1
REPUBLICAN,2937
DEMOCRAT,2063


In [None]:
# political_bias is dropped here and not used afterwards.
# errors="ignore" keeps this cell re-runnable: executing it again when the
# column is already gone is a no-op instead of a KeyError.
data = data.drop(columns=["political_bias"], errors="ignore")

In [None]:
# "Unnamed: 0" is the old DataFrame index that was saved into the spreadsheet.
# errors="ignore" keeps this cell re-runnable: executing it again when the
# column is already gone is a no-op instead of a KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Number of missing values per column.
data.isna().sum()

Unnamed: 0,0
title,0
text,0
bias_label,0


In [None]:
# Class balance of the target label.
label_counts = data["bias_label"].value_counts()
label_counts

Unnamed: 0_level_0,count
bias_label,Unnamed: 1_level_1
NEUTRAL,5000
POSITIVE,2500
NEGATIVE,2500


In [None]:
# Show the dtype of each remaining column.
data.dtypes

Unnamed: 0,0
title,object
text,object
bias_label,object


**Model Training**
--

**TF-IDF**
--

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Features are the raw article text; the target is the bias label.
X = data["text"]
y = data["bias_label"]

# Train-test split: hold out 20% of the rows; the fixed seed keeps it reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train samples:", len(X_train_text))
print("Test samples:", len(X_test_text))

# Create the TF-IDF vectorizer: unigrams + bigrams, at most 50,000 features,
# English stop words removed.
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    stop_words="english"
)

# Fit ONLY on training text (to avoid data leakage) and vectorise it in the
# same pass: fit_transform gives the same matrix as fit() followed by
# transform(), without tokenising the training corpus a second time.
X_train_tfidf = tfidf.fit_transform(X_train_text)

# The test text is only transformed with the already-fitted vectorizer.
X_test_tfidf = tfidf.transform(X_test_text)

print("TF-IDF train shape:", X_train_tfidf.shape)
print("TF-IDF test shape:", X_test_tfidf.shape)

# Peek at a few learned terms (dict order, i.e. order of insertion).
vocab = list(tfidf.vocabulary_)
print("Sample vocabulary words:", vocab[:20])

Train samples: 8000
Test samples: 2000
TF-IDF train shape: (8000, 50000)
TF-IDF test shape: (2000, 50000)
Sample vocabulary words: ['prison', 'camp', 'split', 'croatia', 'active', '1992', '1997', 'mainly', 'serbian', 'residents', 'prisoners', 'war', 'imprisoned', 'croatian', 'independence', 'site', 'human', 'rights', 'abuses', 'resulting']


**LR**
--

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic-regression baseline on the TF-IDF features.
# class_weight="balanced" is used because NEUTRAL has twice as many rows as
# each of the other two classes.
# multi_class is deliberately NOT passed: "auto" is already the default, and
# setting it explicitly only triggers a deprecation warning in recent
# scikit-learn releases.
clf = LogisticRegression(
    class_weight="balanced",
    max_iter=2000,
    n_jobs=-1
)

# Train on TF-IDF features
clf.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test set
y_pred = clf.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

    NEGATIVE       0.81      0.80      0.81       478
     NEUTRAL       0.79      0.75      0.77      1004
    POSITIVE       0.63      0.71      0.67       518

    accuracy                           0.75      2000
   macro avg       0.75      0.75      0.75      2000
weighted avg       0.76      0.75      0.75      2000



**SVC**
--

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Linear SVM on the same TF-IDF features.
# random_state fixes the solver's data shuffling so the report below is
# reproducible across runs; 42 matches the seed used for the train/test split.
svm = LinearSVC(class_weight="balanced", random_state=42)

svm.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test set
y_pred_svm = svm.predict(X_test_tfidf)

print(classification_report(y_test, y_pred_svm))


              precision    recall  f1-score   support

    NEGATIVE       0.83      0.80      0.82       478
     NEUTRAL       0.75      0.80      0.78      1004
    POSITIVE       0.66      0.61      0.63       518

    accuracy                           0.75      2000
   macro avg       0.75      0.74      0.74      2000
weighted avg       0.75      0.75      0.75      2000



In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent.
# random_state fixes the shuffling of the training data between epochs so
# the report below is reproducible; 42 matches the seed used for the split.
sgd = SGDClassifier(
    loss="log_loss",       # or "hinge" for SVM-like
    class_weight="balanced",
    max_iter=2000,
    tol=1e-3,
    random_state=42
)

sgd.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test set
y_pred_sgd = sgd.predict(X_test_tfidf)

print(classification_report(y_test, y_pred_sgd))


              precision    recall  f1-score   support

    NEGATIVE       0.84      0.77      0.80       478
     NEUTRAL       0.76      0.79      0.78      1004
    POSITIVE       0.65      0.64      0.64       518

    accuracy                           0.75      2000
   macro avg       0.75      0.74      0.74      2000
weighted avg       0.75      0.75      0.75      2000



**Calibrated LR**
--

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

# Settings for the (still unfitted) base logistic regression.
# multi_class is intentionally left at its default to avoid the warning.
lr_params = {
    "class_weight": "balanced",
    "max_iter": 2000,
    "n_jobs": -1,  # use all cores
}
base_lr = LogisticRegression(**lr_params)

# Probability calibration around the base model:
# isotonic regression ("sigmoid" is faster if isotonic is too slow),
# with 5-fold internal cross-validation.
calib_lr = CalibratedClassifierCV(estimator=base_lr, method="isotonic", cv=5)

# Fit on the TF-IDF training features and labels.
calib_lr.fit(X_train_tfidf, y_train)

# Score the held-out test set and print the per-class report.
y_pred_calib = calib_lr.predict(X_test_tfidf)
calib_report = classification_report(y_test, y_pred_calib)
print(calib_report)


              precision    recall  f1-score   support

    NEGATIVE       0.83      0.77      0.80       478
     NEUTRAL       0.75      0.80      0.77      1004
    POSITIVE       0.65      0.62      0.64       518

    accuracy                           0.74      2000
   macro avg       0.74      0.73      0.74      2000
weighted avg       0.74      0.74      0.74      2000

