In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import joblib
import time

Aggregated Categories Excel file

In [None]:
# Load the Aggregated Categories Excel file
df2 = pd.read_excel('/content/drive/MyDrive/DAT620/Categories_manual_adapted.xlsx')

df2.columns = ["category", "count", "cat-l2", "cat-l3"]

print(df2.head())

     category  count      cat-l2      cat-l3
0        news  49484         NaN         NaN
1        free  47348         NaN         NaN
2     politik  39700     politik     politik
3       sport  32455       sport       sport
4  wirtschaft  31184  wirtschaft  wirtschaft


News data file converted in CSV

In [None]:
df = pd.read_csv('/content/drive/MyDrive/DAT620/data.csv')

df.columns = ["new_number", "news_text", "category"]
print(df.head())

               new_number                                          news_text  \
0  NID_2023-01-19-02_3182  Forscher sehen enorm Aufholbedarf bei CO2 Entn...   
1  NID_2023-01-19-04_5957  Museum Winsen Wie ein klein Museum groß Idee u...   
2  NID_2023-01-19-04_6983  quartier in Harburg der neu Kümmerer beziehen ...   
3  NID_2023-01-19-04_3777  Grundsteuer Frist für Steuererklärung laufen a...   
4  NID_2023-01-19-06_4993  Kreis Pinneberg Elmshorn legen sein Vision der...   

       category  
0  Wissenschaft  
1  Lkr. Harburg  
2       Harburg  
3   Norderstedt  
4     Pinneberg  


Mapping the Aggregated categories to the news data


In [None]:
mapping = df2.set_index('category')['cat-l3'].to_dict()

# Map the 'category' column in df1 to the 'cat-l3' values
df['cat-l3'] = df['category'].map(mapping)

print(df.head())

               new_number                                          news_text  \
0  NID_2023-01-19-02_3182  Forscher sehen enorm Aufholbedarf bei CO2 Entn...   
1  NID_2023-01-19-04_5957  Museum Winsen Wie ein klein Museum groß Idee u...   
2  NID_2023-01-19-04_6983  quartier in Harburg der neu Kümmerer beziehen ...   
3  NID_2023-01-19-04_3777  Grundsteuer Frist für Steuererklärung laufen a...   
4  NID_2023-01-19-06_4993  Kreis Pinneberg Elmshorn legen sein Vision der...   

       category cat-l3  
0  Wissenschaft    NaN  
1  Lkr. Harburg    NaN  
2       Harburg    NaN  
3   Norderstedt    NaN  
4     Pinneberg    NaN  


In [None]:
# Drop rows where 'cat-l3' column has NaN values
df = df.dropna(subset=['cat-l3'])

print(df.head())


                 new_number  \
62   NID_2023-01-19-17_9205   
79   NID_2023-01-19-20_8799   
178  NID_2023-01-20-13_4755   
751  NID_2023-01-20-20_4779   
953  NID_2023-01-20-21_9183   

                                             news_text category   cat-l3  
62   Vorsicht Legionellen Wie Vermieter vorsorgen u...    klima    klima  
79   unser Nachbar der Dealer Mieter Aufstand gegen...  netflix  netflix  
178  Kindheit und Jugend der Phase der Hasen Trotzp...  familie  familie  
751  Kitzbühel Rückkehr der Weißwurstparty in vor C...    sport    sport  
953  vor Duell gegen Bochum Herthas Trainer euphori...    sport    sport  


In [None]:
df

Unnamed: 0,new_number,news_text,category,cat-l3
62,NID_2023-01-19-17_9205,Vorsicht Legionellen Wie Vermieter vorsorgen u...,klima,klima
79,NID_2023-01-19-20_8799,unser Nachbar der Dealer Mieter Aufstand gegen...,netflix,netflix
178,NID_2023-01-20-13_4755,Kindheit und Jugend der Phase der Hasen Trotzp...,familie,familie
751,NID_2023-01-20-20_4779,Kitzbühel Rückkehr der Weißwurstparty in vor C...,sport,sport
953,NID_2023-01-20-21_9183,vor Duell gegen Bochum Herthas Trainer euphori...,sport,sport
...,...,...,...,...
985529,NID_2023-02-03-20_4508,Eurojackpot an der Freitag 03 02 2023 Gewinnza...,panorama,sonstiges
985530,NID_2023-02-03-20_6062,Groundhog Day Murmeltier sagen in USA sechs we...,panorama,sonstiges
985535,NID_2023-02-03-20_6435,Heidi Klum und Tom Kaulitz wollen sie ein geme...,unterhaltung,unterhaltung
985542,NID_2023-02-03-20_14365,Udo Lindenberg feiern erst Nummer ein Hit seit...,unterhaltung,unterhaltung


Load and preprocess the updated file with the aggregated categories

In [None]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/DAT620/new_data_updated.csv')

df.columns = ["new_number", "news_text", "category", "cat-l3"]

# Drop rows with missing values
df = df.dropna(subset=["news_text", "cat-l3"])

X = df["news_text"].values
y = df["cat-l3"].values

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)


In [None]:
# Get the unique number of classes in the cat-l3 column
unique_classes = len(set(y))

print(f"Number of unique classes in cat-l3: {unique_classes}")

Number of unique classes in cat-l3: 105


Get the top 30 Categories

In [None]:
# Filter to top 30 most frequent classes
class_counts = Counter(y_train)
top_30_classes = [cls for cls, count in class_counts.most_common(30)]

train_mask = np.array([label in top_30_classes for label in y_train])
test_mask = np.array([label in top_30_classes for label in y_test])

X_train_filtered = X_train[train_mask]
y_train_filtered = y_train[train_mask]
X_test_filtered = X_test[test_mask]
y_test_filtered = y_test[test_mask]

print("Training samples after filtering:", len(X_train_filtered))
print("Test samples after filtering:", len(X_test_filtered))
print("Unique classes after filtering:", len(np.unique(y_train_filtered)))

Training samples after filtering: 9612
Test samples after filtering: 2410
Unique classes after filtering: 30


Feature engineering using TF-IDF

In [None]:
# Create TF-IDF features on the filtered dataset
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

german_stopwords = stopwords.words('german')

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,2),
    stop_words=german_stopwords
)

X_train_tfidf = tfidf.fit_transform(X_train_filtered)
X_test_tfidf = tfidf.transform(X_test_filtered)

print("X_train_tfidf shape:", X_train_tfidf.shape)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


X_train_tfidf shape: (9612, 10000)


In [None]:
# Encode labels
le = LabelEncoder()
y_train_le = le.fit_transform(y_train_filtered)
y_test_le = le.transform(y_test_filtered)

# Save the label encoder
joblib.dump(le, "/content/drive/MyDrive/DAT620/label_encoder.joblib")

['/content/drive/MyDrive/DAT620/label_encoder.joblib']

Train Classical machine learning models on the data

In [None]:
# Train and Evaluate Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
start_time = time.time()
log_reg.fit(X_train_tfidf, y_train_le)
train_time = time.time() - start_time

y_pred_lr = log_reg.predict(X_test_tfidf)
acc_lr = accuracy_score(y_test_le, y_pred_lr)
print("Model: LogisticRegression")
print(f"Training Time: {train_time:.2f} seconds")
print("Accuracy: {:.4f}".format(acc_lr))
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_lr)))

# Save the model
joblib.dump(log_reg, "/content/drive/MyDrive/DAT620/logistic_regression_model.joblib")

Model: LogisticRegression
Training Time: 12.51 seconds
Accuracy: 0.7660
               precision    recall  f1-score   support

         asyl       0.00      0.00      0.00        11
         auto       0.92      0.68      0.78        71
demonstration       0.12      0.04      0.06        24
      familie       0.00      0.00      0.00        13
    fernsehen       0.94      0.73      0.82        45
   gesundheit       1.00      0.18      0.30        17
       gruene       0.00      0.00      0.00         6
     karneval       0.90      0.66      0.76        68
        klima       0.60      0.20      0.30        15
    konflikte       0.00      0.00      0.00        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.79      0.74      0.76       120
        leben       1.00      0.12      0.22        32
        leute       0.00      0.00      0.00        12
    migration       0.50      0.11      0.18         9
       museum       0.00      0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['/content/drive/MyDrive/DAT620/logistic_regression_model.joblib']

In [None]:
# Train and Evaluate Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
start_time = time.time()
rf.fit(X_train_tfidf, y_train_le)
train_time = time.time() - start_time

y_pred_rf = rf.predict(X_test_tfidf)
acc_rf = accuracy_score(y_test_le, y_pred_rf)
print("Model: RandomForest")
print(f"Training Time: {train_time:.2f} seconds")
print("Accuracy: {:.4f}".format(acc_rf))
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_rf)))

# Save the model
joblib.dump(rf, "/content/drive/MyDrive/DAT620/random_forest_model.joblib")

Model: RandomForest
Training Time: 16.63 seconds
Accuracy: 0.7714
               precision    recall  f1-score   support

         asyl       0.50      0.27      0.35        11
         auto       0.75      0.70      0.72        71
demonstration       0.17      0.08      0.11        24
      familie       0.73      0.62      0.67        13
    fernsehen       0.92      0.80      0.86        45
   gesundheit       0.55      0.35      0.43        17
       gruene       0.57      0.67      0.62         6
     karneval       0.68      0.69      0.69        68
        klima       0.50      0.33      0.40        15
    konflikte       0.00      0.00      0.00        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.79      0.74      0.77       120
        leben       0.88      0.44      0.58        32
        leute       0.00      0.00      0.00        12
    migration       0.25      0.22      0.24         9
       museum       1.00      0.11      0.20         

['/content/drive/MyDrive/DAT620/random_forest_model.joblib']

In [None]:
# Train and Evaluate XGBoost
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)

start_time = time.time()
xgb_clf.fit(X_train_tfidf, y_train_le)
train_time = time.time() - start_time

y_pred_xgb = xgb_clf.predict(X_test_tfidf)
acc_xgb = accuracy_score(y_test_le, y_pred_xgb)
print("Model: XGBoost (GPU Hist)")
print(f"Training Time: {train_time:.2f} seconds")
print("Accuracy: {:.4f}".format(acc_xgb))
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_xgb)))

# Save the model
joblib.dump(xgb_clf, "/content/drive/MyDrive/DAT620/xgboost_model.joblib")

Parameters: { "use_label_encoder" } are not used.



Model: XGBoost (GPU Hist)
Training Time: 41.30 seconds
Accuracy: 0.7635
               precision    recall  f1-score   support

         asyl       0.25      0.09      0.13        11
         auto       0.73      0.65      0.69        71
demonstration       0.24      0.17      0.20        24
      familie       0.67      0.46      0.55        13
    fernsehen       0.87      0.76      0.81        45
   gesundheit       0.53      0.53      0.53        17
       gruene       1.00      0.50      0.67         6
     karneval       0.75      0.76      0.76        68
        klima       0.50      0.47      0.48        15
    konflikte       0.00      0.00      0.00        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.77      0.75      0.76       120
        leben       0.73      0.50      0.59        32
        leute       0.00      0.00      0.00        12
    migration       0.30      0.33      0.32         9
       museum       1.00      0.22      0.36   

['/content/drive/MyDrive/DAT620/xgboost_model.joblib']

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import time

# Train and Evaluate KNN
knn = KNeighborsClassifier(n_neighbors=5)
start_time = time.time()
knn.fit(X_train_tfidf, y_train_le)
train_time = time.time() - start_time

y_pred_knn = knn.predict(X_test_tfidf)
acc_knn = accuracy_score(y_test_le, y_pred_knn)
print("\nModel: K-Nearest Neighbors")
print(f"Training Time: {train_time:.2f} seconds")
print("Accuracy: {:.4f}".format(acc_knn))
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_knn)))

# Save the model
joblib.dump(knn, "/content/drive/MyDrive/DAT620/knn_model.joblib")


Model: K-Nearest Neighbors
Training Time: 0.00 seconds
Accuracy: 0.7203
               precision    recall  f1-score   support

         asyl       0.31      0.45      0.37        11
         auto       0.60      0.72      0.65        71
demonstration       0.24      0.33      0.28        24
      familie       0.50      0.31      0.38        13
    fernsehen       0.63      0.53      0.58        45
   gesundheit       0.30      0.41      0.35        17
       gruene       0.25      0.33      0.29         6
     karneval       0.67      0.69      0.68        68
        klima       0.20      0.20      0.20        15
    konflikte       0.14      0.09      0.11        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.61      0.69      0.65       120
        leben       0.74      0.44      0.55        32
        leute       0.00      0.00      0.00        12
    migration       0.33      0.22      0.27         9
       museum       0.40      0.22      0.29  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['/content/drive/MyDrive/DAT620/knn_model.joblib']

In [None]:
from sklearn.svm import SVC

# Train and Evaluate SVM
svm = SVC(kernel="linear", random_state=42)
start_time = time.time()
svm.fit(X_train_tfidf, y_train_le)
train_time = time.time() - start_time

y_pred_svm = svm.predict(X_test_tfidf)
acc_svm = accuracy_score(y_test_le, y_pred_svm)
print("\nModel: Support Vector Machine (SVM)")
print(f"Training Time: {train_time:.2f} seconds")
print("Accuracy: {:.4f}".format(acc_svm))
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_svm)))

# Save the model
joblib.dump(svm, "/content/drive/MyDrive/DAT620/svm_model.joblib")


Model: Support Vector Machine (SVM)
Training Time: 18.87 seconds
Accuracy: 0.8120
               precision    recall  f1-score   support

         asyl       0.67      0.55      0.60        11
         auto       0.88      0.79      0.83        71
demonstration       0.25      0.12      0.17        24
      familie       0.71      0.38      0.50        13
    fernsehen       0.88      0.84      0.86        45
   gesundheit       0.67      0.35      0.46        17
       gruene       0.00      0.00      0.00         6
     karneval       0.90      0.78      0.83        68
        klima       0.50      0.40      0.44        15
    konflikte       0.00      0.00      0.00        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.82      0.78      0.80       120
        leben       0.88      0.47      0.61        32
        leute       0.00      0.00      0.00        12
    migration       0.33      0.22      0.27         9
       museum       1.00      0.44  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['/content/drive/MyDrive/DAT620/svm_model.joblib']

Load the saved models and get the accuracy on the test set

In [None]:
import joblib
from sklearn.metrics import classification_report, accuracy_score

# Load the models and label encoder
log_reg = joblib.load("/content/drive/MyDrive/DAT620/logistic_regression_model.joblib")
rf = joblib.load("/content/drive/MyDrive/DAT620/random_forest_model.joblib")
xgb_clf = joblib.load("/content/drive/MyDrive/DAT620/xgboost_model.joblib")
knn = joblib.load("/content/drive/MyDrive/DAT620/knn_model.joblib")
svm = joblib.load("/content/drive/MyDrive/DAT620/svm_model.joblib")
le = joblib.load("/content/drive/MyDrive/DAT620/label_encoder.joblib")

# Evaluate Logistic Regression
y_pred_lr = log_reg.predict(X_test_tfidf)
acc_lr = accuracy_score(y_test_le, y_pred_lr)
print("\nLoaded LogisticRegression model accuracy:", acc_lr)
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_lr)))

# Evaluate Random Forest
y_pred_rf = rf.predict(X_test_tfidf)
acc_rf = accuracy_score(y_test_le, y_pred_rf)
print("\nLoaded RandomForest model accuracy:", acc_rf)
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_rf)))

# Evaluate XGBoost
y_pred_xgb = xgb_clf.predict(X_test_tfidf)
acc_xgb = accuracy_score(y_test_le, y_pred_xgb)
print("\nLoaded XGBoost model accuracy:", acc_xgb)
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_xgb)))

# Evaluate KNN
y_pred_knn = knn.predict(X_test_tfidf)
acc_knn = accuracy_score(y_test_le, y_pred_knn)
print("\nLoaded KNN model accuracy:", acc_knn)
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_knn)))

# Evaluate SVM
y_pred_svm = svm.predict(X_test_tfidf)
acc_svm = accuracy_score(y_test_le, y_pred_svm)
print("\nLoaded SVM model accuracy:", acc_svm)
print(classification_report(y_test_filtered, le.inverse_transform(y_pred_svm)))


Loaded LogisticRegression model accuracy: 0.7659751037344399
               precision    recall  f1-score   support

         asyl       0.00      0.00      0.00        11
         auto       0.92      0.68      0.78        71
demonstration       0.12      0.04      0.06        24
      familie       0.00      0.00      0.00        13
    fernsehen       0.94      0.73      0.82        45
   gesundheit       1.00      0.18      0.30        17
       gruene       0.00      0.00      0.00         6
     karneval       0.90      0.66      0.76        68
        klima       0.60      0.20      0.30        15
    konflikte       0.00      0.00      0.00        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.79      0.74      0.76       120
        leben       1.00      0.12      0.22        32
        leute       0.00      0.00      0.00        12
    migration       0.50      0.11      0.18         9
       museum       0.00      0.00      0.00         9
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

         asyl       0.50      0.27      0.35        11
         auto       0.75      0.70      0.72        71
demonstration       0.17      0.08      0.11        24
      familie       0.73      0.62      0.67        13
    fernsehen       0.92      0.80      0.86        45
   gesundheit       0.55      0.35      0.43        17
       gruene       0.57      0.67      0.62         6
     karneval       0.68      0.69      0.69        68
        klima       0.50      0.33      0.40        15
    konflikte       0.00      0.00      0.00        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.79      0.74      0.77       120
        leben       0.88      0.44      0.58        32
        leute       0.00      0.00      0.00        12
    migration       0.25      0.22      0.24         9
       museum       1.00      0.11      0.20         9
      netflix       1.00      0.50      0.67         8
      pol

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Loaded SVM model accuracy: 0.8120331950207469
               precision    recall  f1-score   support

         asyl       0.67      0.55      0.60        11
         auto       0.88      0.79      0.83        71
demonstration       0.25      0.12      0.17        24
      familie       0.71      0.38      0.50        13
    fernsehen       0.88      0.84      0.86        45
   gesundheit       0.67      0.35      0.46        17
       gruene       0.00      0.00      0.00         6
     karneval       0.90      0.78      0.83        68
        klima       0.50      0.40      0.44        15
    konflikte       0.00      0.00      0.00        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.82      0.78      0.80       120
        leben       0.88      0.47      0.61        32
        leute       0.00      0.00      0.00        12
    migration       0.33      0.22      0.27         9
       museum       1.00      0.44      0.62         9
      netflix    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Transformer models

In [None]:
!pip install transformers datasets evaluate --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
from datasets import Dataset
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate

In [None]:
# Prepare Hugging Face Datasets and Encode Labels

# Hugging Face Datasets
df_train = pd.DataFrame({"text": X_train_filtered, "label": y_train_filtered})
df_test = pd.DataFrame({"text": X_test_filtered, "label": y_test_filtered})

train_ds = Dataset.from_pandas(df_train)
test_ds = Dataset.from_pandas(df_test)

# Encode labels using the LabelEncoder
def encode_labels(examples):
    examples["label"] = le.transform(examples["label"])
    return examples

train_ds = train_ds.map(encode_labels, batched=True)
test_ds = test_ds.map(encode_labels, batched=True)

# labels after encoding
print("Sample encoded labels (train_ds):", train_ds["label"][:10])
print("Sample encoded labels (test_ds):", test_ds["label"][:10])

Map:   0%|          | 0/9612 [00:00<?, ? examples/s]

Map:   0%|          | 0/2410 [00:00<?, ? examples/s]

Sample encoded labels (train_ds): [22, 28, 22, 21, 17, 25, 18, 22, 17, 29]
Sample encoded labels (test_ds): [22, 17, 17, 22, 25, 18, 28, 22, 25, 17]


DistilBERT

In [None]:
# Tokenize and Prepare the Dataset for Transformer Models

model_name = "distilbert-base-german-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Tokenize the datasets
train_tokenized = train_ds.map(tokenize_function, batched=True)
test_tokenized = test_ds.map(tokenize_function, batched=True)

print("Train Tokenized Columns:", train_tokenized.column_names)
print("Test Tokenized Columns:", test_tokenized.column_names)

columns_to_remove = ["text"]
train_tokenized = train_tokenized.remove_columns(columns_to_remove)
test_tokenized = test_tokenized.remove_columns(columns_to_remove)

# Rename 'label' to 'labels' to match Hugging Face expectations
train_tokenized = train_tokenized.rename_column("label", "labels")
test_tokenized = test_tokenized.rename_column("label", "labels")

# Set the format to PyTorch tensors
train_tokenized.set_format("torch")
test_tokenized.set_format("torch")

# final columns
print("Final Train Tokenized Columns:", train_tokenized.column_names)
print("Final Test Tokenized Columns:", test_tokenized.column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/479k [00:00<?, ?B/s]

Map:   0%|          | 0/9612 [00:00<?, ? examples/s]

Map:   0%|          | 0/2410 [00:00<?, ? examples/s]

Train Tokenized Columns: ['text', 'label', 'input_ids', 'attention_mask']
Test Tokenized Columns: ['text', 'label', 'input_ids', 'attention_mask']
Final Train Tokenized Columns: ['labels', 'input_ids', 'attention_mask']
Final Test Tokenized Columns: ['labels', 'input_ids', 'attention_mask']


In [None]:
# Load the Pre-trained Transformer Model and Setup Trainer

num_labels = len(le.classes_)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    return metric.compute(predictions=preds, references=labels)

# Setup training arguments with matching evaluation and save strategies
training_args = TrainingArguments(
    output_dir="./german_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Trainer is using device:", trainer.args.device)

model.safetensors:   0%|          | 0.00/270M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  trainer = Trainer(


Trainer is using device: cuda:0


In [None]:
# Train the Model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.657,0.667951,0.819917
2,0.427,0.574166,0.843568
3,0.2833,0.558182,0.855187


TrainOutput(global_step=1803, training_loss=0.6221114103158051, metrics={'train_runtime': 209.2689, 'train_samples_per_second': 137.794, 'train_steps_per_second': 8.616, 'total_flos': 1910868651233280.0, 'train_loss': 0.6221114103158051, 'epoch': 3.0})

In [None]:
results = trainer.evaluate(test_tokenized)
print("Evaluation results:", results)

Evaluation results: {'eval_loss': 1.0080560445785522, 'eval_accuracy': 0.6208271815619427, 'eval_runtime': 95.1718, 'eval_samples_per_second': 547.988, 'eval_steps_per_second': 34.254, 'epoch': 3.0}


In [None]:
# Save the Fine-Tuned Model
trainer.save_model("/content/drive/MyDrive/DAT620/german_finetuned_model")

XLM-RoBERTa

In [None]:
# Tokenize the Dataset

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# Tokenize the datasets
train_tokenized = train_ds.map(tokenize_function, batched=True)
test_tokenized = test_ds.map(tokenize_function, batched=True)

# Remove the original text column
train_tokenized = train_tokenized.remove_columns(["text"])
test_tokenized = test_tokenized.remove_columns(["text"])

# Rename 'label' to 'labels' as expected by the Trainer
train_tokenized = train_tokenized.rename_column("label", "labels")
test_tokenized = test_tokenized.rename_column("label", "labels")

# Set the format to PyTorch tensors
train_tokenized.set_format("torch")
test_tokenized.set_format("torch")

# final columns and data types
print("Final Train Tokenized Columns:", train_tokenized.column_names)
print("Sample labels (train_tokenized):", train_tokenized["labels"][:10])
print("Sample labels (test_tokenized):", test_tokenized["labels"][:10])

In [None]:
# Setup Training Arguments and Initialize Trainer

num_labels = len(le.classes_)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Evaluation metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    return metric.compute(predictions=preds, references=labels)

# Setup training arguments with matching evaluation and save strategies
training_args = TrainingArguments(
    output_dir="./german_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Trainer is using device:", trainer.args.device)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9808,1.090915,0.711203
2,0.7208,0.80898,0.790041
3,0.5779,0.67817,0.823237
4,0.4699,0.649726,0.840664
5,0.3742,0.643791,0.842324


TrainOutput(global_step=3005, training_loss=0.7734313582025233, metrics={'train_runtime': 618.6535, 'train_samples_per_second': 77.685, 'train_steps_per_second': 4.857, 'total_flos': 6324148155985920.0, 'train_loss': 0.7734313582025233, 'epoch': 5.0})

In [None]:
results = trainer.evaluate(test_tokenized)
print("Evaluation results:", results)

Evaluation results: {'eval_loss': 0.643790602684021, 'eval_accuracy': 0.8423236514522822, 'eval_runtime': 8.0157, 'eval_samples_per_second': 300.659, 'eval_steps_per_second': 18.838, 'epoch': 5.0}


In [None]:
trainer.save_model("/content/drive/MyDrive/DAT620/german_finetuned_model_roberta")

Finetuning the DistilBERT

In [None]:
# Load the Pre-trained Transformer Model and Setup Trainer

num_labels = len(le.classes_)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Evaluation metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    return metric.compute(predictions=preds, references=labels)

# Setup training arguments with matching evaluation and save strategies
training_args = TrainingArguments(
    output_dir="./german_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",                # Save model at the end of each epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Trainer is using device:", trainer.args.device)

model.safetensors:   0%|          | 0.00/270M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  trainer = Trainer(


Trainer is using device: cuda:0


In [None]:
# Cell 7: Train the Model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6584,0.661134,0.822407
2,0.4463,0.576744,0.839834
3,0.3232,0.605236,0.851452
4,0.2132,0.618115,0.853527
5,0.1763,0.655198,0.851452
6,0.109,0.668617,0.860581
7,0.127,0.713079,0.857676
8,0.0655,0.70985,0.861411


TrainOutput(global_step=4808, training_loss=0.3170946484273563, metrics={'train_runtime': 479.9423, 'train_samples_per_second': 160.219, 'train_steps_per_second': 10.018, 'total_flos': 5095649736622080.0, 'train_loss': 0.3170946484273563, 'epoch': 8.0})

In [None]:
results = trainer.evaluate(test_tokenized)
print("Evaluation results:", results)

Evaluation results: {'eval_loss': 0.7098497152328491, 'eval_accuracy': 0.8614107883817428, 'eval_runtime': 4.3348, 'eval_samples_per_second': 555.964, 'eval_steps_per_second': 34.834, 'epoch': 8.0}


In [None]:
# Cell 9: Save the Fine-Tuned Model
trainer.save_model("/content/drive/MyDrive/DAT620/german_finetuned_model_updated")

Evaluate Classical machine learning models

In [None]:
import joblib
from sklearn.metrics import classification_report, accuracy_score

# Load the models and label encoder
log_reg = joblib.load("/content/drive/MyDrive/DAT620/logistic_regression_model.joblib")
rf = joblib.load("/content/drive/MyDrive/DAT620/random_forest_model.joblib")
xgb_clf = joblib.load("/content/drive/MyDrive/DAT620/xgboost_model.joblib")
knn = joblib.load("/content/drive/MyDrive/DAT620/knn_model.joblib")
svm = joblib.load("/content/drive/MyDrive/DAT620/svm_model.joblib")
le = joblib.load("/content/drive/MyDrive/DAT620/label_encoder.joblib")

# Evaluate Logistic Regression
y_pred_lr = log_reg.predict(X_test_tfidf)
acc_lr = accuracy_score(y_test_le, y_pred_lr)
print("\nLoaded LogisticRegression model accuracy:", acc_lr)

# Evaluate Random Forest
y_pred_rf = rf.predict(X_test_tfidf)
acc_rf = accuracy_score(y_test_le, y_pred_rf)
print("\nLoaded RandomForest model accuracy:", acc_rf)

# Evaluate XGBoost
y_pred_xgb = xgb_clf.predict(X_test_tfidf)
acc_xgb = accuracy_score(y_test_le, y_pred_xgb)
print("\nLoaded XGBoost model accuracy:", acc_xgb)

# Evaluate KNN
y_pred_knn = knn.predict(X_test_tfidf)
acc_knn = accuracy_score(y_test_le, y_pred_knn)
print("\nLoaded KNN model accuracy:", acc_knn)

# Evaluate SVM
y_pred_svm = svm.predict(X_test_tfidf)
acc_svm = accuracy_score(y_test_le, y_pred_svm)
print("\nLoaded SVM model accuracy:", acc_svm)


Loaded LogisticRegression model accuracy: 0.7659751037344399

Loaded RandomForest model accuracy: 0.7713692946058092

Loaded XGBoost model accuracy: 0.7634854771784232

Loaded KNN model accuracy: 0.7203319502074689

Loaded SVM model accuracy: 0.8120331950207469


Evaluate the Transformer model (DistilBERT)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Load the saved model and tokenizer
model_path = "/content/drive/MyDrive/DAT620/german_finetuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Ensure the model is in evaluation mode
model.eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare the test dataset
test_dataloader = torch.utils.data.DataLoader(
    test_tokenized, batch_size=16, shuffle=False
)

all_predictions = []
all_labels = []

# Iterate through the test dataloader for prediction
with torch.no_grad():
    for batch in test_dataloader:
        inputs = {key: value.to(device) for key, value in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)

        # Get predictions from the model
        outputs = model(**inputs)
        logits = outputs.logits

        # Get predicted class indices
        predictions = torch.argmax(logits, dim=1)

        # Store predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert numeric predictions back to labels
predicted_labels = le.inverse_transform(all_predictions)
true_labels = le.inverse_transform(all_labels)

# Evaluate accuracy
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Test Accuracy: {accuracy:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))


Test Accuracy: 0.8552

Classification Report:
               precision    recall  f1-score   support

         asyl       0.48      0.91      0.62        11
         auto       0.98      0.90      0.94        71
demonstration       0.67      0.25      0.36        24
      familie       0.75      0.46      0.57        13
    fernsehen       0.73      0.89      0.80        45
   gesundheit       0.75      0.71      0.73        17
       gruene       0.50      0.17      0.25         6
     karneval       0.91      0.90      0.90        68
        klima       0.40      0.67      0.50        15
    konflikte       0.00      0.00      0.00        11
        krieg       0.00      0.00      0.00         7
kriminalitaet       0.89      0.78      0.83       120
        leben       0.65      0.75      0.70        32
        leute       0.00      0.00      0.00        12
    migration       1.00      0.44      0.62         9
       museum       0.75      0.67      0.71         9
      netflix     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Predict category using the saved model (XLM-RoBERTa)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import joblib

le = joblib.load("/content/drive/MyDrive/DAT620/label_encoder.joblib")

# Load the fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/DAT620/german_finetuned_model_roberta"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Ensure the model is in evaluation mode
model.eval()

# Example text for prediction
texts = [
    "Reise mit der Bahn Handballer setzen bei Heim EM auf Nachhaltigkeit der deutsch Bahn werden offiziell Partner der Handball Europameisterschaft 2024 in Deutschland und sollen der Nationalmannschaft Fan und offiziell klimafreundlich zu der Austragungsort befÃ¶rdern.",
    "Krieg in der Ukraine entscheidend Offensive Putins perfider Plan Putin setzen darauf dass RuÃŸland der Ukraine mit noch mehr Soldat bezwingen kÃ¶nnen geht sein Kriegsplan auf oder kommen der Leopard Spitze|Panzer gerade noch rechtzeitig."
]

# Tokenize the input text
inputs = tokenizer(texts, truncation=True, padding="max_length", max_length=256, return_tensors="pt")

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to probabilities and class predictions
probabilities = torch.nn.functional.softmax(logits, dim=1)
predictions = torch.argmax(probabilities, dim=1)

# Map predictions back to labels using the LabelEncoder
predicted_labels = le.inverse_transform(predictions.numpy())

# Print results
for text, label in zip(texts, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted Label: {label}")

Text: Reise mit der Bahn Handballer setzen bei Heim EM auf Nachhaltigkeit der deutsch Bahn werden offiziell Partner der Handball Europameisterschaft 2024 in Deutschland und sollen der Nationalmannschaft Fan und offiziell klimafreundlich zu der Austragungsort befÃ¶rdern.
Predicted Label: sport
Text: Krieg in der Ukraine entscheidend Offensive Putins perfider Plan Putin setzen darauf dass RuÃŸland der Ukraine mit noch mehr Soldat bezwingen kÃ¶nnen geht sein Kriegsplan auf oder kommen der Leopard Spitze|Panzer gerade noch rechtzeitig.
Predicted Label: politik


Predict category using the saved model (DistilBERT)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import joblib

le = joblib.load("/content/drive/MyDrive/DAT620/label_encoder.joblib")

# Load the fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/DAT620/german_finetuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Ensure the model is in evaluation mode
model.eval()

# Example text for prediction
texts = [
    "Reise mit der Bahn Handballer setzen bei Heim EM auf Nachhaltigkeit der deutsch Bahn werden offiziell Partner der Handball Europameisterschaft 2024 in Deutschland und sollen der Nationalmannschaft Fan und offiziell klimafreundlich zu der Austragungsort befÃ¶rdern.",
    "Krieg in der Ukraine entscheidend Offensive Putins perfider Plan Putin setzen darauf dass RuÃŸland der Ukraine mit noch mehr Soldat bezwingen kÃ¶nnen geht sein Kriegsplan auf oder kommen der Leopard Spitze|Panzer gerade noch rechtzeitig."
]

# Tokenize the input text
inputs = tokenizer(texts, truncation=True, padding="max_length", max_length=256, return_tensors="pt")

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to probabilities and class predictions
probabilities = torch.nn.functional.softmax(logits, dim=1)
predictions = torch.argmax(probabilities, dim=1)

# Map predictions back to labels using the LabelEncoder
predicted_labels = le.inverse_transform(predictions.numpy())

# Print results
for text, label in zip(texts, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted Label: {label}")

Text: Reise mit der Bahn Handballer setzen bei Heim EM auf Nachhaltigkeit der deutsch Bahn werden offiziell Partner der Handball Europameisterschaft 2024 in Deutschland und sollen der Nationalmannschaft Fan und offiziell klimafreundlich zu der Austragungsort befÃ¶rdern.
Predicted Label: sport
Text: Krieg in der Ukraine entscheidend Offensive Putins perfider Plan Putin setzen darauf dass RuÃŸland der Ukraine mit noch mehr Soldat bezwingen kÃ¶nnen geht sein Kriegsplan auf oder kommen der Leopard Spitze|Panzer gerade noch rechtzeitig.
Predicted Label: politik
