In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [50]:


# Load the main dataset
main_df = pd.read_csv("dataset.csv", index_col=0)

# Load and process dataset_classification_with_hermes.csv
df_hermes = pd.read_csv("dataset_classification_with_hermes.csv")
df_hermes.rename(columns={"llama": "Hermes-3-Llama-3.1-70B-Q5_K_S"}, inplace=True)

# Load and process dataset_classification_with_llama.csv
df_llama = pd.read_csv("dataset_classification_with_llama.csv")
df_llama.rename(columns={"llama": "llama3.3:70B-Instruct-Q2_K"}, inplace=True)

# Load and process dataset_classification_with_llama_Q6.csv
df_llama_q6 = pd.read_csv("dataset_classification_with_llama_Q6.csv")
df_llama_q6.rename(columns={
    "llama_aman": "llama3.3:70b-instruct-q6_K-SEEN_DATA",
    "llama_Q6": "llama3.3:70b-instruct-q6_K"
}, inplace=True)

# Keep only the renamed columns in df_llama_q6
df_llama_q6 = df_llama_q6[["llama3.3:70b-instruct-q6_K-SEEN_DATA", "llama3.3:70b-instruct-q6_K"]]

# Load and process dataset_classification_with_mistral.csv
df_mistral = pd.read_csv("dataset_classification_with_mistral.csv")
df_mistral.rename(columns={"mistral_cleaned": "mistral:7b-instruct-v0.2-q8_0"}, inplace=True)

# Merge datasets with the main dataset in the specified order
main_df = main_df.join(df_hermes["Hermes-3-Llama-3.1-70B-Q5_K_S"], how="left")
main_df = main_df.join(df_llama["llama3.3:70B-Instruct-Q2_K"], how="left")
main_df = main_df.join(df_llama_q6, how="left")
main_df = main_df.join(df_mistral["mistral:7b-instruct-v0.2-q8_0"], how="left")

In [51]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20148 entries, 0 to 20147
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   post_id                               20148 non-null  object
 1   tweet_text                            20148 non-null  object
 2   key_features                          20148 non-null  object
 3   target                                16230 non-null  object
 4   label                                 20148 non-null  object
 5   annotator_1_label                     20148 non-null  object
 6   annotator_1_target                    13021 non-null  object
 7   annotator_2_label                     20148 non-null  object
 8   annotator_2_target                    13151 non-null  object
 9   annotator_3_label                     20148 non-null  object
 10  annotator_3_target                    13080 non-null  object
 11  Hermes-3-Llama-3.1-70B-Q5_K_S    

In [52]:
columns_to_drop = [
    "annotator_1_label", "annotator_1_target",
    "annotator_2_label", "annotator_2_target",
    "annotator_3_label", "annotator_3_target"
]

# Drop the specified columns
main_df = main_df.drop(columns=columns_to_drop)


In [53]:
main_df.head()

Unnamed: 0,post_id,tweet_text,key_features,target,label,Hermes-3-Llama-3.1-70B-Q5_K_S,llama3.3:70B-Instruct-Q2_K,llama3.3:70b-instruct-q6_K-SEEN_DATA,llama3.3:70b-instruct-q6_K,mistral:7b-instruct-v0.2-q8_0
0,1179055004553900032_twitter,i dont think im getting my baby them white 9 h...,[],,normal,offensive,Normal,Normal,normal,normal
1,1179063826874032128_twitter,we cannot continue calling ourselves feminists...,[],,normal,normal,normal,normal,normal,offensive
2,1178793830532956161_twitter,nawt yall niggers ignoring me,[],"African, None",normal,hate speech,hate speech,hate speech,hate speech,offensive
3,1179088797964763136_twitter,<user> i am bit confused coz chinese ppl can n...,"['ching', 'chong']",Asian,hatespeech,offensive,hate speech,Hate speech,hate speech,offensive
4,1179085312976445440_twitter,this bitch in whataburger eating a burger with...,"['bitch', 'i', 'hate', 'white', 'bitches']","Caucasian, Women",hatespeech,hate speech,hate speech,Hate Speech,hate speech,offensive


In [54]:
# Normalize labels for all models and the label column
normalization_map = {
    "offensive": "offensive",
    "Offensive": "offensive",
    "hate speech": "hatespeech",
    "Hatespeech": "hatespeech",
    "hatespeech": "hatespeech",
    "Hate Speech": "hatespeech",
    "Hate speech": "hatespeech",
    "normal": "normal",
    "Normal": "normal",
    "Normal.": "normal",
    "Hate speech.": "hatespeech",
    "hate speech.": "hatespeech",
    "hatedspeech": "hatespeech",
    "hatemspeech": "hatespeech",
    "haterspeech": "hatespeech",
}

columns_to_normalize = [
    "label",
    "Hermes-3-Llama-3.1-70B-Q5_K_S",
    "llama3.3:70B-Instruct-Q2_K",
    "llama3.3:70b-instruct-q6_K-SEEN_DATA",
    "llama3.3:70b-instruct-q6_K",
    "mistral:7b-instruct-v0.2-q8_0"
]

for column in columns_to_normalize:
    main_df[column] = main_df[column].str.strip('"').replace(normalization_map)
    main_df[column] = main_df[column].replace(normalization_map)


In [55]:
# check if data is clean and normalized
# for column in columns_to_normalize:
#     print(f"Value counts for {column}:")
#     print(main_df[column].value_counts().head(6))
#     print()

In [56]:
# Calculate and print accuracy for each model
label_column = "label"
models = [
    "Hermes-3-Llama-3.1-70B-Q5_K_S",
    "llama3.3:70B-Instruct-Q2_K",
    "llama3.3:70b-instruct-q6_K-SEEN_DATA",
    "llama3.3:70b-instruct-q6_K",
    "mistral:7b-instruct-v0.2-q8_0"
]

for model in models:
    accuracy = (main_df[label_column] == main_df[model]).mean() * 100
    print(f"Accuracy for {model}: {accuracy:.2f}%")

Accuracy for Hermes-3-Llama-3.1-70B-Q5_K_S: 50.47%
Accuracy for llama3.3:70B-Instruct-Q2_K: 43.48%
Accuracy for llama3.3:70b-instruct-q6_K-SEEN_DATA: 43.15%
Accuracy for llama3.3:70b-instruct-q6_K: 49.32%
Accuracy for mistral:7b-instruct-v0.2-q8_0: 44.52%


In [57]:
import pickle

# Load using pickle
with open("C:\\MachineLearning\\UniTrier\\RCS\\dataset_mit_embeddings_sfr.pkl", "rb") as f:
    df = pickle.load(f)

In [58]:
main_df = main_df.join(df["X_train"], how="left")

In [59]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20148 entries, 0 to 20147
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   post_id                               20148 non-null  object
 1   tweet_text                            20148 non-null  object
 2   key_features                          20148 non-null  object
 3   target                                16230 non-null  object
 4   label                                 20148 non-null  object
 5   Hermes-3-Llama-3.1-70B-Q5_K_S         20148 non-null  object
 6   llama3.3:70B-Instruct-Q2_K            20148 non-null  object
 7   llama3.3:70b-instruct-q6_K-SEEN_DATA  20148 non-null  object
 8   llama3.3:70b-instruct-q6_K            20148 non-null  object
 9   mistral:7b-instruct-v0.2-q8_0         20148 non-null  object
 10  X_train                               20148 non-null  object
dtypes: object(11)
memory usage: 1.8+ 

In [82]:
df = main_df

In [85]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Convert 'X_train' column to a list of embeddings
X = df['X_train'].tolist()
y = df['label_encoded'].tolist()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Use 'multi:softmax' for multi-class classification
    num_class=len(label_encoder.classes_),  # Number of classes
    eval_metric='mlogloss',  # Multi-class log loss
    use_label_encoder=False  # Avoid warnings about label encoding
)


# Train the XGBoost model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.6531

Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.73      0.73      0.73      1285
      normal       0.66      0.77      0.71      1610
   offensive       0.52      0.40      0.45      1135

    accuracy                           0.65      4030
   macro avg       0.64      0.63      0.63      4030
weighted avg       0.64      0.65      0.64      4030



In [89]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Assuming `df` is your main DataFrame

# Step 1: Train-test split (whole dataset)
TRAIN_DF, TEST_DF = train_test_split(df, test_size=0.2, random_state=42)

# Step 2: Prepare the labels (encoded labels for classification)
label_encoder = LabelEncoder()
TRAIN_DF['label_encoded'] = label_encoder.fit_transform(TRAIN_DF['label'])
TEST_DF['label_encoded'] = label_encoder.transform(TEST_DF['label'])

# Step 3: Convert 'X_train' column to a list of embeddings
X_train = TRAIN_DF['X_train'].tolist()
X_test = TEST_DF['X_train'].tolist()
y_train = TRAIN_DF['label_encoded'].tolist()
y_test = TEST_DF['label_encoded'].tolist()

# Step 4: Initialize and train XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Use 'multi:softmax' for multi-class classification
    num_class=len(label_encoder.classes_),  # Number of classes
    eval_metric='mlogloss',  # Multi-class log loss
    use_label_encoder=False  # Avoid warnings about label encoding
)

# Train the XGBoost model
xgb_model.fit(X_train, y_train)

# Step 5: Make predictions with XGBoost
y_pred_xgb = xgb_model.predict(X_test)

# Step 6: Evaluate the XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")

# Detailed classification report for XGBoost
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))

# Step 7: Evaluate other classifiers' predictions against 'label'
other_classifier_columns = [
    "Hermes-3-Llama-3.1-70B-Q5_K_S",
    "llama3.3:70B-Instruct-Q2_K",
    "llama3.3:70b-instruct-q6_K-SEEN_DATA",
    "llama3.3:70b-instruct-q6_K",
    "mistral:7b-instruct-v0.2-q8_0"
]

# Convert string predictions from other classifiers to label-encoded integers
for column in other_classifier_columns:
    # Label encode the predicted string values
    y_pred_other = label_encoder.transform(TEST_DF[column].astype(str))

    # Accuracy for this classifier's predictions
    accuracy_other = accuracy_score(y_test, y_pred_other)
    print(f"\nAccuracy for {column}: {accuracy_other:.4f}")

    # Detailed classification report for each other classifier
    print(f"\n{column} Classification Report:")
    print(classification_report(y_test, y_pred_other, target_names=label_encoder.classes_))


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.6531

XGBoost Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.73      0.73      0.73      1285
      normal       0.66      0.77      0.71      1610
   offensive       0.52      0.40      0.45      1135

    accuracy                           0.65      4030
   macro avg       0.64      0.63      0.63      4030
weighted avg       0.64      0.65      0.64      4030



ValueError: y contains previously unseen labels: 'confused'

In [91]:
# Convert string predictions from other classifiers to label-encoded integers
for column in other_classifier_columns:
    # Initialize a list to hold the valid label-encoded predictions
    y_pred_other = []

    for pred in TEST_DF[column].astype(str):
        try:
            # Try to transform the prediction into an encoded label
            y_pred_other.append(label_encoder.transform([pred])[0])
        except ValueError:
            # Handle unseen label: treat it as a default value (e.g., -1)
            y_pred_other.append(-1)

    y_pred_other = np.array(y_pred_other)

    # Remove the '-1' values (unseen labels) from both y_pred_other and y_test
    valid_indices = (y_pred_other != -1) & (y_test != -1)
    y_pred_other_valid = y_pred_other[valid_indices]
    y_test_valid = np.array(y_test)[valid_indices]

    # Accuracy for this classifier's predictions
    accuracy_other = accuracy_score(y_test_valid, y_pred_other_valid)
    print(f"\nAccuracy for {column}: {accuracy_other:.4f}")

    # Detailed classification report for each other classifier
    print(f"\n{column} Classification Report:")
    print(classification_report(y_test_valid, y_pred_other_valid, target_names=label_encoder.classes_))


Accuracy for Hermes-3-Llama-3.1-70B-Q5_K_S: 0.5062

Hermes-3-Llama-3.1-70B-Q5_K_S Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.48      0.89      0.62      1285
      normal       0.76      0.38      0.50      1608
   offensive       0.34      0.26      0.30      1135

    accuracy                           0.51      4028
   macro avg       0.53      0.51      0.47      4028
weighted avg       0.55      0.51      0.48      4028


Accuracy for llama3.3:70B-Instruct-Q2_K: 0.4461

llama3.3:70B-Instruct-Q2_K Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.42      0.95      0.58      1284
      normal       0.88      0.23      0.36      1609
   offensive       0.31      0.19      0.23      1135

    accuracy                           0.45      4028
   macro avg       0.53      0.45      0.39      4028
weighted avg       0.57      0.45      0.39      4028


Accuracy for llama3.3:70b-instr

In [74]:
main_df['label_encoded'] = label_encoder.fit_transform(main_df['label'])

In [75]:
X = main_df.drop(columns=['post_id', 'tweet_text', 'key_features', 'target', 'label'])
y = main_df['label_encoded']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16118 entries, 2394 to 15795
Data columns (total 7 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Hermes-3-Llama-3.1-70B-Q5_K_S         16118 non-null  object
 1   llama3.3:70B-Instruct-Q2_K            16118 non-null  object
 2   llama3.3:70b-instruct-q6_K-SEEN_DATA  16118 non-null  object
 3   llama3.3:70b-instruct-q6_K            16118 non-null  object
 4   mistral:7b-instruct-v0.2-q8_0         16118 non-null  object
 5   X_train                               16118 non-null  object
 6   label_encoded                         16118 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1007.4+ KB


In [73]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

def train_model(data: pd.DataFrame, labels: pd.Series):
    if torch.cuda.is_available():
        boost_device = "cuda"
    else:
        boost_device = "cpu"

    # Initialize the XGBoost Classifier
    xgb_clf = xgb.XGBClassifier(objective="binary:logistic",
                                device=boost_device,
                                random_state=3137)

    # Define hyperparameters and values to tune
    param_grid = {
        'max_depth': [5, 6, 7, 8],
        'eta': np.arange(0.05, 0.3, 0.05)
    }

    print(f"Number of rows in training data: {len(data)}")

    # Perform hyperparameter tuning using GridSearchCV
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring="roc_auc",
                               cv=5, verbose=3)
    grid_search.fit(data, labels)

    # Get the best hyperparameters
    best_max_depth = grid_search.best_params_['max_depth']
    best_eta = grid_search.best_params_['eta']

    final_xgb_clf = xgb.XGBClassifier(objective="binary:logistic",
                                      max_depth=best_max_depth,
                                      eta=best_eta,
                                      device=boost_device,
                                      random_state=3137)
    final_xgb_clf.fit(data, labels)

    return final_xgb_clf

# Ensure the target is numeric
y = main_df['label_encoded']

# If 'target' is categorical (strings), convert it to numeric labels
if y.dtypes == 'object':  
    le = LabelEncoder()
    y = le.fit_transform(y)

# Use the embeddings (ensure they are numeric)
X = main_df[['X_train']]  # Use the embeddings as the features

# Train the model
final_model = train_model(X, y)

# Make predictions on test data
y_preds = final_model.predict(X)

# Accuracy of the model
accuracy = accuracy_score(y, y_preds)
print(f"Model Accuracy: {accuracy:.4f}")


Number of rows in training data: 20148
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ...............eta=0.05, max_depth=5;, score=nan total time=   0.0s
[CV 2/5] END ...............eta=0.05, max_depth=5;, score=nan total time=   0.0s
[CV 3/5] END ...............eta=0.05, max_depth=5;, score=nan total time=   0.0s
[CV 4/5] END ...............eta=0.05, max_depth=5;, score=nan total time=   0.0s
[CV 5/5] END ...............eta=0.05, max_depth=5;, score=nan total time=   0.0s
[CV 1/5] END ...............eta=0.05, max_depth=6;, score=nan total time=   0.0s
[CV 2/5] END ...............eta=0.05, max_depth=6;, score=nan total time=   0.0s
[CV 3/5] END ...............eta=0.05, max_depth=6;, score=nan total time=   0.0s
[CV 4/5] END ...............eta=0.05, max_depth=6;, score=nan total time=   0.0s
[CV 5/5] END ...............eta=0.05, max_depth=6;, score=nan total time=   0.0s
[CV 1/5] END ...............eta=0.05, max_depth=7;, score=nan total time=   0.0s
[CV 2/5]



ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1233 1234 1235], got [   0    1    2 ... 1382 1383 1384]

--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1232 1233 1234], got [   0    1    2 ... 1382 1383 1384]

--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1225 1226 1227], got [   0    1    2 ... 1382 1383 1384]

--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1210 1211 1212], got [   0    1    2 ... 1382 1383 1384]

--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\MachineLearning\AICU\recommendation_system\.venv\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1228 1229 1230], got [   0    1    2 ... 1382 1383 1384]


In [None]:
from sklearn.preprocessing import LabelEncoder