# Models

## 1. Loading Dependencies

In [1]:
from pprint import pprint
import pandas as pd
import numpy as np
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics \
import classification_report, recall_score, accuracy_score,precision_score, make_scorer,confusion_matrix
from sklearn.model_selection import GridSearchCV

import pickle

def myConfMatrix(confusion_array,labels,columns):
    df = pd.DataFrame(confusion_array, index=labels, columns=columns)
    return df

def balanceDF(df):
    violent = df.loc[df["label"] == "violent"]
    nonviolent = df.loc[df["label"] == "nonviolent"]
    violent_patched = violent.sample(nonviolent.shape[0],random_state=0)
    #balanced
    bdf = pd.concat([violent_patched,nonviolent])
    return bdf

### embeddings dependencies 

from gensim.models.fasttext import load_facebook_vectors
ft = load_facebook_vectors("cc.es.300.bin")

_token_pat = re.compile(r"[A-Za-z√Å√â√ç√ì√ö√ú√ë√°√©√≠√≥√∫√º√±']+")

def _sentence_vec_fasttext(s, ft_model):
    if not isinstance(s, str):
        s = "" if s is None else str(s)
    toks = _token_pat.findall(s.lower())
    vecs = [ft_model[w] for w in toks if w in ft_model.key_to_index]
    return np.mean(vecs, axis=0).astype(np.float32) if vecs else np.zeros(ft_model.vector_size, dtype=np.float32)

class FastTextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, ft_model, vector_size=None):
        self.ft_model = ft_model
        self.vector_size = vector_size  # optional override

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        texts = np.asarray(X).ravel()
        vs = self.vector_size or self.ft_model.vector_size
        # ensure consistent dtype/shape
        out = np.zeros((len(texts), vs), dtype=np.float32)
        for i, s in enumerate(texts):
            out[i] = _sentence_vec_fasttext(s, self.ft_model)
        return out

###############


VIOLENT_LABEL = "violent"
NONVIOLENT_LABEL = "nonviolent"

cm_labels = ['violent', 'nonviolent']
cm_columns = ['Predicted violent', 'predicted nonviolent']



categorical_features = ["feature"]

# These were discarded as non informative. 
# numberic featuers will be ignored as they are non-informative, as can be seen on EDA.
numeric_features = ["length", "punct"]

### 1.1 Preprocessing

* Map to Violent and Non-Violent tags. 
* Create total dataset and balanced dataset. 

In [2]:
df = pd.read_csv("../master_data/data.csv")
# mapping label to 1 -> violent 0 -> non-violent
df["label"] = df["label"].apply(lambda x: VIOLENT_LABEL if x == 1 else NONVIOLENT_LABEL)
df['feature'] = df['feature'].str.replace('\xa0', ' ', regex=False)

bdf = balanceDF(df)

print("full db data:")
print(df.label.value_counts())
print("balanced db data:")
print(df.label.value_counts())

full db data:
label
violent       8689
nonviolent    3028
Name: count, dtype: int64
balanced db data:
label
violent       8689
nonviolent    3028
Name: count, dtype: int64


In [None]:
df.head()

### Split dataset into train and test. 

Note that we rebalance the test dataset.

In [3]:
from sklearn.utils import resample

data = df
X_train, X_test_full, y_train, y_test_full =\
train_test_split(data.loc[:,["feature"]],data["label"], test_size= 0.3, random_state= 1)
print(f"the shape of the entire dataset is {df.shape}")



# --- join X and y for resampling ---
test_df = X_test_full.copy()
test_df["label"] = y_test_full.values

# --- separate classes ---
violent_df     = test_df[test_df["label"] == "violent"]
nonviolent_df  = test_df[test_df["label"] == "nonviolent"]

# target size = minority count (here nonviolent is smaller: 913)
target_n = 913

# --- downsample majority to match minority ---
violent_down = resample(violent_df,     replace=False, n_samples=target_n, random_state=1)
nonviolent_k = resample(nonviolent_df,  replace=False, n_samples=target_n, random_state=1)

balanced_test = pd.concat([violent_down, nonviolent_k]).sample(frac=1, random_state=1)  # shuffle

# --- split back to X, y ---
X_test = balanced_test["feature"].to_frame()
y_test = balanced_test["label"]

print("the entire test dataset:")
print(y_test_full.value_counts())  # original imbalanced test
print("the resampled balanced test dataset:")
print(y_test.value_counts())   # now balanced 1:1


#### why do we do this ? 

### IMPORTANT EXPLANATION

# our test dataset as is is imbalanced 70-30, (violent-non violent respectively)
# there is nothing in research to suggest that the natural priors resemble these prior distributions.
# Hence validating models on an imbalanced violent dataset would artificially boost precision for identifying violent texts.
# It is safer to validate detectors on a balanced dataset. 

the shape of the entire dataset is (11717, 4)
the entire test dataset:
label
violent       2603
nonviolent     913
Name: count, dtype: int64
the resampled balanced test dataset:
label
violent       913
nonviolent    913
Name: count, dtype: int64


### Preprocess embeddings as this is costly in gridsearchCV.

In [7]:


train_texts = X_train["feature"].astype(str).tolist()
test_texts  = X_test["feature"].astype(str).tolist()
test_texts_full  = X_test_full["feature"].astype(str).tolist()

# model_name = "paraphrase-multilingual-MiniLM-L12-v2"
# st = SentenceTransformer(model_name)

# X_train_minilm = st.encode(train_texts, batch_size=64, show_progress_bar=True)
# X_test_minilm  = st.encode(test_texts,  batch_size=64, show_progress_bar=True)
# X_test_minilm_full  = st.encode(test_texts_full,  batch_size=64, show_progress_bar=True)

# X_train_minilm = np.asarray(X_train_minilm, dtype=np.float32)
# X_test_minilm  = np.asarray(X_test_minilm,  dtype=np.float32)
# X_test_minilm_full = np.asarray(X_test_minilm_full,  dtype=np.float32)

# print(X_train_minilm.shape, X_test_minilm.shape)

# np.save("X_train_minilm.npy", X_train_minilm)
# np.save("X_test_minilm.npy", X_test_minilm)
# np.save("X_test_minilm_full.npy", X_test_minilm_full)

# model_name = "sentence-transformers/LaBSE"
# st = SentenceTransformer(model_name)

# X_train_labse = st.encode(train_texts, batch_size=32, show_progress_bar=True)
# X_test_labse  = st.encode(test_texts,  batch_size=32, show_progress_bar=True)
# X_test_labse_full = st.encode(test_texts_full, batch_size=32, show_progress_bar=True)

# X_train_labse = np.asarray(X_train_labse, dtype=np.float32)
# X_test_labse  = np.asarray(X_test_labse,  dtype=np.float32)
# X_test_labse_full = np.asarray(X_test_labse_full, dtype=np.float32)

# np.save("X_train_labse.npy", X_train_labse)
# np.save("X_test_labse.npy", X_test_labse)
# np.save("X_test_labse_full.npy", X_test_labse_full)



# ft_transformer = FastTextTransformer(ft_model=ft)

# X_train_fast = ft_transformer.transform(X_train["feature"].astype(str).to_numpy())
# X_test_fast  = ft_transformer.transform(X_test["feature"].astype(str).to_numpy())
# X_test_fast_full = ft_transformer.transform(X_test_full["feature"].astype(str).to_numpy())

# X_train_fast = np.asarray(X_train_fast, dtype=np.float32)
# X_test_fast  = np.asarray(X_test_fast,  dtype=np.float32)
# X_test_fast_full = np.asarray(X_test_fast_full, dtype=np.float32)

# np.save("X_train_fast.npy", X_train_fast)
# np.save("X_test_fast.npy", X_test_fast)
# np.save("X_test_fast_full.npy", X_test_fast_full)

X_train_labse = np.load("X_train_labse.npy")
X_test_labse  = np.load("X_test_labse.npy")
X_test_labse_full  = np.load("X_test_labse_full.npy")

X_train_minilm = np.load("X_train_minilm.npy")
X_test_minilm  = np.load("X_test_minilm.npy")
X_test_minilm_full = np.load("X_test_minilm_full.npy")

X_train_fast = np.load("X_train_fast.npy")
X_test_fast  = np.load("X_test_fast.npy")
X_test_fast_full  = np.load("X_test_fast_full.npy")


### Sklearn pipelines

Below we define the sk learn pipelines which involve

Define pipelines, 
Note df is full dataset and bdf is balanced dataset

* defining preprocessors (tf-idf, embeddings, bag of words, etc)
* Defining classifiers to train (type of model )
* Defining scoring metrics
* Defining hyperparameter tuning search grids 

In [8]:

preprocessor_tfidf = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(), "feature")  # Apply TfidfVectorizer to the 'feature' column
    ]
)


classifierRF = Pipeline(
    [
        ("preprocessor",preprocessor_tfidf),
        ("rf",RandomForestClassifier(n_estimators=100,random_state=0,criterion="gini",class_weight="balanced"))
    ]
)

classifierSVC = Pipeline(
    [
        ("preprocessor",preprocessor_tfidf),
        ("svc",svm.SVC(random_state=0,class_weight="balanced",probability=True))
    ]
)

classifierLR = Pipeline(
    [
        ("preprocessor",preprocessor_tfidf),
        ("logreg", LogisticRegression(solver="liblinear", random_state=0,max_iter=5000,class_weight='balanced'))
    ]
)


classifierSVC_emb = Pipeline(
    [
        ("scaler", StandardScaler()),  ##### normalizing the sparse embeddings space 
        ("svc",svm.SVC(random_state=0,kernel="linear",class_weight="balanced",probability=True))
    ]
)

classifierLR_emb = Pipeline(
    [
        ("scaler", StandardScaler()),   # normalizing the embeddings space. 
        ("logreg", LogisticRegression(solver="liblinear", random_state=0,max_iter=5000,class_weight='balanced'))
    ]
)



param_grid_svc = {
    "svc__C": [.1,1, 10, 100],
    "svc__kernel": ['linear', 'rbf', 'sigmoid'],
    "svc__gamma": ["auto","scale"],
    # "svc__class_weight":[None, 'balanced']
}

param_grid_svc_emb = {
    "svc__C": [.1,1, 5, 10],
    # "svc__kernel": ['linear', 'rbf', 'sigmoid'],  RMEMBER WHY THIS IS BAD IDEA. CHECK ON GPT WHY ITS BAD FOR TEXT. ony linear good
    # "svc__gamma": ["auto","scale"], 
    # "svc__class_weight":[None, 'balanced']
}


param_grid_rf = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__bootstrap': [True, False],
    'rf__criterion': ['gini', 'entropy'],
    # 'rf__class_weight':[None, 'balanced']
}


param_grid_logreg = {
    'logreg__C': [0.01, 0.1, 1, 10],
    'logreg__penalty': ['l1', 'l2'],
    'logreg__solver': ['liblinear','saga'],
    # 'logreg__class_weight':[None, 'balanced']
}

# please note that given the imblanace in the training data, we use class weight balanced so we penalize more erverley the errors in the
# minority class, so our classifier is not biased towards the majority class which is violent.  



# scorer = make_scorer(accuracy_score)
scorer = make_scorer(precision_score,pos_label=VIOLENT_LABEL,average="binary",zero_division=0.0)

# we optimize for precision because a false positive is more costly than a false negative. 


# best_svc = \
# GridSearchCV(classifierSVC, param_grid_svc,n_jobs=8,cv=5,
#              scoring=scorer
# )
# best_rf = GridSearchCV(classifierRF, param_grid_rf,n_jobs=8,cv=5,
#                        scoring=scorer
#                         )
# best_logreg = GridSearchCV(classifierLR, param_grid_logreg,n_jobs=8,cv=5,scoring=scorer)

In [9]:
models_to_train = {
    'model_svc_tfidf':(classifierSVC,param_grid_svc),
    'model_rf_tfidf': (classifierRF,param_grid_rf),
    'model_logreg_tfidf':(classifierLR, param_grid_logreg),
    
    'r_model_svc_emb_fast':(classifierSVC_emb,param_grid_svc_emb, X_train_fast, y_train),
    'r_model_logreg_emb_fast':(classifierLR_emb, param_grid_logreg, X_train_fast, y_train),
    
    'r_model_svc_emb_minilm':(classifierSVC_emb,param_grid_svc_emb, X_train_minilm, y_train),
    'r_model_logreg_emb_minilm':(classifierLR_emb, param_grid_logreg, X_train_minilm, y_train),

    'r_model_svc_emb_labse':(classifierSVC_emb,param_grid_svc_emb, X_train_labse, y_train),
    'r_model_logreg_emb_labse':(classifierLR_emb, param_grid_logreg, X_train_labse, y_train),

    
}

In [None]:
# trained_models = {}

# for model_name,model_logic in models_to_train.items():
#     classifier, param_grid = model_logic
#     best_model = GridSearchCV(classifier, param_grid,n_jobs=8,cv=5,scoring=scorer)
#     best_model.fit(X_train, y_train)
#     trained_models[model_name] = best_model


In [16]:
import time

trained_models = {}

for model_name, (classifier, param_grid, X_tr, y_tr) in models_to_train.items():
    

    print(f"\nüîÑ Starting: {model_name}")
    start = time.time()

    best_model = GridSearchCV(
        classifier,
        param_grid,
        n_jobs=4,
        cv=5,
        scoring=scorer,
        verbose=2,          # shows progress inside the grid
        return_train_score=False
    )

    best_model.fit(X_tr, y_tr)

    elapsed = time.time() - start
    mins = int(elapsed // 60)
    secs = int(elapsed % 60)

    print(f"‚úÖ Finished: {model_name} in {mins}m {secs}s")
    print(f"   Best CV score: {best_model.best_score_:.4f}")
    print(f"   Best params: {best_model.best_params_}")

    trained_models[model_name] = best_model


üîÑ Starting: r_model_svc_emb_fast
Fitting 5 folds for each of 4 candidates, totalling 20 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Finished: r_model_svc_emb_fast in 12m 58s
   Best CV score: 0.9553
   Best params: {'svc__C': 0.1}

üîÑ Starting: r_model_logreg_emb_fast
Fitting 5 folds for each of 16 candidates, totalling 80 fits
‚úÖ Finished: r_model_logreg_emb_fast in 2m 19s
   Best CV score: 0.9644
   Best params: {'logreg__C': 0.01, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}

üîÑ Starting: r_model_svc_emb_minilm
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END .........................................svc__C=0.1; total time=  17.3s
[CV] END ...........................................svc__C=1; total time=  34.1s
[CV] END ...........................................svc__C=1; total time=  30.4s
[CV] END ...........................................svc__C=5; total time= 2.4min
[CV] END ..........................................svc__C=10; total time= 4.9min
[CV] END logreg__C=0.01, logreg__penalty=l1, logreg__solver=liblinear; total time=   0.1s
[CV] END logreg__C=0.01, logreg__pena

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


‚úÖ Finished: r_model_logreg_emb_minilm in 1m 9s
   Best CV score: 0.9607
   Best params: {'logreg__C': 0.01, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}

üîÑ Starting: r_model_svc_emb_labse
Fitting 5 folds for each of 4 candidates, totalling 20 fits
‚úÖ Finished: r_model_svc_emb_labse in 6m 17s
   Best CV score: 0.9554
   Best params: {'svc__C': 0.1}

üîÑ Starting: r_model_logreg_emb_labse
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END logreg__C=0.01, logreg__penalty=l1, logreg__solver=saga; total time=   0.6s
[CV] END logreg__C=0.01, logreg__penalty=l2, logreg__solver=saga; total time=   0.5s
[CV] END logreg__C=0.01, logreg__penalty=l2, logreg__solver=saga; total time=   0.6s
[CV] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=saga; total time=   2.8s
[CV] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=saga; total time=   2.5s
[CV] END logreg__C=1, logreg__penalty=l1, logreg__solver=liblinear; total time=   1.7s
[CV] END logreg__C=1,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[CV] END logreg__C=0.01, logreg__penalty=l1, logreg__solver=liblinear; total time=   0.1s
[CV] END logreg__C=0.01, logreg__penalty=l2, logreg__solver=liblinear; total time=   0.3s
[CV] END logreg__C=0.01, logreg__penalty=l2, logreg__solver=liblinear; total time=   0.3s
[CV] END logreg__C=0.01, logreg__penalty=l2, logreg__solver=saga; total time=   0.5s
[CV] END logreg__C=0.01, logreg__penalty=l2, logreg__solver=saga; total time=   0.5s
[CV] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=saga; total time=   2.8s
[CV] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=saga; total time=   2.2s
[CV] END logreg__C=1, logreg__penalty=l1, logreg__solver=liblinear; total time=   1.7s
[CV] END logreg__C=1, logreg__penalty=l1, logreg__solver=liblinear; total time=   1.6s
[CV] END logreg__C=1, logreg__penalty=l1, logreg__solver=saga; total time=   8.0s
[CV] END logreg__C=1, logreg__penalty=l1, logreg__solver=saga; total time=   9.0s
[CV] END logreg__C=1, logreg__penalty=l2, logreg__solv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[CV] END logreg__C=0.01, logreg__penalty=l1, logreg__solver=saga; total time=   1.1s
[CV] END logreg__C=0.01, logreg__penalty=l1, logreg__solver=saga; total time=   1.1s
[CV] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=liblinear; total time=   0.5s
[CV] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=liblinear; total time=   0.5s
[CV] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=liblinear; total time=   0.5s
[CV] END logreg__C=0.1, logreg__penalty=l1, logreg__solver=saga; total time=   5.5s
[CV] END logreg__C=0.1, logreg__penalty=l2, logreg__solver=liblinear; total time=   1.2s
[CV] END logreg__C=0.1, logreg__penalty=l2, logreg__solver=liblinear; total time=   1.2s
[CV] END logreg__C=0.1, logreg__penalty=l2, logreg__solver=saga; total time=   4.1s
[CV] END logreg__C=1, logreg__penalty=l1, logreg__solver=liblinear; total time=   3.5s
[CV] END logreg__C=1, logreg__penalty=l1, logreg__solver=liblinear; total time=   3.4s
[CV] END logreg__C=1, logreg__penalty=l1, l

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


‚úÖ Finished: r_model_logreg_emb_labse in 5m 22s
   Best CV score: 0.9695
   Best params: {'logreg__C': 0.01, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}


In [None]:
len(trained_models)

In [17]:
# Save the model
for model_name,model in trained_models.items():
    filename = f'{model_name}.pkl'

    #save model
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

# Load the model
# with open(filename, 'rb') as file:
#     loaded_model = pickle.load(file)

# # Use the loaded model
# predictions = loaded_model.predict(X)

#### List models performance.

In [19]:

def get_X_test(model_name, full=False):
    """
    Returns the correct test matrix depending on:
    - model_name (tfidf / fast / minilm / labse)
    - whether full test set is requested
    """

    if "minilm" in model_name:
        return X_test_minilm_full if full else X_test_minilm

    if "labse" in model_name:
        return X_test_labse_full if full else X_test_labse

    if "fast" in model_name:
        return X_test_fast_full if full else X_test_fast

    # Default: TF-IDF models use raw dataframe
    return X_test_full if full else X_test

model_files = [
    f for f in os.listdir(".")
    if f.endswith("pkl") and os.path.isfile(f)
]

for model_file in model_files:
    
    model_name = model_file.replace(".pkl","")
    print(f"\n#### MODEL: {model_name}")

    with open(model_file, 'rb') as file:
        model = pickle.load(file)
        print(f"\nBest params were :{model.best_params_}")

    #checking performance on training data
    
    # y_pred_model_train = model.predict(X_train)
    # report_model_train = classification_report(y_train, y_pred_model_train)
    
    # checking performance on prod data.

    #TODO HOW DO YOU CONSIDER TEST_BALANCED_DATASET FOR BELOW ? 

    balanced_test_dataset = True
    
    test_features = get_X_test(model_name, full=(not balanced_test_dataset))
    test_labels = y_test if balanced_test_dataset else y_test_full



    y_pred_model = model.predict(test_features)
    report_model = classification_report(test_labels, y_pred_model)
    report_dict = classification_report(test_labels, y_pred_model, output_dict=True)
    print(f"\nThe model {model_name} metrics: \n")
    print(f"Precision: {report_dict['violent']['precision']:.2f}, Recall: {report_dict['violent']['recall']:.2f} \n")
    print("the model report: \n")
    print(report_model)
    print("\n")
    




#### MODEL: model_logreg_tfidf

Best params were :{'logreg__C': 1, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear'}

The model model_logreg_tfidf metrics: 

Precision: 0.91, Recall: 0.82 

the model report: 

              precision    recall  f1-score   support

  nonviolent       0.84      0.92      0.88       913
     violent       0.91      0.82      0.86       913

    accuracy                           0.87      1826
   macro avg       0.87      0.87      0.87      1826
weighted avg       0.87      0.87      0.87      1826




#### MODEL: r_model_logreg_emb_fast

Best params were :{'logreg__C': 0.01, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}

The model r_model_logreg_emb_fast metrics: 

Precision: 0.91, Recall: 0.87 

the model report: 

              precision    recall  f1-score   support

  nonviolent       0.87      0.92      0.90       913
     violent       0.91      0.87      0.89       913

    accuracy                           0.89      1826
   mac

### Discussion on Model rankings and results

1. labse embeddings preprocessing for both svc and logreg seem to be the best models.
2. Dataset is imbalanced, there is no prior info that suggests prior distributions in reality. Balancing the training dataset would have made the models incur in information-signal loss. Nonetheless by using the parameter `class_weight=balanced` we are able to adjust loss with weighting, instead of undersampling/information loss.
3. For the evaluation dataset, the imablanced test dataset boosts precision more. Model rankings stay the same though if using balanced evaluation dataset, but precision is lower. Recall holds. So...labse is indeed the best preprocessing.
4. We should threshold tuned the probabilities for both top wo models, the labse ones, and see which one boosts more precision without punishing recall a lot.

Notes:
In your report:

Present balanced test metrics as primary comparison.

Present imbalanced metrics as operational view.

Emphasize ranking stability.

Because your ranking stability is a very strong result.

GOTCHAS: 

1. WHY LARGE C WAS BAD IN SVC WITH EMBEDDINGS

In linear SVM, we optimize:

1
2
‚à£
‚à£
ùë§
‚à£
‚à£
2
+
ùê∂
‚àë
ùúâ
ùëñ
2
1
	‚Äã

‚à£‚à£w‚à£‚à£
2
+C‚àëŒæ
i
	‚Äã


where:

‚à£
‚à£
ùë§
‚à£
‚à£
2
‚à£‚à£w‚à£‚à£
2
 controls margin size (regularization)

ùê∂
C controls penalty for misclassification

Small C ‚Üí stronger regularization ‚Üí larger margin ‚Üí smoother boundary ‚Üí faster training
Large C ‚Üí weaker regularization ‚Üí tighter margin ‚Üí tries to fit all points ‚Üí slower convergence

In high-dimensional text embeddings (e.g., FastText, MiniLM, LaBSE), classes are often nearly linearly separable. Therefore, small C values (0.1‚Äì1) usually generalize better and train much faster, while large C (10, 100) increases training time dramatically with little or no gain.

2. WHY IT MADE NO SENSE TO TRY OTHER KERNELS IN SVC WITH EMBEDDINGS

Why we use linear SVM (and not RBF/sigmoid) for text embeddings:

Embeddings (FastText, MiniLM, LaBSE) are already nonlinear transformations learned by deep models. They map text into a semantically structured space.

In this high-dimensional space, classes are often nearly linearly separable. A linear decision boundary is usually sufficient.

Nonlinear kernels (RBF, sigmoid):

Increase computational cost significantly (O(n¬≤) or worse)

Risk overfitting

Rarely improve performance for NLP embeddings

Therefore, LinearSVC or LogisticRegression is typically faster, more stable, and just as effective for text classification with embeddings.