### Modelos a entrenar

1. M√°quinas de soporte vectorial SVM
2. Bosques Aleatorios RF
3. Regresi√≥n Logistica

In [13]:
from pprint import pprint
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics \
import classification_report, recall_score, accuracy_score,precision_score, make_scorer,confusion_matrix
from sklearn.model_selection import GridSearchCV

import pickle

def myConfMatrix(confusion_array,labels,columns):
    df = pd.DataFrame(confusion_array, index=labels, columns=columns)
    return df

def balanceDF(df,labels_dict):
    violent = df.loc[df["label"] == labels_dict["VIOLENT"]]
    nonviolent = df.loc[df["label"] == labels_dict["NONVIOLENT"]]
    violent_patched = violent.sample(nonviolent.shape[0],random_state=0)
    #balanced
    bdf = pd.concat([violent_patched,nonviolent])
    return bdf


cm_labels = ['violent', 'nonviolent']
cm_columns = ['Predicted violent', 'predicted nonviolent']

* Map to Violent and Non-Violent tags. 
* Create total dataset and balanced dataset. 

In [147]:


labels_dict = {
    "VIOLENT": "violent",
    "NONVIOLENT":"nonviolent"
}

df = pd.read_csv("../master_data/data.csv")
# mapping label to 1 -> violent 0 -> non-violent
df["label"] = df["label"].apply(lambda x: labels_dict["VIOLENT"] if x == 1 else labels_dict["NONVIOLENT"])
df['feature'] = df['feature'].str.replace('\xa0', ' ', regex=False)



# df["word_count"] = df["feature"].str.split().str.len()
# df["word_count"].value_counts().sort_index()

In [148]:
df.head()

Unnamed: 0,feature,label,tweet_id,origin
0,es sexy.,violent,,rnn
1,eres m√≠a.,violent,,rnn
2,la gorda.,violent,,rnn
3,a fregar.,violent,,rnn
4,ca- gona.,violent,,rnn


In [4]:
df.shape

(11717, 4)

In [149]:
df.label.value_counts()

label
violent       8689
nonviolent    3028
Name: count, dtype: int64

In [6]:
bdf.label.value_counts()

label
violent       3028
nonviolent    3028
Name: count, dtype: int64

Define pipelines, 
Note df is full dataset and bdf is balanced dataset

In [141]:

categorical_features = ["feature"]

numeric_features = ["length", "punct"]

# numberic featuers will be ignored as they are non-informative, as can be seen on EDA.

    # numeric_transformer = Pipeline(
    #     steps=[("scaler", StandardScaler())]
    # )
    
    # categorical_transformer = Pipeline(
    #     steps=[
    #         ("squeeze", FunctionTransformer(lambda x: x.squeeze(),validate=True)), # make sure you pass a series
    #         ("tfidf",TfidfVectorizer())
    #     ]
    # )
    
    # preprocessor = ColumnTransformer(
    #     transformers=[
    #         # ("num", numeric_transformer, numeric_features),
    #         ("cat", categorical_transformer, categorical_features)
    #     ]
    # )

preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(), "feature")  # Apply TfidfVectorizer to the 'feature' column
    ]
)

classifierRF = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("rf",RandomForestClassifier(n_estimators=100,random_state=0,criterion="gini",class_weight="balanced"))
    ]
)

classifierSVC = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("svc",svm.SVC(random_state=0,class_weight="balanced"))
    ]
)

classifierLR = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("logreg", LogisticRegression(solver="liblinear", random_state=0,max_iter=100,class_weight='balanced'))
    ]
)



param_grid_svc = {
    "svc__C": [.1,1, 10, 100],
    "svc__kernel": ['linear', 'rbf', 'sigmoid'],
    "svc__gamma": ["auto","scale"],
    "svc__class_weight":[None, 'balanced']
}


param_grid_rf = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__bootstrap': [True, False],
    'rf__criterion': ['gini', 'entropy'],
    'rf__class_weight':[None, 'balanced']
}


param_grid_logreg = {
    'logreg__C': [0.01, 0.1, 1, 10],
    'logreg__penalty': ['l1', 'l2'],
    'logreg__solver': ['liblinear','saga'],
    'logreg__class_weight':[None, 'balanced']
}

# please note that given the imblanace in the training data, we use clss weight balanced so we penalize more erverley the errors in the
# minority class, so our classifier is not biased towards the majority class which is violent. 



# scorer = make_scorer(accuracy_score)
scorer = make_scorer(precision_score,pos_label=labels_dict["VIOLENT"],average="binary",zero_division=0.0)

# we optimize for precision because a false positive is more costly than a false negative. 


# best_svc = \
# GridSearchCV(classifierSVC, param_grid_svc,n_jobs=8,cv=5,
#              scoring=scorer
# )
# best_rf = GridSearchCV(classifierRF, param_grid_rf,n_jobs=8,cv=5,
#                        scoring=scorer
#                         )
# best_logreg = GridSearchCV(classifierLR, param_grid_logreg,n_jobs=8,cv=5,scoring=scorer)

In [None]:
# condition = np.logical_or(bdf["feature"].str.contains("robamaridos"), bdf["feature"].str.contains("ella tiene que pedirle permiso"))


# corpus = bdf.loc[condition,["feature"]].squeeze()
# print(corpus.head())
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(corpus)
# print(vectorizer.get_feature_names_out())

# print(X.shape)
# X.toarray()

# transformed_data = preprocessor.fit_transform(X_train)
# transformed_data.toarray()[188,:]

In [43]:
bdf.label.value_counts()

label
violent       3028
nonviolent    3028
Name: count, dtype: int64

In [140]:
from sklearn.utils import resample

data = df
X_train, X_test_full, y_train, y_test_full =\
train_test_split(data.loc[:,["feature"]],data["label"], test_size= 0.3, random_state= 1)
print(df.shape)
print(y_test_full.value_counts())

# --- join X and y for resampling ---
test_df = X_test_full.copy()
test_df["label"] = y_test_full.values

# --- separate classes ---
violent_df     = test_df[test_df["label"] == "violent"]
nonviolent_df  = test_df[test_df["label"] == "nonviolent"]

# target size = minority count (here nonviolent is smaller: 913)
target_n = 913

# --- downsample majority to match minority ---
violent_down = resample(violent_df,     replace=False, n_samples=target_n, random_state=1)
nonviolent_k = resample(nonviolent_df,  replace=False, n_samples=target_n, random_state=1)

balanced_test = pd.concat([violent_down, nonviolent_k]).sample(frac=1, random_state=1)  # shuffle

# --- split back to X, y ---
X_test = balanced_test["feature"]
y_test = balanced_test["label"]

print(y_test_full.value_counts())  # original imbalanced test
print(y_test.value_counts())   # now balanced 1:1


#### why do we do this ? 

### IMPORTANT EXPLANATION

# our test dataset as is is imbalanced 70-30, (violent-non violent respectively)
# there is nothing in research to suggest that the natural priors resemble these prior distributions.
# Hence validating models on an imbalanced violent dataset would artificially boost precision for identifying violent texts.
# It is safer to validate detectors on a balanced dataset. 


(11717, 4)
label
violent       2603
nonviolent     913
Name: count, dtype: int64
label
violent       2603
nonviolent     913
Name: count, dtype: int64
label
violent       913
nonviolent    913
Name: count, dtype: int64


(6056, 4)

In [142]:
models_to_train = {
    'svc':(classifierSVC,param_grid_svc),
    'rf': (classifierRF,param_grid_rf),
    'logreg':(classifierLR, param_grid_logreg)
}


In [143]:
trained_models = {}

for model_name,model_logic in models_to_train.items():
    classifier, param_grid = model_logic
    best_model = GridSearchCV(classifier, param_grid,n_jobs=8,cv=5,scoring=scorer)
    best_model.fit(X_train, y_train)
    trained_models[model_name] = best_model





In [144]:
import pickle
# for model_name,model in trained_models.items():
#     filename = f'{model_name}_all2.pkl'
#     with open(filename, 'wb') as file:
#         pickle.dump(model, file)

In [12]:
# Save the model
import pickle
# for model_name,model in trained_models.items():
#     filename = f'{model_name}_bdf.pkl'
#     with open(filename, 'wb') as file:
#         pickle.dump(model, file)

# # Load the model
# with open(filename, 'rb') as file:
#     loaded_model = pickle.load(file)

# # Use the loaded model
# predictions = loaded_model.predict(X)

In [135]:
X_test.to_frame()

Unnamed: 0,feature
2401,Pregunta feminista seria: si una mujer es mas ...
4236,Uno deber√≠a estar siempre enamorado. Por eso j...
4545,Dios es para los hombres y la religi√≥n para la...
1885,Las mujeres escuchen en silencio las instrucci...
6785,"el cuerpo de la mujer y el calz√≥n , y maldita ..."
...,...
11571,üî¥#ACTUALIZACI√ìN Greta Thunberg se presenta por...
3705,"La ciencia se compone de errores, que a su vez..."
4825,En los inicios de un amor los amantes hablan d...
6400,que puta.


In [145]:
dataset_type = "all2"

for model_name,_ in models_to_train.items():
    print(f"#######MODEL: {model_name}")
    with open(model_name+f"_{dataset_type}.pkl", 'rb') as file:
        model = pickle.load(file)
    print(f"Best parameters to evaluate on (CV score={model.best_score_:.3f}):")
    print(f"Best params were :{model.best_params_}")
    y_pred_model = model.predict(X_test.to_frame())
    y_pred_model_train = model.predict(X_train)
    report_model_train = classification_report(y_train, y_pred_model_train)
    # print(report_model_train)
    report_model = classification_report(y_test, y_pred_model)
    print(report_model)

#######MODEL: svc
Best parameters to evaluate on (CV score=0.946):
Best params were :{'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale', 'svc__kernel': 'sigmoid'}
              precision    recall  f1-score   support

  nonviolent       0.90      0.88      0.89       913
     violent       0.88      0.91      0.89       913

    accuracy                           0.89      1826
   macro avg       0.89      0.89      0.89      1826
weighted avg       0.89      0.89      0.89      1826

#######MODEL: rf
Best parameters to evaluate on (CV score=0.973):
Best params were :{'rf__bootstrap': False, 'rf__class_weight': 'balanced', 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 10, 'rf__n_estimators': 500}
              precision    recall  f1-score   support

  nonviolent       0.76      0.96      0.84       913
     violent       0.94      0.69      0.80       913

    accuracy                 

#### The best model is the SVC.

## Embeddings

In [153]:
!uv pip install gensim sentence-transformers

[2mUsing Python 3.9.16 environment at: /Users/fbahena/Desktop/growth/masters/nlp_course/ia_hate/.venv[0m
[2K[2mResolved [1m32 packages[0m [2min 4.25s[0m[0m                                        [0m
[2K[2mPrepared [1m12 packages[0m [2min 52.39s[0m[0m                                           
[2mUninstalled [1m1 package[0m [2min 8ms[0m[0m
[2K[2mInstalled [1m14 packages[0m [2min 211ms[0m[0m                              [0m
 [32m+[39m [1mfilelock[0m[2m==3.19.1[0m
 [32m+[39m [1mfsspec[0m[2m==2025.10.0[0m
 [32m+[39m [1mhf-xet[0m[2m==1.2.0[0m
 [32m+[39m [1mhuggingface-hub[0m[2m==0.36.0[0m
 [32m+[39m [1mmpmath[0m[2m==1.3.0[0m
 [32m+[39m [1mnetworkx[0m[2m==3.2.1[0m
 [32m+[39m [1mregex[0m[2m==2025.11.3[0m
 [32m+[39m [1msafetensors[0m[2m==0.6.2[0m
 [32m+[39m [1msentence-transformers[0m[2m==5.1.2[0m
 [32m+[39m [1msympy[0m[2m==1.14.0[0m
 [32m+[39m [1mtokenizers[0m[2m==0.22.1[0m
 [32m+[39m [1mtorch

Lets start with static embeddings, which is embeddings for words and then we will mean the pero word embeddings for each sentence. The mean reduction is quite important althoug simple it does provide a good semantic summary. 

In [156]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz

--2025-11-19 01:44:09--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 2600:9000:289d:2c00:13:6e38:acc0:93a1, 2600:9000:289d:5600:13:6e38:acc0:93a1, 2600:9000:289d:7600:13:6e38:acc0:93a1, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|2600:9000:289d:2c00:13:6e38:acc0:93a1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4500107671 (4.2G) [application/octet-stream]
Saving to: ‚Äòcc.es.300.bin.gz‚Äô


2025-11-19 02:13:13 (2.46 MB/s) - ‚Äòcc.es.300.bin.gz‚Äô saved [4500107671/4500107671]



In [158]:
from gensim.models.fasttext import load_facebook_vectors 

ft = load_facebook_vectors("cc.es.300.bin") 

def sentence_vec(s): 
    toks = s.lower().split() 
    vecs = [ft[w] for w in toks if w in ft.key_to_index] 
    return np.mean(vecs, axis=0) if vecs else np.zeros(ft.vector_size)

In [159]:
import numpy as np
from tqdm import tqdm  # optional progress bar

# Assuming your data:
# X_train, y_train, X_test, y_test
# Each X_* contains the text sentences (str)

# 1. Compute sentence vectors
def build_fasttext_matrix(texts):
    return np.vstack([sentence_vec(s) for s in tqdm(texts)])

# 2. Transform both train and test sets
X_train_vec = build_fasttext_matrix(X_train)
X_test_vec  = build_fasttext_matrix(X_test)

print("Train matrix shape:", X_train_vec.shape)
print("Test matrix shape:", X_test_vec.shape)

  0%|                                                                                                                                                                          | 1/8201 [00:00<00:01, 4275.54it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1826/1826 [00:00<00:00, 32548.39it/s]

Train matrix shape: (1, 300)
Test matrix shape: (1826, 300)





In [161]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

clf = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred, digits=3))

ValueError: Found input variables with inconsistent numbers of samples: [1, 8201]

In [22]:
data = df
X_train, X_test, y_train, y_test =\
train_test_split(data.loc[:,["feature"]],data["label"], test_size= 0.3, random_state= 1)

In [23]:
# best_svc.fit(X_train, y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_svc.best_score_)
print(best_svc.best_params_)
y_pred_svc = best_svc.predict(X_test)
y_pred_svc_train = best_svc.predict(X_train)
report_svc_train = classification_report(y_train, y_pred_svc_train)
print(report_svc_train)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

NameError: name 'best_svc' is not defined

In [None]:
# Define row and column labels

myConfMatrix(confusion_matrix(y_test, y_pred_svc,labels=cm_labels),cm_labels,cm_columns)

#### Balanced Dataset

In [None]:
data = bdf
X_train, X_test, y_train, y_test =\
train_test_split(data.loc[:,["feature"]],data["label"], test_size= 0.3, random_state= 1)

In [None]:
best_svc.fit(X_train, y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_svc.best_score_)
print(best_svc.best_params_)
y_pred_svc = best_svc.predict(X_test)
y_pred_svc_train = best_svc.predict(X_train)
report_svc_train = classification_report(y_train, y_pred_svc_train)
print(report_svc_train)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

In [None]:
y_pred_svc

In [None]:
confusion_matrix(y_test, y_pred_svc,labels=cm_labels)

In [None]:
# Define row and column labels

myConfMatrix(confusion_matrix(y_test, y_pred_svc,labels=cm_labels),cm_labels,cm_columns)

#### Comments on balanced vs full:

Precision is higher on balanced dataset vs full. .97 > .93, but the loss in recall is gigantic. 
.20 < .94.  

The SVC trained on full data seems to loose a lot of information when balanced, maybe due to the fact that it is being trained on roughly half the data when balanced, and that causes information loss. 

balanced = 6056 observations
full = 11700 observations

In [None]:
bdf.shape

## RF

Results on Full and Balanced Dataset

#### Full dataset

In [None]:
data = df
X_train, X_test, y_train, y_test =\
train_test_split(data.loc[:,["feature"]],data["label"], test_size= 0.3, random_state= 1)

In [None]:
best_rf.fit(X_train,y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_rf.best_score_)
pprint(best_rf.best_params_)
y_pred_rf = best_rf.predict(X_test)
y_pred_rf_train = best_rf.predict(X_train)
report_rf_train = classification_report(y_train, y_pred_rf_train)
print(report_rf_train)
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

In [None]:
myConfMatrix(confusion_matrix(y_test, y_pred_rf,labels=labels),labels,columns)

#### Balanced Dataset

In [None]:
X_train, X_test, y_train, y_test =\
train_test_split(bdf.loc[:,["feature"]],bdf["label"], test_size= 0.3, random_state= 1)

In [None]:
best_rf.fit(X_train,y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_rf.best_score_)
pprint(best_rf.best_params_)
y_pred_rf = best_rf.predict(X_test)
y_pred_rf_train = best_rf.predict(X_train)
report_rf_train = classification_report(y_train, y_pred_rf_train)
print(report_rf_train)
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

In [None]:

myConfMatrix(confusion_matrix(y_test, y_pred_rf,labels=labels),labels,columns)

precision .94, recall is .73 not so good with false negatives.

Comment RF Balanced vs Full: 

comparing balanced to full, precision .94 > .90, recall .73 < .97.  The behavior is consistent with what we observed with SVM.
marginal precision comes at a very steep recall cost. 

Comment RF vs SVM: 

SVM is slightly better than RF in terms of precision.

precision  SVM .93 >  RF. 90  recall SVM .94 < .97

Opinion for Giepetto

Here's a breakdown of what might be happening with the SVM and Random Forest (RF) models across the imbalanced and balanced datasets, and guidance on what to prioritize given your use case:

Key Observations
SVM Results:

Imbalanced dataset: Precision = 0.93, Recall = 0.94 (Good balance between precision and recall)

Balanced dataset: Precision = 0.97, Recall = 0.20 (High precision but very low recall)

Why?:

With the balanced dataset, SVM is likely overfitting to the violent class because it now treats the violent and nonviolent classes equally, even though nonviolent is less common in the real-world distribution.
This causes the model to become more conservative, predicting the violent label only when it's very confident, leading to high precision but poor recall.
Random Forest Results:

Imbalanced dataset: Precision = 0.90, Recall = 0.97 (Slight bias toward recall)

Balanced dataset: Precision = 0.94, Recall = 0.73 (More balanced precision-recall tradeoff)

Why?:

RF handles imbalance better than SVM, especially with the imbalanced dataset, because it can inherently deal with class distributions using its splitting criteria (e.g., Gini or entropy).
On the balanced dataset, RF becomes slightly more conservative, improving precision but reducing recall compared to the imbalanced dataset.
What to Prioritize?
Given that false positives (nonviolent classified as violent) are more critical than false negatives (violent classified as nonviolent), precision is more important in your case.

However, you also need an acceptable level of recall to ensure the model isn't ignoring too many violent instances.

Dataset Choice
Imbalanced Dataset:

Better represents the real-world distribution.
Produces models with higher recall and acceptable precision (RF: 0.90/0.97; SVM: 0.93/0.94).
Recommended if the goal is to generalize to real-world scenarios without resampling artifacts.
Balanced Dataset:

Useful if you want to emphasize precision (e.g., with SVM: 0.97 precision), but at the cost of significantly lower recall.
May not represent the natural data distribution and could lead to suboptimal performance on real-world data.
Model Choice
Random Forest seems to be performing more consistently across both datasets:

On the imbalanced dataset, RF achieves high recall (0.97) with decent precision (0.90).
On the balanced dataset, RF strikes a better balance (0.94 precision, 0.73 recall) compared to SVM.
RF's ability to handle imbalance natively makes it a robust choice, especially if you focus on imbalanced data.
SVM, while achieving higher precision on the balanced dataset (0.97), suffers from extremely low recall (0.20), which may not be acceptable for detecting violent cases.

Recommendation
Use the imbalanced dataset with Random Forest:

Precision = 0.90, Recall = 0.97
This combination ensures you capture most violent cases (high recall) while keeping false positives (nonviolent misclassified as violent) relatively low.
Consider threshold adjustment:

For RF, adjust the decision threshold to further fine-tune the precision-recall tradeoff based on your specific requirements.
Monitor real-world performance:

Test the selected model on a held-out or real-world dataset to confirm it performs well on the expected distribution.


If you're uncertain about the real-world prevalence of hate speech, here are strategies to handle this ambiguity:

1. Balanced Dataset for Training
Using the balanced dataset (e.g., 3000 hate speech and 3000 non-hate speech) might help avoid biasing the model toward hate speech and result in a more conservative classifier.
While this might slightly under-detect hate speech (lower recall), it aligns better with your goal of minimizing false positives.
2. Imbalanced Dataset with Class Weighting
Train on the full imbalanced dataset but apply class weighting to penalize misclassifications of non-hate speech more heavily.
Example with Scikit-learn:
python
Copy code
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    class_weight={'violent': 1, 'nonviolent': 2},  # Adjust weights to penalize nonviolent misclassification more
    random_state=42
)
3. Threshold Adjustment
Even with an imbalanced dataset, you can adjust the decision threshold for the hate speech class to prioritize precision. This reduces the likelihood of false positives.
4. External Validation
Validate your model on a separate dataset (if available) that reflects a more typical distribution of hate vs. non-hate speech to ensure generalizability.

Given the fact that the full dataset is very imbalanced. we will compare both best models on an identical test data. 

In [None]:
# param_grid_rf = {
#     "rf__C": [1, 10, 100],
#     "rf__kernel": ['linear', 'rbf', 'sigmoid'],
#     "rf__gamma": ["auto","scale"]
# }

param_grid_rf = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'max_features': ['sqrt', 'log2'],
    # 'bootstrap': [True, False],
    # 'criterion': ['gini', 'entropy']
}
best_rf = GridSearchCV(classifierRF, param_grid_rf,n_jobs=8,cv=5,scoring=make_scorer(precision_score,pos_label=VIOLENT,average="binary"))

In [None]:
best_rf.fit(X_train,y_train)


In [None]:
best_rf.fit(X_train,y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_rf.best_score_)
pprint(best_rf.best_params_)
y_pred_rf = best_rf.predict(X_test)
y_pred_rf_train = best_rf.predict(X_train)
report_rf_train = classification_report(y_train, y_pred_rf_train)
print(report_rf_train)
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

In [None]:
oneTest = pd.DataFrame(pd.Series([
    "Las mujeres pertenencen en la cocina, y la lavander√≠a",
    "Casarse es la funci√≥n de una mujer",
    "El amor es lo mas puro",
    "texto lucha amor"
]),columns=["feature"])
best_svc.predict(oneTest)

Revisemos ahora el desempe√±o del modelo sin cross validation

In [None]:
categorical_features = ["feature"]

# numeric_transformer = Pipeline(
#     steps=[("scaler", StandardScaler())]
# )

categorical_transformer = Pipeline(
    steps=[
        ("squeez", FunctionTransformer(lambda x: x.squeeze())), # make sure you pass a series
        ("tfidf",TfidfVectorizer())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        # ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

classifierSVC = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("svc",svm.SVC(random_state=0))
    ]
)

classifierSVC2 = Pipeline(
    [
        ("tfidf",TfidfVectorizer()),
        ("svc",svm.SVC(random_state=0))
    ]
)


In [None]:
mod_svc_simple = classifierSVC2.fit(X_train.squeeze(),y_train)

In [None]:
y_pred = mod_svc_simple.predict(X_test.squeeze())

In [None]:
y_pred

In [None]:
report = classification_report(y_test, y_pred, output_dict=True)
pprint(report[VIOLENT])
pprint(report[NONVIOLENT])

In [None]:
X_test.head()

In [None]:
y_test.head()