### Modelos a entrenar

1. Máquinas de soporte vectorial SVM
2. Bosques Aleatorios RF

In [9]:
from pprint import pprint
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics \
import classification_report, recall_score, accuracy_score,precision_score, make_scorer,confusion_matrix
from sklearn.model_selection import GridSearchCV

def myConfMatrix(confusion_array,labels,columns):
    df = pd.DataFrame(confusion_array, index=labels, columns=columns)
    return df

def balanceDF(df,labels_dict):
    violent = df.loc[df["label"] == labels_dict["VIOLENT"]]
    nonviolent = df.loc[df["label"] == labels_dict["NONVIOLENT"]]
    violent_patched = violent.sample(nonviolent.shape[0],random_state=0)
    #balanced
    bdf = pd.concat([violent_patched,nonviolent])
    return bdf


cm_labels = ['violent', 'nonviolent']
cm_columns = ['Predicted violent', 'predicted nonviolent']

In [10]:


labels_dict = {
    "VIOLENT": "violent",
    "NONVIOLENT":"nonviolent"
}

df = pd.read_csv("../master_data/data.csv")
# mapping label to 1 -> violent 0 -> non-violent
df["label"] = df["label"].apply(lambda x: labels_dict["VIOLENT"] if x == 1 else labels_dict["NONVIOLENT"])
df['feature'] = df['feature'].str.replace('\xa0', ' ', regex=False)
bdf = balanceDF(df,labels_dict)



In [11]:
df.label.value_counts()

label
violent       8689
nonviolent    3028
Name: count, dtype: int64

In [12]:
bdf.label.value_counts()

label
violent       3028
nonviolent    3028
Name: count, dtype: int64

Define pipelines, 
Note df is full dataset and bdf is balanced dataset

In [13]:
# numeric_features = ["length", "punct"]
categorical_features = ["feature"]

# numeric_transformer = Pipeline(
#     steps=[("scaler", StandardScaler())]
# )

# categorical_transformer = Pipeline(
#     steps=[
#         ("squeeze", FunctionTransformer(lambda x: x.squeeze(),validate=True)), # make sure you pass a series
#         ("tfidf",TfidfVectorizer())
#     ]
# )

# preprocessor = ColumnTransformer(
#     transformers=[
#         # ("num", numeric_transformer, numeric_features),
#         ("cat", categorical_transformer, categorical_features)
#     ]
# )

preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(), "feature")  # Apply TfidfVectorizer to the 'feature' column
    ]
)

classifierRF = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("rf",RandomForestClassifier(n_estimators=100,random_state=0,criterion="gini"))
    ]
)

classifierSVC = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("svc",svm.SVC(random_state=0))
    ]
)

classifierSVC2 = Pipeline(
    [
        ("tfidf",TfidfVectorizer()),
        ("svc",svm.SVC(random_state=0))
    ]
)


param_grid_svc = {
    "svc__C": [.1,1, 10, 100],
    "svc__kernel": ['linear', 'rbf', 'sigmoid'],
    "svc__gamma": ["auto","scale"]
}

# param_grid_svc = {
#     "svc__C": [100],
#     "svc__kernel": ['rbf'],
#     "svc__gamma": ["auto"]
# }

best_svc = \
GridSearchCV(classifierSVC, param_grid_svc,n_jobs=8,cv=5,
             #scoring="precision"
             scoring=make_scorer(precision_score,pos_label=labels_dict["VIOLENT"],average="binary",zero_division=0.0)
)

param_grid_rf = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__bootstrap': [True, False],
    'rf__criterion': ['gini', 'entropy']
}

best_rf = GridSearchCV(classifierRF, param_grid_rf,n_jobs=8,cv=5,
                       scoring=make_scorer(precision_score,pos_label=labels_dict["VIOLENT"],average="binary",zero_division=0.0)
                        )


In [6]:
# condition = np.logical_or(bdf["feature"].str.contains("robamaridos"), bdf["feature"].str.contains("ella tiene que pedirle permiso"))


# corpus = bdf.loc[condition,["feature"]].squeeze()
# print(corpus.head())
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(corpus)
# print(vectorizer.get_feature_names_out())

# print(X.shape)
# X.toarray()

transformed_data = preprocessor.fit_transform(X_train)
transformed_data.toarray()[188,:]

NameError: name 'X_train' is not defined

## SVC 
Hyperparameter tuning with both full and balanced dataset

#### Full dataset

In [14]:
data = df
X_train, X_test, y_train, y_test =\
train_test_split(data.loc[:,["feature"]],data["label"], test_size= 0.3, random_state= 1)

In [15]:
best_svc.fit(X_train, y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_svc.best_score_)
print(best_svc.best_params_)
y_pred_svc = best_svc.predict(X_test)
y_pred_svc_train = best_svc.predict(X_train)
report_svc_train = classification_report(y_train, y_pred_svc_train)
print(report_svc_train)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

using precission as the Best parameter to evaluate on (CV score=0.915):
{'svc__C': 10, 'svc__gamma': 'auto', 'svc__kernel': 'linear'}
              precision    recall  f1-score   support

  nonviolent       1.00      0.99      0.99      2115
     violent       1.00      1.00      1.00      6086

    accuracy                           1.00      8201
   macro avg       1.00      1.00      1.00      8201
weighted avg       1.00      1.00      1.00      8201

              precision    recall  f1-score   support

  nonviolent       0.81      0.79      0.80       913
     violent       0.93      0.94      0.93      2603

    accuracy                           0.90      3516
   macro avg       0.87      0.86      0.87      3516
weighted avg       0.90      0.90      0.90      3516



In [170]:
# Define row and column labels

myConfMatrix(confusion_matrix(y_test, y_pred_svc,labels=cm_labels),cm_labels,cm_columns)

Unnamed: 0,Predicted violent,predicted nonviolent
violent,2440,163
nonviolent,196,717


#### Balanced Dataset

In [172]:
data = bdf
X_train, X_test, y_train, y_test =\
train_test_split(data.loc[:,["feature"]],data["label"], test_size= 0.3, random_state= 1)

In [173]:
best_svc.fit(X_train, y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_svc.best_score_)
print(best_svc.best_params_)
y_pred_svc = best_svc.predict(X_test)
y_pred_svc_train = best_svc.predict(X_train)
report_svc_train = classification_report(y_train, y_pred_svc_train)
print(report_svc_train)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

using precission as the Best parameter to evaluate on (CV score=0.964):
{'svc__C': 100, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'}
              precision    recall  f1-score   support

  nonviolent       0.54      1.00      0.70      2122
     violent       0.97      0.14      0.25      2117

    accuracy                           0.57      4239
   macro avg       0.76      0.57      0.47      4239
weighted avg       0.76      0.57      0.47      4239

              precision    recall  f1-score   support

  nonviolent       0.55      1.00      0.71       906
     violent       0.98      0.20      0.33       911

    accuracy                           0.59      1817
   macro avg       0.76      0.60      0.52      1817
weighted avg       0.77      0.59      0.52      1817



In [174]:
# Define row and column labels

myConfMatrix(confusion_matrix(y_test, y_pred_svc,labels=cm_labels),cm_labels,cm_columns)

Unnamed: 0,Predicted violent,predicted nonviolent
violent,178,733
nonviolent,4,902


#### Comments on balanced vs full:

Precision is higher on balanced dataset vs full. .97 > .93, but the loss in recall is gigantic. 
.20 < .94.  

The SVC trained on full data seems to loose a lot of information when balanced, maybe due to the fact that it is being trained on roughly half the data when balanced, and that causes information loss. 

balanced = 6056 observations
full = 11700 observations

In [176]:
bdf.shape

(6056, 4)

## RF

Results on Full and Balanced Dataset

#### Full dataset

In [180]:
data = df
X_train, X_test, y_train, y_test =\
train_test_split(data.loc[:,["feature"]],data["label"], test_size= 0.3, random_state= 1)

In [181]:
best_rf.fit(X_train,y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_rf.best_score_)
pprint(best_rf.best_params_)
y_pred_rf = best_rf.predict(X_test)
y_pred_rf_train = best_rf.predict(X_train)
report_rf_train = classification_report(y_train, y_pred_rf_train)
print(report_rf_train)
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)



using precission as the Best parameter to evaluate on (CV score=0.884):
{'rf__bootstrap': False,
 'rf__criterion': 'entropy',
 'rf__max_depth': None,
 'rf__max_features': 'sqrt',
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__n_estimators': 100}
              precision    recall  f1-score   support

  nonviolent       1.00      1.00      1.00      2115
     violent       1.00      1.00      1.00      6086

    accuracy                           1.00      8201
   macro avg       1.00      1.00      1.00      8201
weighted avg       1.00      1.00      1.00      8201

              precision    recall  f1-score   support

  nonviolent       0.90      0.68      0.77       913
     violent       0.90      0.97      0.93      2603

    accuracy                           0.90      3516
   macro avg       0.90      0.83      0.85      3516
weighted avg       0.90      0.90      0.89      3516



In [182]:
myConfMatrix(confusion_matrix(y_test, y_pred_rf,labels=labels),labels,columns)

Unnamed: 0,Predicted violent,predicted nonviolent
violent,2532,71
nonviolent,293,620


#### Balanced Dataset

In [177]:
X_train, X_test, y_train, y_test =\
train_test_split(bdf.loc[:,["feature"]],bdf["label"], test_size= 0.3, random_state= 1)

In [178]:
best_rf.fit(X_train,y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_rf.best_score_)
pprint(best_rf.best_params_)
y_pred_rf = best_rf.predict(X_test)
y_pred_rf_train = best_rf.predict(X_train)
report_rf_train = classification_report(y_train, y_pred_rf_train)
print(report_rf_train)
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)



using precission as the Best parameter to evaluate on (CV score=0.936):
{'rf__bootstrap': True,
 'rf__criterion': 'entropy',
 'rf__max_depth': 30,
 'rf__max_features': 'log2',
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__n_estimators': 500}
              precision    recall  f1-score   support

  nonviolent       0.89      0.99      0.94      2122
     violent       0.99      0.87      0.93      2117

    accuracy                           0.93      4239
   macro avg       0.94      0.93      0.93      4239
weighted avg       0.94      0.93      0.93      4239

              precision    recall  f1-score   support

  nonviolent       0.78      0.95      0.86       906
     violent       0.94      0.73      0.82       911

    accuracy                           0.84      1817
   macro avg       0.86      0.84      0.84      1817
weighted avg       0.86      0.84      0.84      1817



In [179]:

myConfMatrix(confusion_matrix(y_test, y_pred_rf,labels=labels),labels,columns)

Unnamed: 0,Predicted violent,predicted nonviolent
violent,669,242
nonviolent,43,863


precision .94, recall is .73 not so good with false negatives.

Comment RF Balanced vs Full: 

comparing balanced to full, precision .94 > .90, recall .73 < .97.  The behavior is consistent with what we observed with SVM.
marginal precision comes at a very steep recall cost. 

Comment RF vs SVM: 

SVM is slightly better than RF in terms of precision.

precision  SVM .93 >  RF. 90  recall SVM .94 < .97

Opinion for Giepetto

Here's a breakdown of what might be happening with the SVM and Random Forest (RF) models across the imbalanced and balanced datasets, and guidance on what to prioritize given your use case:

Key Observations
SVM Results:

Imbalanced dataset: Precision = 0.93, Recall = 0.94 (Good balance between precision and recall)

Balanced dataset: Precision = 0.97, Recall = 0.20 (High precision but very low recall)

Why?:

With the balanced dataset, SVM is likely overfitting to the violent class because it now treats the violent and nonviolent classes equally, even though nonviolent is less common in the real-world distribution.
This causes the model to become more conservative, predicting the violent label only when it's very confident, leading to high precision but poor recall.
Random Forest Results:

Imbalanced dataset: Precision = 0.90, Recall = 0.97 (Slight bias toward recall)

Balanced dataset: Precision = 0.94, Recall = 0.73 (More balanced precision-recall tradeoff)

Why?:

RF handles imbalance better than SVM, especially with the imbalanced dataset, because it can inherently deal with class distributions using its splitting criteria (e.g., Gini or entropy).
On the balanced dataset, RF becomes slightly more conservative, improving precision but reducing recall compared to the imbalanced dataset.
What to Prioritize?
Given that false positives (nonviolent classified as violent) are more critical than false negatives (violent classified as nonviolent), precision is more important in your case.

However, you also need an acceptable level of recall to ensure the model isn't ignoring too many violent instances.

Dataset Choice
Imbalanced Dataset:

Better represents the real-world distribution.
Produces models with higher recall and acceptable precision (RF: 0.90/0.97; SVM: 0.93/0.94).
Recommended if the goal is to generalize to real-world scenarios without resampling artifacts.
Balanced Dataset:

Useful if you want to emphasize precision (e.g., with SVM: 0.97 precision), but at the cost of significantly lower recall.
May not represent the natural data distribution and could lead to suboptimal performance on real-world data.
Model Choice
Random Forest seems to be performing more consistently across both datasets:

On the imbalanced dataset, RF achieves high recall (0.97) with decent precision (0.90).
On the balanced dataset, RF strikes a better balance (0.94 precision, 0.73 recall) compared to SVM.
RF's ability to handle imbalance natively makes it a robust choice, especially if you focus on imbalanced data.
SVM, while achieving higher precision on the balanced dataset (0.97), suffers from extremely low recall (0.20), which may not be acceptable for detecting violent cases.

Recommendation
Use the imbalanced dataset with Random Forest:

Precision = 0.90, Recall = 0.97
This combination ensures you capture most violent cases (high recall) while keeping false positives (nonviolent misclassified as violent) relatively low.
Consider threshold adjustment:

For RF, adjust the decision threshold to further fine-tune the precision-recall tradeoff based on your specific requirements.
Monitor real-world performance:

Test the selected model on a held-out or real-world dataset to confirm it performs well on the expected distribution.


If you're uncertain about the real-world prevalence of hate speech, here are strategies to handle this ambiguity:

1. Balanced Dataset for Training
Using the balanced dataset (e.g., 3000 hate speech and 3000 non-hate speech) might help avoid biasing the model toward hate speech and result in a more conservative classifier.
While this might slightly under-detect hate speech (lower recall), it aligns better with your goal of minimizing false positives.
2. Imbalanced Dataset with Class Weighting
Train on the full imbalanced dataset but apply class weighting to penalize misclassifications of non-hate speech more heavily.
Example with Scikit-learn:
python
Copy code
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    class_weight={'violent': 1, 'nonviolent': 2},  # Adjust weights to penalize nonviolent misclassification more
    random_state=42
)
3. Threshold Adjustment
Even with an imbalanced dataset, you can adjust the decision threshold for the hate speech class to prioritize precision. This reduces the likelihood of false positives.
4. External Validation
Validate your model on a separate dataset (if available) that reflects a more typical distribution of hate vs. non-hate speech to ensure generalizability.

Given the fact that the full dataset is very imbalanced. we will compare both best models on an identical test data. 

In [None]:
# param_grid_rf = {
#     "rf__C": [1, 10, 100],
#     "rf__kernel": ['linear', 'rbf', 'sigmoid'],
#     "rf__gamma": ["auto","scale"]
# }

param_grid_rf = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'max_features': ['sqrt', 'log2'],
    # 'bootstrap': [True, False],
    # 'criterion': ['gini', 'entropy']
}
best_rf = GridSearchCV(classifierRF, param_grid_rf,n_jobs=8,cv=5,scoring=make_scorer(precision_score,pos_label=VIOLENT,average="binary"))

In [None]:
best_rf.fit(X_train,y_train)


In [None]:
best_rf.fit(X_train,y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_rf.best_score_)
pprint(best_rf.best_params_)
y_pred_rf = best_rf.predict(X_test)
y_pred_rf_train = best_rf.predict(X_train)
report_rf_train = classification_report(y_train, y_pred_rf_train)
print(report_rf_train)
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

In [None]:
oneTest = pd.DataFrame(pd.Series([
    "Las mujeres pertenencen en la cocina, y la lavandería",
    "Casarse es la función de una mujer",
    "El amor es lo mas puro",
    "texto lucha amor"
]),columns=["feature"])
best_svc.predict(oneTest)

Revisemos ahora el desempeño del modelo sin cross validation

In [133]:
categorical_features = ["feature"]

# numeric_transformer = Pipeline(
#     steps=[("scaler", StandardScaler())]
# )

categorical_transformer = Pipeline(
    steps=[
        ("squeez", FunctionTransformer(lambda x: x.squeeze())), # make sure you pass a series
        ("tfidf",TfidfVectorizer())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        # ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

classifierSVC = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("svc",svm.SVC(random_state=0))
    ]
)

classifierSVC2 = Pipeline(
    [
        ("tfidf",TfidfVectorizer()),
        ("svc",svm.SVC(random_state=0))
    ]
)


In [None]:
mod_svc_simple = classifierSVC2.fit(X_train.squeeze(),y_train)

In [None]:
y_pred = mod_svc_simple.predict(X_test.squeeze())

In [None]:
y_pred

In [None]:
report = classification_report(y_test, y_pred, output_dict=True)
pprint(report[VIOLENT])
pprint(report[NONVIOLENT])

In [None]:
X_test.head()

In [None]:
y_test.head()