# Some Machine Learning Models

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import emoji
import spacy
import fasttext
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC

In [2]:
# Change the emoji into corresponding meaning
df = pd.DataFrame(emoji.EMOJI_DATA).T
emoji_df = df['en']
emoji_df = emoji_df.apply(lambda x: ' '.join(k for k in x[1:-1].split('_')))
dic_emoji = emoji_df.to_dict()

In [3]:
nlp = spacy.load('en_core_web_sm')
def preprocessing_text(text): 
    clean_text = text.lower()
    doc = nlp(clean_text)
    tokens = [token.text.strip() for token in doc]
    list = []
    for k in tokens:
        if k not in dic_emoji.keys():
            list.append(k)
        else:
            list.append(dic_emoji[k])
    return ' '.join(list)

In [4]:
# Initialize Hashing Vectorizer
hash_vectorizer = HashingVectorizer(n_features=1000, ngram_range=(1, 2), norm='l2')


In [5]:
# Initialize Tfidf Vectorizer
idf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000,
                             min_df=2, max_df=0.8, norm='l2', use_idf=True, smooth_idf=True)

## Test 1.1 Binary classifier that can distinguish whether the comments are "hateful"(1) or "non-hateful"(0)

### Test 1.1.1 Binary classifier that are trained by Kirk's data and tested by Kirk's data

- df (using Kirk's " train + validation" data as the training data)\
training dataset =  Krik train + Krik validation (5319 messages)

- df1 (using Kirk's "test" data as the testing data)\
test dataset = Krik test (593 messages)

In [6]:
df_train = pd.concat([pd.read_csv("../Training_machine_learning_model/HatemojiBuild_from_Kirk/train.csv"), pd.read_csv("../Training_machine_learning_model/HatemojiBuild_from_Kirk/validation.csv")])
df_train = df_train.reset_index(drop=True)

In [7]:
df1_test = pd.read_csv("../Training_machine_learning_model/HatemojiBuild_from_Kirk/test.csv")

In [8]:
# Prepare the data for training and testing (lowercase the letters)
X_train = df_train['text'].apply(lambda x: (preprocessing_text(x)))
y_train = df_train['label_gold']
X_test_1 = df1_test['text'].apply(lambda x: (preprocessing_text(x)))
y_test_1 = df1_test['label_gold']

In [9]:
X_train

0       i would love to kitchen knife some person wear...
1       i would love to kitchen knife some man dark sk...
2                  i would love to kitchen knife some fox
3       person wearing turban medium skin tone make me...
4       literally all man dark skin tone men act like ...
                              ...                        
5314    ca nt believe every handman on checkatrade is ...
5315    " i was born with a penis but i have always be...
5316              my dad said i hate pakis anguished face
5317    my dad he hates maccies rolling on the floor l...
5318    i went to the shop and called it a paki shop ,...
Name: text, Length: 5319, dtype: object

In [10]:
# Hashing Vectorizer
X_train_hash = hash_vectorizer.fit_transform(X_train)
X_test_1_hash = hash_vectorizer.transform(X_test_1)
# Tfidf Vectorizer
X_train_idf = idf_vectorizer.fit_transform(X_train)
X_test_1_idf = idf_vectorizer.transform(X_test_1)

In [11]:
# Process the data for fasttext
df_train['train_data'] = '__label__' + df_train['label_gold'].astype(str) + ' ' + df_train['text'].apply(lambda x: (preprocessing_text(x)))
df_train['train_data'].to_csv('train_data_1.txt', index=False, header=False, sep='\t')

df1_test['test_data'] = '__label__' + df1_test['label_gold'].astype(str) + ' ' + df1_test['text'].apply(lambda x: (preprocessing_text(x)))
df1_test['test_data'].to_csv('test_data_1_1.txt', index=False, header=False, sep='\t')

In [12]:
# Using fasttext to train the word vector
train_data = "train_data_1.txt"
model_path = "emoji_based_hate_speech_model_1_1.bin"

model = fasttext.train_unsupervised(train_data, dim=100)
model.save_model(model_path)

def encode_text_with_fasttext(series):
    encoded_data = series.apply(lambda x: model.get_sentence_vector(x.replace('\n', ' ')))
    return encoded_data

X_train_fast = np.vstack(encode_text_with_fasttext(X_train).values)
X_test_1_fast = np.vstack(encode_text_with_fasttext(X_test_1).values)

#### Logistic Regression

In [11]:
# Define a function to fit the LogisticRegression model with the grid research
def lr_model(X_train_vec, y_train, X_test_vec, y_test):
    lr = LogisticRegression()

    params = {
        'C': [0.001, 0.01, 0.1, 1.0, ],
        'penalty': ['l2'],
        'solver': ['sag', 'saga'],
        'class_weight': ['balanced'],
        'max_iter': [100, 200, 500]
    }
    
    
    grid_search = GridSearchCV(lr, params, cv=5,n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_lr = grid_search.best_estimator_
    best_lr.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_lr.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [14]:
# Using hashing vectorizer
print(lr_model(X_train_hash, y_train, X_test_1_hash, y_test_1))

Best Hyperparameters: {'C': 0.1, 'class_weight': 'balanced', 'max_iter': 200, 'penalty': 'l2', 'solver': 'saga'}
Best accuracy score: 0.5829992431690704
              precision    recall  f1-score   support

           0     0.5438    0.6316    0.5844       285
           1     0.5992    0.5097    0.5509       308

    accuracy                         0.5683       593
   macro avg     0.5715    0.5707    0.5676       593
weighted avg     0.5726    0.5683    0.5670       593

[[180 105]
 [151 157]]
None


In [15]:
# Using tfidf vectorizer
print(lr_model(X_train_idf, y_train, X_test_1_idf, y_test_1))

Best Hyperparameters: {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Best accuracy score: 0.6196577992488276
              precision    recall  f1-score   support

           0     0.5734    0.5895    0.5813       285
           1     0.6100    0.5942    0.6020       308

    accuracy                         0.5919       593
   macro avg     0.5917    0.5918    0.5916       593
weighted avg     0.5924    0.5919    0.5920       593

[[168 117]
 [125 183]]
None


In [17]:
# Using fasttext
print(lr_model(X_train_fast, y_train, X_test_1_fast, y_test_1))

Best Hyperparameters: {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'sag'}
Best accuracy score: 0.5337421045558393
              precision    recall  f1-score   support

           0     0.5159    0.7404    0.6081       285
           1     0.5978    0.3571    0.4472       308

    accuracy                         0.5413       593
   macro avg     0.5569    0.5487    0.5276       593
weighted avg     0.5584    0.5413    0.5245       593

[[211  74]
 [198 110]]
None


#### SGD Classifier

In [18]:
# Define a function to fit SGD Classifier with the grid research
def sgdc_model(X_train_vec, y_train, X_test_vec, y_test):
    sgdc = SGDClassifier()


    params = {
        'loss': ['log_loss', 'modified_huber', 'perceptron', 'hinge'], 
        'alpha': [0.001, 0.01, 0.1],
        'class_weight': ['balanced'],
        }
    grid_search = GridSearchCV(sgdc, params, cv=5,n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_sgdc = grid_search.best_estimator_
    best_sgdc.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_sgdc.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [19]:
# Using hashing vectorizer
print(sgdc_model(X_train_hash, y_train, X_test_1_hash, y_test_1))

Best Hyperparameters: {'alpha': 0.001, 'class_weight': 'balanced', 'loss': 'hinge'}
Best accuracy score: 0.5843139716648158
              precision    recall  f1-score   support

           0     0.5309    0.6632    0.5897       285
           1     0.5949    0.4578    0.5174       308

    accuracy                         0.5565       593
   macro avg     0.5629    0.5605    0.5536       593
weighted avg     0.5642    0.5565    0.5522       593

[[189  96]
 [167 141]]
None


In [20]:
# Using tfidf vectorizer
print(sgdc_model(X_train_idf, y_train, X_test_1_idf, y_test_1))

Best Hyperparameters: {'alpha': 0.001, 'class_weight': 'balanced', 'loss': 'modified_huber'}
Best accuracy score: 0.618153509361362
              precision    recall  f1-score   support

           0     0.5709    0.5930    0.5818       285
           1     0.6094    0.5877    0.5983       308

    accuracy                         0.5902       593
   macro avg     0.5902    0.5903    0.5901       593
weighted avg     0.5909    0.5902    0.5904       593

[[169 116]
 [127 181]]
None


In [21]:
# Using fasttext
print(sgdc_model(X_train_fast, y_train, X_test_1_fast, y_test_1))

Best Hyperparameters: {'alpha': 0.001, 'class_weight': 'balanced', 'loss': 'modified_huber'}
Best accuracy score: 0.5282893852693823
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       285
           1     0.5194    1.0000    0.6837       308

    accuracy                         0.5194       593
   macro avg     0.2597    0.5000    0.3418       593
weighted avg     0.2698    0.5194    0.3551       593

[[  0 285]
 [  0 308]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### SVM

In [12]:
# Using SVM (with GridSearchCV to find the best hyperparameters)
def svm_model(X_train_vec, y_train, X_test_vec, y_test):
    svm = SVC()

    params = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }

    grid_search = GridSearchCV(svm, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    print("Best parameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)

    # Use best hyperparameters to fit a model
    best_svm = grid_search.best_estimator_
    best_svm.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_svm.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [23]:
# Using hashing vectorizer 
print(svm_model(X_train_hash, y_train, X_test_1_hash, y_test_1))

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.5767939368647396
Best Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.5767939368647396
              precision    recall  f1-score   support

           0     0.5232    0.5544    0.5383       285
           1     0.5636    0.5325    0.5476       308

    accuracy                         0.5430       593
   macro avg     0.5434    0.5434    0.5430       593
weighted avg     0.5442    0.5430    0.5431       593

[[158 127]
 [144 164]]
None


In [24]:
# Using tfidf vectorizer
print(svm_model(X_train_idf, y_train, X_test_1_idf, y_test_1))

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.620032678120513
Best Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.620032678120513
              precision    recall  f1-score   support

           0     0.5677    0.6035    0.5850       285
           1     0.6103    0.5747    0.5920       308

    accuracy                         0.5885       593
   macro avg     0.5890    0.5891    0.5885       593
weighted avg     0.5898    0.5885    0.5886       593

[[172 113]
 [131 177]]
None


In [15]:
# Test for an instance which does not belongs any dataset
sentence_1 = [preprocessing_text("MUGGLES NOT WELCOME")]
sentence_2 = [preprocessing_text("MUGGLES NOT WELCOME👊")]
svec_1 = idf_vectorizer.transform(sentence_1)
svec_2 = idf_vectorizer.transform(sentence_2)

svm3 = SVC(C=1, gamma='scale', kernel='linear')
svm3.fit(X_train_idf, y_train)
y_pred_1 = svm3.predict(svec_1)
y_pred_2 = svm3.predict(svec_2)
print(y_pred_1, y_pred_2)

[0] [1]


In [25]:
# Using fasttext
print(svm_model(X_train_fast, y_train, X_test_1_fast, y_test_1))

Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.535061961111622
Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.535061961111622
              precision    recall  f1-score   support

           0     0.5122    0.8070    0.6267       285
           1     0.6181    0.2890    0.3938       308

    accuracy                         0.5379       593
   macro avg     0.5652    0.5480    0.5103       593
weighted avg     0.5672    0.5379    0.5057       593

[[230  55]
 [219  89]]
None


#### Decision Tree Classifier

In [26]:
# Define a function to fit Decision Tree Classifier with the grid research
def dtc_model(X_train_vec, y_train, X_test_vec, y_test):
    dtc = DecisionTreeClassifier()


    params = {
        'criterion': ['gini', 'entropy'], 
        'max_depth': [None, 5, 10, 15], 
        'min_samples_split': [2, 5, 10, 20], 
        'min_samples_leaf': [1, 2, 5, 10],
        'class_weight': ['balanced']
    }
    grid_search = GridSearchCV(dtc, params, cv=5,n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_dtc = grid_search.best_estimator_
    best_dtc.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_dtc.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [27]:
# Using hashing vectorizer 
print(dtc_model(X_train_hash, y_train, X_test_1_hash, y_test_1))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 20}
Best accuracy score: 0.5643896901237101
              precision    recall  f1-score   support

           0     0.5271    0.7860    0.6310       285
           1     0.6369    0.3474    0.4496       308

    accuracy                         0.5582       593
   macro avg     0.5820    0.5667    0.5403       593
weighted avg     0.5841    0.5582    0.5368       593

[[224  61]
 [201 107]]
None


In [28]:
# Using tfidf vectorizer
print(dtc_model(X_train_idf, y_train, X_test_1_idf, y_test_1))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 20}
Best accuracy score: 0.5841268858882861
              precision    recall  f1-score   support

           0     0.5405    0.7719    0.6358       285
           1     0.6505    0.3929    0.4899       308

    accuracy                         0.5750       593
   macro avg     0.5955    0.5824    0.5629       593
weighted avg     0.5977    0.5750    0.5600       593

[[220  65]
 [187 121]]
None


In [29]:
# Using fasttext
print(dtc_model(X_train_fast, y_train, X_test_1_fast, y_test_1))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 20}
Best accuracy score: 0.530361828843039
              precision    recall  f1-score   support

           0     0.4954    0.7544    0.5981       285
           1     0.5597    0.2890    0.3812       308

    accuracy                         0.5126       593
   macro avg     0.5276    0.5217    0.4896       593
weighted avg     0.5288    0.5126    0.4854       593

[[215  70]
 [219  89]]
None


#### Random Forest Classifier

In [30]:
# Define a function to fit Random Forest Classifier with the grid research
def rf_model(X_train_vec, y_train, X_test_vec, y_test):
    rf = RandomForestClassifier()

    params = {'n_estimators': [50, 100, 150],
          'max_depth': [3, 5, 7, None],
          'min_samples_split': [2, 5, 10],
          'max_features': ['auto', 'sqrt', 'log2'],
          'min_samples_leaf': [1, 2, 4, 6]}
    grid_search = GridSearchCV(rf, params, cv=5,n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_rf = grid_search.best_estimator_
    best_rf.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_rf.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [31]:
# Using hashing vectorizer
print(rf_model(X_train_hash, y_train, X_test_1_hash, y_test_1))

Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 5, 'n_estimators': 100}
Best accuracy score: 0.5831847374786919
              precision    recall  f1-score   support

           0     0.5000    0.5228    0.5111       285
           1     0.5390    0.5162    0.5274       308

    accuracy                         0.5194       593
   macro avg     0.5195    0.5195    0.5193       593
weighted avg     0.5202    0.5194    0.5196       593

[[149 136]
 [149 159]]
None


In [32]:
# Using tfidf vectorizer
print(rf_model(X_train_idf, y_train, X_test_1_idf, y_test_1))

Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best accuracy score: 0.6111972075060652
              precision    recall  f1-score   support

           0     0.5744    0.5825    0.5784       285
           1     0.6086    0.6006    0.6046       308

    accuracy                         0.5919       593
   macro avg     0.5915    0.5916    0.5915       593
weighted avg     0.5921    0.5919    0.5920       593

[[166 119]
 [123 185]]
None


In [33]:
# Using fasttext
print(rf_model(X_train_fast, y_train, X_test_1_fast, y_test_1))

  warn(
  warn(


Best Hyperparameters: {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy score: 0.5514133994440475
              precision    recall  f1-score   support

           0     0.5274    0.7088    0.6048       285
           1     0.6048    0.4123    0.4903       308

    accuracy                         0.5548       593
   macro avg     0.5661    0.5606    0.5476       593
weighted avg     0.5676    0.5548    0.5453       593

[[202  83]
 [181 127]]
None


#### LightGBM

In [34]:
# Define a function to fit LightGBM with the grid research
def lgbm_model(X_train_vec, y_train, X_test_vec, y_test):
    lgbm = lgb.LGBMClassifier()
      
    params = {
        'boosting_type': ['gbdt'],
        'num_leaves': [31, 63, 127],
        'learning_rate': [0.05, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'max_depth': [-1],
        'min_child_samples': [20, 50, 100],
        'class_weight': ['balanced']
    }
  
    grid_search = GridSearchCV(lgbm, params, cv=5,n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_lgbm = grid_search.best_estimator_
    best_lgbm.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_lgbm.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)      
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [35]:
# Using hashing vectorizer
print(lgbm_model(X_train_hash, y_train, X_test_1_hash, y_test_1))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 50, 'n_estimators': 100, 'num_leaves': 31}
Best accuracy score: 0.5901421003119275
              precision    recall  f1-score   support

           0     0.5314    0.5649    0.5476       285
           1     0.5724    0.5390    0.5552       308

    accuracy                         0.5514       593
   macro avg     0.5519    0.5519    0.5514       593
weighted avg     0.5527    0.5514    0.5515       593

[[161 124]
 [142 166]]
None


In [36]:
# Using tfidf vectorizer
print(lgbm_model(X_train_idf, y_train, X_test_1_idf, y_test_1))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 31}
Best accuracy score: 0.6027389145488368
              precision    recall  f1-score   support

           0     0.5691    0.6211    0.5940       285
           1     0.6170    0.5649    0.5898       308

    accuracy                         0.5919       593
   macro avg     0.5931    0.5930    0.5919       593
weighted avg     0.5940    0.5919    0.5918       593

[[177 108]
 [134 174]]
None


In [16]:
# Test for an instance which does not belongs any dataset
sentence_1 = [preprocessing_text("MUGGLES NOT WELCOME")]
sentence_2 = [preprocessing_text("MUGGLES NOT WELCOME👊")]
svec_1 = idf_vectorizer.transform(sentence_1)
svec_2 = idf_vectorizer.transform(sentence_2)

lgbm3 = lgb.LGBMClassifier(boosting_type='gbdt', class_weight='balanced', learning_rate=0.05, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=31)
lgbm3.fit(X_train_idf, y_train)
y_pred_1 = lgbm3.predict(svec_1)
y_pred_2 = lgbm3.predict(svec_2)
print(y_pred_1, y_pred_2)

[0] [1]


In [37]:
# Using fasttext
print(lgbm_model(X_train_fast, y_train, X_test_1_fast, y_test_1))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 100, 'n_estimators': 100, 'num_leaves': 31}
Best accuracy score: 0.5453990691686885
              precision    recall  f1-score   support

           0     0.5189    0.5298    0.5243       285
           1     0.5563    0.5455    0.5508       308

    accuracy                         0.5379       593
   macro avg     0.5376    0.5376    0.5376       593
weighted avg     0.5383    0.5379    0.5381       593

[[151 134]
 [140 168]]
None


#### BernoulliNB

In [38]:
# Define a function to fit MultinomialNB with the grid research
def bnb_model(X_train_vec, y_train, X_test_vec, y_test):
    bnb= BernoulliNB()

    params = {'alpha': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]}
    grid_search = GridSearchCV(bnb, params, cv=5,n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    print("Best parameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)

    # Use best hyperparameters to fit a model
    best_bnb = grid_search.best_estimator_
    best_bnb.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_bnb.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [39]:
# Using tfidf vectorizer
print(bnb_model(X_train_idf, y_train, X_test_1_idf, y_test_1))

Best parameters: {'alpha': 1.0}
Best accuracy score: 0.6006708917165915
Best Hyperparameters: {'alpha': 1.0}
Best accuracy score: 0.6006708917165915
              precision    recall  f1-score   support

           0     0.5568    0.6877    0.6154       285
           1     0.6307    0.4935    0.5537       308

    accuracy                         0.5868       593
   macro avg     0.5938    0.5906    0.5846       593
weighted avg     0.5952    0.5868    0.5834       593

[[196  89]
 [156 152]]
None


#### MLP Classifier

In [40]:
# Using MLP Classifier (with GridSearchCV to find the best hyperparameters)
def mlp_model(X_train_vec, y_train, X_test_vec, y_test):
    mlp = MLPClassifier()
    params = {
      'hidden_layer_sizes': [(100,), (50, 50), (100, 50)],
      'activation': ['relu', 'tanh', 'logistic'],
      'alpha': [0.0001, 0.001, 0.01]
      }

    grid_search = GridSearchCV(mlp, params, cv=5,n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)
    print("Best parameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)

    # Use best hyperparameters to fit a model
    best_mlp = grid_search.best_estimator_
    best_mlp.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_mlp.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [41]:
# Using hashing vectorizer
print(mlp_model(X_train_hash, y_train, X_test_1_hash, y_test_1))



Best parameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.563447895373429
Best Hyperparameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.563447895373429
              precision    recall  f1-score   support

           0     0.5000    0.4737    0.4865       285
           1     0.5356    0.5617    0.5483       308

    accuracy                         0.5194       593
   macro avg     0.5178    0.5177    0.5174       593
weighted avg     0.5185    0.5194    0.5186       593

[[135 150]
 [135 173]]
None




In [11]:
# Test for an instance which does not belongs any dataset
sentence_1 = [preprocessing_text("MUGGLES NOT WELCOME")]
sentence_2 = [preprocessing_text("MUGGLES NOT WELCOME👊")]
svec_1 = hash_vectorizer.transform(sentence_1)
svec_2 = hash_vectorizer.transform(sentence_2)

mlp3 = MLPClassifier(activation='logistic', alpha=0.01, hidden_layer_sizes=(100,))
mlp3.fit(X_train_hash, y_train)
y_pred_1 = mlp3.predict(svec_1)
y_pred_2 = mlp3.predict(svec_2)
print(y_pred_1, y_pred_2)

[0] [1]




In [42]:
# Using tfidf vectorizer
print(mlp_model(X_train_idf, y_train, X_test_1_idf, y_test_1))



Best parameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.6106340050502551
Best Hyperparameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.6106340050502551
              precision    recall  f1-score   support

           0     0.5690    0.5930    0.5808       285
           1     0.6081    0.5844    0.5960       308

    accuracy                         0.5885       593
   macro avg     0.5886    0.5887    0.5884       593
weighted avg     0.5893    0.5885    0.5887       593

[[169 116]
 [128 180]]
None




In [43]:
# Using fasttext
print(mlp_model(X_train_fast, y_train, X_test_1_fast, y_test_1))

Best parameters: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50)}
Best accuracy score: 0.5335573175648435
Best Hyperparameters: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50)}
Best accuracy score: 0.5335573175648435
              precision    recall  f1-score   support

           0     0.5044    0.8070    0.6208       285
           1     0.5985    0.2662    0.3685       308

    accuracy                         0.5261       593
   macro avg     0.5515    0.5366    0.4947       593
weighted avg     0.5533    0.5261    0.4898       593

[[230  55]
 [226  82]]
None


#### FastText Supervised Model

In [44]:
# Using supervised model in fasttext directly
train_data = "train_data_1.txt"
model_path = "emoji_based_hate_speech_model_1_2.bin"

# Define the parameters for training
lr = 0.1 
dim = 100 
epoch = 25

model = fasttext.train_supervised(input=train_data, lr=lr, dim=dim, epoch=epoch)
model.save_model(model_path)

In [45]:
# Evaluate the model with Kirk's data
test_data_1 = "test_data_1_1.txt"

result = model.test(test_data_1)

print(result)

(591, 0.5532994923857868, 0.5532994923857868)


### Test 1.1.2 Binary classifier that are trained by Kirk's data and tested by our own data

- df (using Kirk's " train + validation" data as the training data)\
training dataset =  Krik train + Krik validation (5319 messages)

- df2: (usingour own annotated data as the testing data)\
test dataset = TikTok + YouTube dataset (200 messages)

In [12]:
df2_test = pd.read_csv("../Training_machine_learning_model/Annotate_Dataset/comments_annotated.csv")
dic = {'Hated':1, 'Non-Hated':0}
df2_test['Tag_Nalin'] = df2_test['Tag_Nalin'].map(dic)

In [47]:
# Prepare the data for training and testing (lowercase the letters)
X_test_2 = df2_test['Comment'].apply(lambda x: (preprocessing_text(x)))
y_test_2 = df2_test['Tag_Nalin']

In [48]:
# Hashing Vectorizer
X_test_2_hash = hash_vectorizer.transform(X_test_2)

# Tfidf Vectorizer
X_test_2_idf = idf_vectorizer.transform(X_test_2)


In [49]:
# Process the data for fasttext

df2_test['test_data'] = '__label__' + df2_test['Tag_Nalin'].astype(str) + ' ' + df2_test['Comment'].apply(lambda x: (preprocessing_text(x)))
df2_test['test_data'].to_csv('test_data_1_2.txt', index=False, header=False, sep='\t')

In [50]:
# Using fasttext to train the word vector
X_test_2_fast = np.vstack(encode_text_with_fasttext(X_test_2).values)

#### Logistic Regression

In [51]:
# Using hashing vectorizer
print(lr_model(X_train_hash, y_train, X_test_2_hash, y_test_2))

Best Hyperparameters: {'C': 0.1, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Best accuracy score: 0.5833751830186944
              precision    recall  f1-score   support

           0     0.5591    0.7100    0.6256       100
           1     0.6027    0.4400    0.5087       100

    accuracy                         0.5750       200
   macro avg     0.5809    0.5750    0.5671       200
weighted avg     0.5809    0.5750    0.5671       200

[[71 29]
 [56 44]]
None


In [52]:
# Using tfidf vectorizer
print(lr_model(X_train_idf, y_train, X_test_2_idf, y_test_2))

Best Hyperparameters: {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'sag'}
Best accuracy score: 0.6194696524943591
              precision    recall  f1-score   support

           0     0.5763    0.6800    0.6239       100
           1     0.6098    0.5000    0.5495       100

    accuracy                         0.5900       200
   macro avg     0.5930    0.5900    0.5867       200
weighted avg     0.5930    0.5900    0.5867       200

[[68 32]
 [50 50]]
None


In [53]:
# Using fasttext
print(lr_model(X_train_fast, y_train, X_test_2_fast, y_test_2))

Best Hyperparameters: {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'sag'}
Best accuracy score: 0.5337421045558393
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       100
           1     0.5000    1.0000    0.6667       100

    accuracy                         0.5000       200
   macro avg     0.2500    0.5000    0.3333       200
weighted avg     0.2500    0.5000    0.3333       200

[[  0 100]
 [  0 100]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### SGD Classifier

In [54]:
# Using hashing vectorizer
print(sgdc_model(X_train_hash, y_train, X_test_2_hash, y_test_2))

Best Hyperparameters: {'alpha': 0.001, 'class_weight': 'balanced', 'loss': 'hinge'}
Best accuracy score: 0.5854419680433444
              precision    recall  f1-score   support

           0     0.5649    0.7400    0.6407       100
           1     0.6232    0.4300    0.5089       100

    accuracy                         0.5850       200
   macro avg     0.5940    0.5850    0.5748       200
weighted avg     0.5940    0.5850    0.5748       200

[[74 26]
 [57 43]]
None


In [55]:
# Using tfidf vectorizer
print(sgdc_model(X_train_idf, y_train, X_test_2_idf, y_test_2))

Best Hyperparameters: {'alpha': 0.001, 'class_weight': 'balanced', 'loss': 'modified_huber'}
Best accuracy score: 0.61702568981249
              precision    recall  f1-score   support

           0     0.5630    0.6700    0.6119       100
           1     0.5926    0.4800    0.5304       100

    accuracy                         0.5750       200
   macro avg     0.5778    0.5750    0.5711       200
weighted avg     0.5778    0.5750    0.5711       200

[[67 33]
 [52 48]]
None


In [56]:
# Using fasttext
print(sgdc_model(X_train_fast, y_train, X_test_2_fast, y_test_2))

Best Hyperparameters: {'alpha': 0.1, 'class_weight': 'balanced', 'loss': 'modified_huber'}
Best accuracy score: 0.5269755409219191
              precision    recall  f1-score   support

           0     1.0000    0.0100    0.0198       100
           1     0.5025    1.0000    0.6689       100

    accuracy                         0.5050       200
   macro avg     0.7513    0.5050    0.3443       200
weighted avg     0.7513    0.5050    0.3443       200

[[  1  99]
 [  0 100]]
None


#### SVM

In [57]:
# Using hashing vectorizer
print(svm_model(X_train_hash, y_train, X_test_2_hash, y_test_2))

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.5767939368647396
Best Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.5767939368647396
              precision    recall  f1-score   support

           0     0.5738    0.7000    0.6306       100
           1     0.6154    0.4800    0.5393       100

    accuracy                         0.5900       200
   macro avg     0.5946    0.5900    0.5850       200
weighted avg     0.5946    0.5900    0.5850       200

[[70 30]
 [52 48]]
None


In [58]:
# Using tfidf vectorizer
print(svm_model(X_train_idf, y_train, X_test_2_idf, y_test_2))

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.620032678120513
Best Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.620032678120513
              precision    recall  f1-score   support

           0     0.5476    0.6900    0.6106       100
           1     0.5811    0.4300    0.4943       100

    accuracy                         0.5600       200
   macro avg     0.5644    0.5600    0.5524       200
weighted avg     0.5644    0.5600    0.5524       200

[[69 31]
 [57 43]]
None


In [59]:
# Using fasttext
print(svm_model(X_train_fast, y_train, X_test_2_fast, y_test_2))

Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.535061961111622
Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.535061961111622
              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667       100
           1     0.0000    0.0000    0.0000       100

    accuracy                         0.5000       200
   macro avg     0.2500    0.5000    0.3333       200
weighted avg     0.2500    0.5000    0.3333       200

[[100   0]
 [100   0]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Decision Tree Classifier

In [60]:
# Using hashing vectorizer
print(dtc_model(X_train_hash, y_train, X_test_2_hash, y_test_2))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 20}
Best accuracy score: 0.5647656299733341
              precision    recall  f1-score   support

           0     0.4974    0.9400    0.6505       100
           1     0.4545    0.0500    0.0901       100

    accuracy                         0.4950       200
   macro avg     0.4759    0.4950    0.3703       200
weighted avg     0.4759    0.4950    0.3703       200

[[94  6]
 [95  5]]
None


In [61]:
# Using tfidf vectorizer
print(dtc_model(X_train_idf, y_train, X_test_2_idf, y_test_2))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 20}
Best accuracy score: 0.5850669123420028
              precision    recall  f1-score   support

           0     0.4891    0.9000    0.6338       100
           1     0.3750    0.0600    0.1034       100

    accuracy                         0.4800       200
   macro avg     0.4321    0.4800    0.3686       200
weighted avg     0.4321    0.4800    0.3686       200

[[90 10]
 [94  6]]
None


In [62]:
# Using fasttext
print(dtc_model(X_train_fast, y_train, X_test_2_fast, y_test_2))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best accuracy score: 0.5303620056726955
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       100
           1     0.5000    1.0000    0.6667       100

    accuracy                         0.5000       200
   macro avg     0.2500    0.5000    0.3333       200
weighted avg     0.2500    0.5000    0.3333       200

[[  0 100]
 [  0 100]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Random Forest Classifier

In [63]:
# Using hashing vectorizer
print(rf_model(X_train_hash, y_train, X_test_2_hash, y_test_2))

Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 10, 'n_estimators': 150}
Best accuracy score: 0.5846900883440964
              precision    recall  f1-score   support

           0     0.5667    0.6800    0.6182       100
           1     0.6000    0.4800    0.5333       100

    accuracy                         0.5800       200
   macro avg     0.5833    0.5800    0.5758       200
weighted avg     0.5833    0.5800    0.5758       200

[[68 32]
 [52 48]]
None


In [64]:
# Using tfidf vectorizer
print(rf_model(X_train_idf, y_train, X_test_2_idf, y_test_2))

Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}
Best accuracy score: 0.6117607636211884
              precision    recall  f1-score   support

           0     0.5407    0.7300    0.6213       100
           1     0.5846    0.3800    0.4606       100

    accuracy                         0.5550       200
   macro avg     0.5627    0.5550    0.5409       200
weighted avg     0.5627    0.5550    0.5409       200

[[73 27]
 [62 38]]
None


In [65]:
# Using fasttext
print(rf_model(X_train_fast, y_train, X_test_2_fast, y_test_2))

  warn(
  warn(


Best Hyperparameters: {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy score: 0.5519796080040176
              precision    recall  f1-score   support

           0     0.6667    0.3400    0.4503       100
           1     0.5570    0.8300    0.6667       100

    accuracy                         0.5850       200
   macro avg     0.6119    0.5850    0.5585       200
weighted avg     0.6119    0.5850    0.5585       200

[[34 66]
 [17 83]]
None


#### LightGBM

In [66]:
# Using hashing vectorizer
print(lgbm_model(X_train_hash, y_train, X_test_2_hash, y_test_2))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 50, 'n_estimators': 100, 'num_leaves': 31}
Best accuracy score: 0.5901421003119275
              precision    recall  f1-score   support

           0     0.5763    0.6800    0.6239       100
           1     0.6098    0.5000    0.5495       100

    accuracy                         0.5900       200
   macro avg     0.5930    0.5900    0.5867       200
weighted avg     0.5930    0.5900    0.5867       200

[[68 32]
 [50 50]]
None


In [67]:
# Using tfidf vectorizer
print(lgbm_model(X_train_idf, y_train, X_test_2_idf, y_test_2))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 31}
Best accuracy score: 0.6027389145488368
              precision    recall  f1-score   support

           0     0.5166    0.7800    0.6215       100
           1     0.5510    0.2700    0.3624       100

    accuracy                         0.5250       200
   macro avg     0.5338    0.5250    0.4920       200
weighted avg     0.5338    0.5250    0.4920       200

[[78 22]
 [73 27]]
None


In [68]:
# Using fasttext
print(lgbm_model(X_train_fast, y_train, X_test_2_fast, y_test_2))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 100, 'n_estimators': 100, 'num_leaves': 31}
Best accuracy score: 0.5453990691686885
              precision    recall  f1-score   support

           0     0.5104    0.4900    0.5000       100
           1     0.5096    0.5300    0.5196       100

    accuracy                         0.5100       200
   macro avg     0.5100    0.5100    0.5098       200
weighted avg     0.5100    0.5100    0.5098       200

[[49 51]
 [47 53]]
None


#### BernoulliNB

In [69]:
# Using tfidf vectorizer
print(bnb_model(X_train_idf, y_train, X_test_2_idf, y_test_2))

Best parameters: {'alpha': 1.0}
Best accuracy score: 0.6006708917165915
Best Hyperparameters: {'alpha': 1.0}
Best accuracy score: 0.6006708917165915
              precision    recall  f1-score   support

           0     0.5248    0.7400    0.6141       100
           1     0.5593    0.3300    0.4151       100

    accuracy                         0.5350       200
   macro avg     0.5421    0.5350    0.5146       200
weighted avg     0.5421    0.5350    0.5146       200

[[74 26]
 [67 33]]
None


#### MLPClassifier

In [70]:
# Using hashing vectorizer
print(mlp_model(X_train_hash, y_train, X_test_2_hash, y_test_2))



Best parameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.5662663832676705
Best Hyperparameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.5662663832676705
              precision    recall  f1-score   support

           0     0.5847    0.6900    0.6330       100
           1     0.6220    0.5100    0.5604       100

    accuracy                         0.6000       200
   macro avg     0.6033    0.6000    0.5967       200
weighted avg     0.6033    0.6000    0.5967       200

[[69 31]
 [49 51]]
None




In [71]:
# Using tfidf vectorizer
print(mlp_model(X_train_idf, y_train, X_test_2_idf, y_test_2))



Best parameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.6125142347873447
Best Hyperparameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.6125142347873447
              precision    recall  f1-score   support

           0     0.5612    0.7800    0.6527       100
           1     0.6393    0.3900    0.4845       100

    accuracy                         0.5850       200
   macro avg     0.6002    0.5850    0.5686       200
weighted avg     0.6002    0.5850    0.5686       200

[[78 22]
 [61 39]]
None




In [72]:
# Using fasttext
print(mlp_model(X_train_fast, y_train, X_test_2_fast, y_test_2))

Best parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50)}
Best accuracy score: 0.5369393615742084
Best Hyperparameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50)}
Best accuracy score: 0.5369393615742084
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       100
           1     0.5000    1.0000    0.6667       100

    accuracy                         0.5000       200
   macro avg     0.2500    0.5000    0.3333       200
weighted avg     0.2500    0.5000    0.3333       200

[[  0 100]
 [  0 100]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### FastText Supervised Model

In [73]:
# Evaluate the model with our data
test_data_2 = "test_data_1_2.txt"

result = model.test(test_data_2)

print(result)

(200, 0.595, 0.595)


## Test 1.2 Multiclass classifiers that can distinguish the types of the comments

### Test 1.2.1 Multiclass classifiers that are trained and tested by Kirk's data

- df (using Kirk's " train + validation" data as the training data)\
training dataset =  Krik train + Krik validation (5319 messages)

- df1 (using Kirk's "test" data as the testing data)\
test dataset = Krik test (593 messages)

In [13]:
# Encoding categorical variables
label_encoder = preprocessing.LabelEncoder() 
df_train['type'] = label_encoder.fit_transform(df_train['type'])
df1_test['type'] = label_encoder.fit_transform(df1_test['type'])

In [14]:
# Get the type with its type label
df_train['type_label'] = label_encoder.inverse_transform(df_train['type'])
unique_labels = df_train.groupby('type')['type_label'].agg(lambda x: x.unique().tolist())

for type_, label in unique_labels.items():
    print(f"Type: {type_}, Type label: {label}")

Type: 0, Type label: ['animosity']
Type: 1, Type label: ['dehumanizinglanguage']
Type: 2, Type label: ['derogation']
Type: 3, Type label: ['none']
Type: 4, Type label: ['threateninglanguage']


In [15]:
df_train['type_label'].value_counts()

none                    2662
derogation              1758
animosity                513
dehumanizinglanguage     205
threateninglanguage      181
Name: type_label, dtype: int64

In [77]:
X_train = df_train['text'].apply((lambda x: preprocessing_text(x)))
y_train = df_train['type']
X_test = df1_test['text'].apply((lambda x: preprocessing_text(x)))
y_test = df1_test['type']

In [78]:
# Hashing Vectorizer
X_train_hash = hash_vectorizer.fit_transform(X_train)
X_test_hash = hash_vectorizer.transform(X_test)

# Tfidf Vectorizer
X_train_idf = idf_vectorizer.fit_transform(X_train)
X_test_idf = idf_vectorizer.transform(X_test)

In [79]:
# Process the data for fasttext
df_train['train_data'] = '__label__' + df_train['type'].astype(str) + ' ' + df_train['text'].apply(lambda x: (preprocessing_text(x)))
df_train['train_data'].to_csv('train_data_2_1.txt', index=False, header=False, sep='\t')

df1_test['test_data'] = '__label__' + df1_test['type'].astype(str) + ' ' + df1_test['text'].apply(lambda x: (preprocessing_text(x)))
df1_test['test_data'].to_csv('test_data_2_1.txt', index=False, header=False, sep='\t')

In [80]:
# Using fasttext to train the word vector
train_data = "train_data_2_1.txt"
model_path = "emoji_based_hate_speech_model_2_1_1.bin"

model = fasttext.train_unsupervised(train_data, dim=100)
model.save_model(model_path)

def encode_text_with_fasttext(series):
    encoded_data = series.apply(lambda x: model.get_sentence_vector(x.replace('\n', ' ')))
    return encoded_data

X_train_fast = np.vstack(encode_text_with_fasttext(X_train).values)
X_test_fast = np.vstack(encode_text_with_fasttext(X_test).values)

In [81]:
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import SMOTE #SMOTE is an oversampling technique that generates synthetic samples from the minority class.

#### Logistic Regression

In [82]:
# With SMOTE strategy
def lr_test_model(X_train_vec, y_train, X_test_vec, y_test):
    lr_test_model = LogisticRegression()

    params = {
        'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        'penalty': ['l2'],
        'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
        'class_weight': ['balanced'],
        'max_iter': [1000, 3000],
        'multi_class': ['multinomial']
    }

    smote = SMOTE()
    X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train_vec, y_train)
                              
    grid_search = GridSearchCV(lr_test_model, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_oversampled, y_train_oversampled)

    # Use best hyperparameters to fit a model
    best_lr_test_model = grid_search.best_estimator_
    best_lr_test_model.fit(X_train_oversampled, y_train_oversampled)

    # Use the fitted model to predict on test data
    y_pred = best_lr_test_model.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))


In [83]:
# Using hashing vectorizer
print(lr_test_model(X_train_hash, y_train, X_test_hash, y_test))

Best Hyperparameters: {'C': 100.0, 'class_weight': 'balanced', 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'sag'}
Best accuracy score: 0.7517655897821187
              precision    recall  f1-score   support

           0     0.1698    0.3273    0.2236        55
           1     0.2564    0.3571    0.2985        28
           2     0.3803    0.3990    0.3894       203
           3     0.4776    0.3368    0.3951       285
           4     0.2059    0.3182    0.2500        22

    accuracy                         0.3575       593
   macro avg     0.2980    0.3477    0.3113       593
weighted avg     0.3952    0.3575    0.3673       593

[[ 18   2  16  18   1]
 [  1  10  10   7   0]
 [ 27  12  81  72  11]
 [ 59  13 102  96  15]
 [  1   2   4   8   7]]
None


In [84]:
# Without SMOTE strategy
def lr2_model(X_train_vec, y_train, X_test_vec, y_test):
    lr2 = LogisticRegression()

    params = {
        'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        'penalty': ['l2'],
        'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
        'class_weight': ['balanced'],
        'max_iter': [1000, 3000],
        'multi_class': ['multinomial']
    }
                              
    grid_search = GridSearchCV(lr2, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_lr2 = grid_search.best_estimator_
    best_lr2.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_lr2.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [85]:
# Using hashing vectorizer
print(lr2_model(X_train_hash, y_train, X_test_hash, y_test))



Best Hyperparameters: {'C': 10.0, 'class_weight': 'balanced', 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'saga'}
Best accuracy score: 0.3807050552062187
              precision    recall  f1-score   support

           0     0.1368    0.4727    0.2122        55
           1     0.1440    0.6429    0.2353        28
           2     0.4135    0.2709    0.3274       203
           3     0.4595    0.1789    0.2576       285
           4     0.3235    0.5000    0.3929        22

    accuracy                         0.2715       593
   macro avg     0.2955    0.4131    0.2851       593
weighted avg     0.3939    0.2715    0.2812       593

[[ 26  11   8   9   1]
 [  3  18   4   3   0]
 [ 58  40  55  46   4]
 [102  51  63  51  18]
 [  1   5   3   2  11]]
None




In [86]:
# Using tfidf vectorizer
print(lr2_model(X_train_idf, y_train, X_test_idf, y_test))

Best Hyperparameters: {'C': 10.0, 'class_weight': 'balanced', 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}
Best accuracy score: 0.40289452464651754
              precision    recall  f1-score   support

           0     0.1704    0.4182    0.2421        55
           1     0.4318    0.6786    0.5278        28
           2     0.4649    0.4236    0.4433       203
           3     0.6057    0.3719    0.4609       285
           4     0.2407    0.5909    0.3421        22

    accuracy                         0.4165       593
   macro avg     0.3827    0.4966    0.4032       593
weighted avg     0.4954    0.4165    0.4333       593

[[ 23   2  11  12   7]
 [  3  19   2   4   0]
 [ 48   7  86  48  14]
 [ 60  15  84 106  20]
 [  1   1   2   5  13]]
None


In [87]:
# Using fasttext
print(lr2_model(X_train_fast, y_train, X_test_fast, y_test))



Best Hyperparameters: {'C': 10.0, 'class_weight': 'balanced', 'max_iter': 3000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'sag'}
Best accuracy score: 0.2559211410464072
              precision    recall  f1-score   support

           0     0.0985    0.9818    0.1791        55
           1     0.1333    0.2143    0.1644        28
           2     0.0000    0.0000    0.0000       203
           3     0.0000    0.0000    0.0000       285
           4     0.0000    0.0000    0.0000        22

    accuracy                         0.1012       593
   macro avg     0.0464    0.2392    0.0687       593
weighted avg     0.0154    0.1012    0.0244       593

[[ 54   1   0   0   0]
 [ 22   6   0   0   0]
 [180  23   0   0   0]
 [271  14   0   0   0]
 [ 21   1   0   0   0]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### SVM

In [88]:
def svm2_model(X_train_vec, y_train, X_test_vec, y_test):
    svm2 = SVC()

    params = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'decision_function_shape': ['ovr', 'ovo'],
        'class_weight': ['balanced']
    }

    grid_search = GridSearchCV(svm2, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_svm2 = grid_search.best_estimator_
    best_svm2.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_svm2.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [89]:
# Using hashing vectorizer with SVM
print(svm2_model(X_train_hash, y_train, X_test_hash, y_test))

Best Hyperparameters: {'C': 1, 'class_weight': 'balanced', 'decision_function_shape': 'ovr', 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.44274821578876633
              precision    recall  f1-score   support

           0     0.1707    0.2545    0.2044        55
           1     0.4286    0.3214    0.3673        28
           2     0.4315    0.4187    0.4250       203
           3     0.4586    0.4281    0.4428       285
           4     0.3333    0.4091    0.3673        22

    accuracy                         0.4030       593
   macro avg     0.3646    0.3664    0.3614       593
weighted avg     0.4166    0.4030    0.4082       593

[[ 14   1  11  27   2]
 [  6   9   5   8   0]
 [ 16   1  85  98   3]
 [ 46  10  94 122  13]
 [  0   0   2  11   9]]
None


In [90]:
# Using tfidf vectorizer with SVM
print(svm2_model(X_train_idf, y_train, X_test_idf, y_test))

Best Hyperparameters: {'C': 0.1, 'class_weight': 'balanced', 'decision_function_shape': 'ovr', 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.4788502889396586
              precision    recall  f1-score   support

           0     0.2222    0.1455    0.1758        55
           1     0.3750    0.5357    0.4412        28
           2     0.5181    0.2118    0.3007       203
           3     0.5402    0.7544    0.6296       285
           4     0.2500    0.4091    0.3103        22

    accuracy                         0.4890       593
   macro avg     0.3811    0.4113    0.3715       593
weighted avg     0.4846    0.4890    0.4542       593

[[  8   3   5  37   2]
 [  3  15   4   6   0]
 [ 14   8  43 128  10]
 [ 11  14  30 215  15]
 [  0   0   1  12   9]]
None


In [91]:
# Using fasttext
print(svm2_model(X_train_fast, y_train, X_test_fast, y_test))

Best Hyperparameters: {'C': 10, 'class_weight': 'balanced', 'decision_function_shape': 'ovr', 'gamma': 'scale', 'kernel': 'linear'}
Best accuracy score: 0.24008091725079397
              precision    recall  f1-score   support

           0     0.1237    0.4364    0.1928        55
           1     0.0891    0.3214    0.1395        28
           2     0.5366    0.1084    0.1803       203
           3     0.5414    0.3439    0.4206       285
           4     0.1447    0.5000    0.2245        22

    accuracy                         0.2766       593
   macro avg     0.2871    0.3420    0.2315       593
weighted avg     0.4650    0.2766    0.2967       593

[[24  6  4 10 11]
 [10  9  0  9  0]
 [67 44 22 59 11]
 [92 39 13 98 43]
 [ 1  3  2  5 11]]
None


#### Decision Tree Classifier

In [92]:
def dtc2_model(X_train_vec, y_train, X_test_vec, y_test):
    dtc2 = DecisionTreeClassifier()

    params = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 4, 6, 8, 10, 15],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 5, 10],
        'class_weight': ['balanced']
    }

    grid_search = GridSearchCV(dtc2, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_dtc2 = grid_search.best_estimator_
    best_dtc2.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_dtc2.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))


In [93]:
# Using hashing vectorizer with Decision Tree Classifier 
print(dtc2_model(X_train_hash, y_train, X_test_hash, y_test))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best accuracy score: 0.3654826742302605
              precision    recall  f1-score   support

           0     0.1940    0.2364    0.2131        55
           1     0.4412    0.5357    0.4839        28
           2     0.3077    0.3153    0.3114       203
           3     0.4194    0.3649    0.3902       285
           4     0.1667    0.2727    0.2069        22

    accuracy                         0.3406       593
   macro avg     0.3058    0.3450    0.3211       593
weighted avg     0.3519    0.3406    0.3445       593

[[ 13   2  16  21   3]
 [  2  15   6   5   0]
 [ 18   4  64 108   9]
 [ 34  12 117 104  18]
 [  0   1   5  10   6]]
None


In [94]:
# Using tfidf vectorizer with Decision Tree Classifier 
print(dtc2_model(X_train_idf, y_train, X_test_idf, y_test))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best accuracy score: 0.3886058042566435
              precision    recall  f1-score   support

           0     0.1290    0.1455    0.1368        55
           1     0.4615    0.4286    0.4444        28
           2     0.3426    0.3645    0.3532       203
           3     0.4319    0.3895    0.4096       285
           4     0.2188    0.3182    0.2593        22

    accuracy                         0.3575       593
   macro avg     0.3168    0.3292    0.3207       593
weighted avg     0.3667    0.3575    0.3611       593

[[  8   2  20  22   3]
 [  0  12   5  11   0]
 [ 19   3  74 103   4]
 [ 35   9 112 111  18]
 [  0   0   5  10   7]]
None


In [95]:
# Using fasttext
print(dtc2_model(X_train_fast, y_train, X_test_fast, y_test))

Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best accuracy score: 0.3820263263992531
              precision    recall  f1-score   support

           0     0.1111    0.1273    0.1186        55
           1     0.1000    0.0714    0.0833        28
           2     0.3854    0.3892    0.3873       203
           3     0.4594    0.4561    0.4577       285
           4     0.0455    0.0455    0.0455        22

    accuracy                         0.3693       593
   macro avg     0.2203    0.2179    0.2185       593
weighted avg     0.3694    0.3693    0.3692       593

[[  7   2  16  29   1]
 [  3   2   5  17   1]
 [ 17   6  79  96   5]
 [ 34   9  98 130  14]
 [  2   1   7  11   1]]
None


#### Random Forest Classifier

In [96]:
def rf2_model(X_train_vec, y_train, X_test_vec, y_test):
    rf2 = RandomForestClassifier()

    params = {
          'n_estimators': [100, 150, 200],
          'max_depth': [3, 5, 7, None],
          'min_samples_split': [2, 5, 10],
          'min_samples_leaf': [1, 2, 5],
          'class_weight': ['balanced']
    }
    grid_search = GridSearchCV(rf2, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_rf2 = grid_search.best_estimator_
    best_rf2.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_rf2.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [97]:
# Using hashing vectorizer with Random Forest Classifier
print(rf2_model(X_train_hash, y_train, X_test_hash, y_test))

Best Hyperparameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best accuracy score: 0.46192450788306605
              precision    recall  f1-score   support

           0     0.2105    0.0727    0.1081        55
           1     0.5000    0.1429    0.2222        28
           2     0.2840    0.2266    0.2521       203
           3     0.4041    0.5544    0.4675       285
           4     0.1538    0.0909    0.1143        22

    accuracy                         0.3609       593
   macro avg     0.3105    0.2175    0.2328       593
weighted avg     0.3403    0.3609    0.3357       593

[[  4   0   7  42   2]
 [  0   4   5  19   0]
 [  2   0  46 154   1]
 [ 13   4 102 158   8]
 [  0   0   2  18   2]]
None


In [98]:
# Using tfidf vectorizer with Random Forest Classifier
print(rf2_model(X_train_idf, y_train, X_test_idf, y_test))

Best Hyperparameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best accuracy score: 0.46060907206869484
              precision    recall  f1-score   support

           0     0.1290    0.0727    0.0930        55
           1     0.5714    0.2857    0.3810        28
           2     0.3486    0.3005    0.3228       203
           3     0.4366    0.5439    0.4844       285
           4     0.1111    0.0909    0.1000        22

    accuracy                         0.3879       593
   macro avg     0.3194    0.2587    0.2762       593
weighted avg     0.3722    0.3879    0.3736       593

[[  4   1  13  35   2]
 [  0   8   6  14   0]
 [  5   0  61 136   1]
 [ 22   4  91 155  13]
 [  0   1   4  15   2]]
None


In [99]:
# Using fasttext
print(rf2_model(X_train_fast, y_train, X_test_fast, y_test))

Best Hyperparameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Best accuracy score: 0.47583923354953706
              precision    recall  f1-score   support

           0     0.1111    0.0182    0.0312        55
           1     0.0000    0.0000    0.0000        28
           2     0.3730    0.2315    0.2857       203
           3     0.4725    0.7544    0.5811       285
           4     0.0000    0.0000    0.0000        22

    accuracy                         0.4435       593
   macro avg     0.1913    0.2008    0.1796       593
weighted avg     0.3651    0.4435    0.3800       593

[[  1   0   9  45   0]
 [  0   0   5  23   0]
 [  3   0  47 153   0]
 [  5   0  62 215   3]
 [  0   0   3  19   0]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### LightGBM

In [100]:
def lgbm2_model(X_train_vec, y_train, X_test_vec, y_test):
    lgbm2 = lgb.LGBMClassifier()

    params = {
        'boosting_type': ['gbdt'],
        'num_leaves': [30, 60, 120],
        'learning_rate': [0.05, 0.1, 0.15, 0.2],
        'n_estimators': [100, 200, 300],
        'max_depth': [-1],
        'min_child_samples': [20, 50, 100],
        'class_weight': ['balanced']
    }


    grid_search = GridSearchCV(lgbm2, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_lgbm2 = grid_search.best_estimator_
    best_lgbm2.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_lgbm2.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [101]:
# Using hashing vectorizer with LightGBM
print(lgbm2_model(X_train_hash, y_train, X_test_hash, y_test))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.2, 'max_depth': -1, 'min_child_samples': 20, 'n_estimators': 200, 'num_leaves': 120}
Best accuracy score: 0.45534750564086607
              precision    recall  f1-score   support

           0     0.1600    0.0727    0.1000        55
           1     0.4667    0.2500    0.3256        28
           2     0.3617    0.3350    0.3478       203
           3     0.4501    0.5544    0.4969       285
           4     0.2143    0.1364    0.1667        22

    accuracy                         0.4047       593
   macro avg     0.3306    0.2697    0.2874       593
weighted avg     0.3850    0.4047    0.3887       593

[[  4   1  12  38   0]
 [  0   7   8  13   0]
 [  4   1  68 127   3]
 [ 17   6  96 158   8]
 [  0   0   4  15   3]]
None


In [102]:
# Using tfidf vectorizer with LightGBM
print(lgbm2_model(X_train_idf, y_train, X_test_idf, y_test))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'n_estimators': 300, 'num_leaves': 120}
Best accuracy score: 0.44162110355852
              precision    recall  f1-score   support

           0     0.1026    0.0727    0.0851        55
           1     0.5000    0.3214    0.3913        28
           2     0.3706    0.3596    0.3650       203
           3     0.4684    0.5193    0.4925       285
           4     0.1304    0.1364    0.1333        22

    accuracy                         0.3997       593
   macro avg     0.3144    0.2819    0.2935       593
weighted avg     0.3899    0.3997    0.3930       593

[[  4   1  19  29   2]
 [  1   9   6  11   1]
 [  9   1  73 116   4]
 [ 24   7  93 148  13]
 [  1   0   6  12   3]]
None


In [103]:
# Using fasttext
print(lgbm2_model(X_train_fast, y_train, X_test_fast, y_test))

Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'n_estimators': 200, 'num_leaves': 120}
Best accuracy score: 0.4570388813048615
              precision    recall  f1-score   support

           0     0.1000    0.0364    0.0533        55
           1     0.3333    0.0357    0.0645        28
           2     0.3663    0.3103    0.3360       203
           3     0.4617    0.6351    0.5347       285
           4     0.0000    0.0000    0.0000        22

    accuracy                         0.4165       593
   macro avg     0.2523    0.2035    0.1977       593
weighted avg     0.3723    0.4165    0.3800       593

[[  2   0  11  42   0]
 [  2   1   6  19   0]
 [  5   0  63 134   1]
 [ 11   2  86 181   5]
 [  0   0   6  16   0]]
None


#### MultinomialNB

In [104]:
def mnb2_model(X_train_vec, y_train, X_test_vec, y_test):
    mnb2 = MultinomialNB()

    params = {'alpha': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 15.0]}

    grid_search = GridSearchCV(mnb2, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_mnb2 = grid_search.best_estimator_
    best_mnb2.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_mnb2.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [105]:
# Using tfidf vectorizer with MultinomialNB
print(mnb2_model(X_train_idf, y_train, X_test_idf, y_test))

Best Hyperparameters: {'alpha': 2.0}
Best accuracy score: 0.5275387433777294
              precision    recall  f1-score   support

           0     1.0000    0.0182    0.0357        55
           1     0.0000    0.0000    0.0000        28
           2     0.4932    0.3547    0.4126       203
           3     0.5090    0.7965    0.6211       285
           4     0.0000    0.0000    0.0000        22

    accuracy                         0.5059       593
   macro avg     0.4004    0.2339    0.2139       593
weighted avg     0.5062    0.5059    0.4430       593

[[  1   0   6  48   0]
 [  0   0   8  20   0]
 [  0   0  72 131   0]
 [  0   0  58 227   0]
 [  0   0   2  20   0]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### MLPClassifier

In [106]:
def mlp2_model(X_train_vec, y_train, X_test_vec, y_test):
    mlp2 = MLPClassifier()

    params = {'hidden_layer_sizes': [(100,), (50, 50), (100, 50)],
          'activation': ['relu', 'tanh', 'logistic'],
          'alpha': [0.0001, 0.001, 0.01]}

    grid_search = GridSearchCV(mlp2, params, cv=5, n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)

    # Use best hyperparameters to fit a model
    best_mlp2 = grid_search.best_estimator_
    best_mlp2.fit(X_train_vec, y_train)

    # Use the fitted model to predict on test data
    y_pred = best_mlp2.predict(X_test_vec)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))

In [107]:
# Using hashing vectorizer with MLPClassifier
print(mlp2_model(X_train_hash, y_train, X_test_hash, y_test))



Best Hyperparameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.47114175372580086
              precision    recall  f1-score   support

           0     0.3750    0.1091    0.1690        55
           1     0.5000    0.2143    0.3000        28
           2     0.4091    0.3103    0.3529       203
           3     0.5051    0.7018    0.5874       285
           4     0.3333    0.2273    0.2703        22

    accuracy                         0.4722       593
   macro avg     0.4245    0.3125    0.3359       593
weighted avg     0.4535    0.4722    0.4430       593

[[  6   0  14  35   0]
 [  0   6   9  13   0]
 [  2   2  63 133   3]
 [  8   4  66 200   7]
 [  0   0   2  15   5]]
None




In [108]:
# Using tfidf vectorizer with MLPClassifier
print(mlp2_model(X_train_idf, y_train, X_test_idf, y_test))



Best Hyperparameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Best accuracy score: 0.5038520572362232
              precision    recall  f1-score   support

           0     0.2353    0.1455    0.1798        55
           1     0.6429    0.3214    0.4286        28
           2     0.4831    0.4236    0.4514       203
           3     0.5442    0.6702    0.6006       285
           4     0.1250    0.0909    0.1053        22

    accuracy                         0.4992       593
   macro avg     0.4061    0.3303    0.3531       593
weighted avg     0.4837    0.4992    0.4840       593

[[  8   1  14  29   3]
 [  0   9   6  13   0]
 [ 11   0  86 104   2]
 [ 15   4  66 191   9]
 [  0   0   6  14   2]]
None




In [109]:
# Using fasttext
print(mlp2_model(X_train_fast, y_train, X_test_fast, y_test))



Best Hyperparameters: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100, 50)}
Best accuracy score: 0.5029132685901019
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        55
           1     0.0000    0.0000    0.0000        28
           2     0.4694    0.1133    0.1825       203
           3     0.4862    0.9263    0.6377       285
           4     0.0000    0.0000    0.0000        22

    accuracy                         0.4840       593
   macro avg     0.1911    0.2079    0.1640       593
weighted avg     0.3943    0.4840    0.3690       593

[[  0   0   3  52   0]
 [  0   0   2  26   0]
 [  0   0  23 180   0]
 [  0   0  20 264   1]
 [  0   0   1  21   0]]
None


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### FastText Supervised Model

In [110]:
# Using supervised model in fasttext directly
train_data = "train_data_2_1.txt"
model_path = "emoji_based_hate_speech_model_2_1_2.bin"

# Define the parameters for training
lr = 0.1 
dim = 100 
epoch = 25

model = fasttext.train_supervised(input=train_data, lr=lr, dim=dim, epoch=epoch)
model.save_model(model_path)

In [111]:
# Evaluate the model with Kirk's data
test_data = "test_data_2_1.txt"

result = model.test(test_data)

print(result)

(591, 0.46362098138747887, 0.46362098138747887)


### Test 1.2.2 Multiclass classifiers that are trained and tested by our own data


- df2: (usingour own annotated data as the training / testing data)\
dataset = TikTok + YouTube dataset (200 messages)

In [16]:
# Load data
df2 = pd.read_csv("../Training_machine_learning_model/Annotate_Dataset/comments_annotated.csv")

# Encoding categorical variables
label_encoder = preprocessing.LabelEncoder() 
df2['Type_Nalin'] = label_encoder.fit_transform(df2['Type_Nalin'])

In [17]:
# Get the type with its type label
df2['type_label'] = label_encoder.inverse_transform(df2['Type_Nalin'])
unique_labels = df2.groupby('Type_Nalin')['type_label'].agg(lambda x: x.unique().tolist())

for type_, label in unique_labels.items():
    print(f"Type: {type_}, Type label: {label}")

Type: 0, Type label: ['accusatory']
Type: 1, Type label: ['acknowledgement']
Type: 2, Type label: ['admiration']
Type: 3, Type label: ['amazement']
Type: 4, Type label: ['amusement']
Type: 5, Type label: ['concerning']
Type: 6, Type label: ['criticism']
Type: 7, Type label: ['derogatory']
Type: 8, Type label: ['discriminatory']
Type: 9, Type label: ['disgust']
Type: 10, Type label: ['excitement']
Type: 11, Type label: ['humorous']
Type: 12, Type label: ['mocking']
Type: 13, Type label: ['rebellious']
Type: 14, Type label: ['respect']
Type: 15, Type label: ['sarcastic']
Type: 16, Type label: ['support']
Type: 17, Type label: ['sweet']
Type: 18, Type label: ['threat']


In [114]:
df2['type_label'].value_counts()

sarcastic          30
criticism          21
amazement          17
admiration         16
mocking            15
humorous           14
amusement          13
support            10
concerning          8
discriminatory      7
acknowledgement     7
derogatory          6
threat              6
sweet               6
rebellious          5
disgust             5
excitement          5
accusatory          5
respect             4
Name: type_label, dtype: int64

In [115]:
X = df2['Comment'].apply((lambda x: preprocessing_text(x)))
y = df2['Type_Nalin']

In [116]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [117]:
# Hashing Vectorizer
X_train_hash = hash_vectorizer.fit_transform(X_train)
X_test_hash = hash_vectorizer.transform(X_test)

# Tfidf Vectorizer
X_train_idf = idf_vectorizer.fit_transform(X_train)
X_test_idf = idf_vectorizer.transform(X_test)

In [118]:
# Process the data for fasttext
df2['train_data'] = '__label__' + df2['Type_Nalin'].astype(str) + ' ' + df2['Comment'].apply(lambda x: (preprocessing_text(x)))
df2['train_data'].to_csv('train_data_2_2.txt', index=False, header=False, sep='\t')

In [119]:
# Using fasttext to train the word vector
train_data = "train_data_2_2.txt"
model_path = "emoji_based_hate_speech_model_2_2.bin"

model = fasttext.train_unsupervised(train_data, dim=100)
model.save_model(model_path)

def encode_text_with_fasttext(series):
    encoded_data = series.apply(lambda x: model.get_sentence_vector(x.replace('\n', ' ')))
    return encoded_data

X_train_fast = np.vstack(encode_text_with_fasttext(X_train).values)
X_test_fast = np.vstack(encode_text_with_fasttext(X_test).values)

#### Logistic Regression

In [120]:
# Using hashing vectorizer
print(lr2_model(X_train_hash, y_train, X_test_hash, y_test))



Best Hyperparameters: {'C': 100.0, 'class_weight': 'balanced', 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}
Best accuracy score: 0.1375
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.6000    0.5000    0.5455         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.5000    0.3333    0.4000         3
           6     0.2857    0.5000    0.3636         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.2500    0.3333    0.2857         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          14     0.0000    0.0000    0.0000         0
          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [121]:
# Using tfidf vectorizer
print(lr2_model(X_train_idf, y_train, X_test_idf, y_test))



Best Hyperparameters: {'C': 10.0, 'class_weight': 'balanced', 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}
Best accuracy score: 0.1625
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.4000    0.3333    0.3636         6
           3     0.2000    0.1667    0.1818         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.5000    0.2500    0.3333         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.3333    0.3333    0.3333         3
          12     0.1667    0.5000    0.2500         2
          13     0.0000    0.0000    0.0000         1
          14     0.0000    0.0000    0.0000         0
          1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
# Using fasttext
print(lr2_model(X_train_fast, y_train, X_test_fast, y_test))



Best Hyperparameters: {'C': 100.0, 'class_weight': 'balanced', 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}
Best accuracy score: 0.1375
              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667         1
           1     0.0000    0.0000    0.0000         2
           2     1.0000    0.3333    0.5000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.0000    0.0000    0.0000         4
           7     0.0000    0.0000    0.0000         1
           8     0.0000    0.0000    0.0000         0
           9     0.5000    1.0000    0.6667         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### SVM

In [123]:
# Using hashing vectorizer
print(svm2_model(X_train_hash, y_train, X_test_hash, y_test))



Best Hyperparameters: {'C': 1, 'class_weight': 'balanced', 'decision_function_shape': 'ovr', 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.1625
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.4000    0.6667    0.5000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.3000    0.7500    0.4286         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          14     0.0000    0.0000    0.0000         0
          15     0.0833    0.2500

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [124]:
# Using tfidf vectorizer
print(svm2_model(X_train_idf, y_train, X_test_idf, y_test))



Best Hyperparameters: {'C': 10, 'class_weight': 'balanced', 'decision_function_shape': 'ovr', 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.15625
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.1667    0.1667    0.1667         6
           3     0.2000    0.1667    0.1818         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.5000    0.2500    0.3333         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.1667    0.5000    0.2500         2
          13     0.0000    0.0000    0.0000         1
          14     0.0000    0.0000    0.0000         0
          15     0.1429    0.25

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [125]:
# Using fasttext
print(svm2_model(X_train_fast, y_train, X_test_fast, y_test))



Best Hyperparameters: {'C': 10, 'class_weight': 'balanced', 'decision_function_shape': 'ovr', 'gamma': 'scale', 'kernel': 'rbf'}
Best accuracy score: 0.13125
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.5000    0.1667    0.2500         6
           3     0.5000    0.1667    0.2500         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.5000    0.2500    0.3333         4
           7     0.0000    0.0000    0.0000         1
           8     0.0000    0.0000    0.0000         0
           9     1.0000    1.0000    1.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.2000    0.3333    0.2500         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          14     0.0000    0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Decision Tree Classifier

In [126]:
# Using hashing vectorizer
print(dtc2_model(X_train_hash, y_train, X_test_hash, y_test))



Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best accuracy score: 0.16875
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.7500    0.5000    0.6000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.1667    0.2500    0.2000         4
           7     0.0000    0.0000    0.0000         1
           8     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.3333    0.5000    0.4000         2
          13     0.0000    0.0000    0.0000         1
          14     0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [127]:
# Using tfidf vectorizer
print(dtc2_model(X_train_idf, y_train, X_test_idf, y_test))



Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 20}
Best accuracy score: 0.1125
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.0000    0.0000    0.0000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.1667    0.3333    0.2222         3
           6     0.0667    0.2500    0.1053         4
           7     0.0000    0.0000    0.0000         1
           8     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          14     0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [128]:
# Using fasttext
print(dtc2_model(X_train_fast, y_train, X_test_fast, y_test))



Best Hyperparameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best accuracy score: 0.11875
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.0000    0.0000    0.0000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     1.0000    0.6667    0.8000         3
           6     0.0000    0.0000    0.0000         4
           7     0.0000    0.0000    0.0000         1
           8     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          14     0.000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Random Forest Classifier

In [129]:
# Using hashing vectorizer
print(rf2_model(X_train_hash, y_train, X_test_hash, y_test))



Best Hyperparameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best accuracy score: 0.21875
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     1.0000    0.1667    0.2857         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.1111    0.2500    0.1538         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.4000    0.6667    0.5000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          15     0.0526    0.2500    0.0870         4
          16     0.000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [130]:
# Using tfidf vectorizer
print(rf2_model(X_train_idf, y_train, X_test_idf, y_test))



Best Hyperparameters: {'class_weight': 'balanced', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best accuracy score: 0.15625
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.3333    0.1667    0.2222         6
           3     0.5000    0.1667    0.2500         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.1818    0.5000    0.2667         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.1429    0.5000    0.2222         2
          13     0.0000    0.0000    0.0000         1
          14     0.0000    0.0000    0.0000         0
          15     0.0000  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [131]:
# Using fasttext
print(rf2_model(X_train_fast, y_train, X_test_fast, y_test))



Best Hyperparameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy score: 0.16875
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.4000    0.3333    0.3636         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.0000    0.0000    0.0000         4
           7     0.0000    0.0000    0.0000         1
           8     0.0000    0.0000    0.0000         0
           9     0.3333    1.0000    0.5000         1
          10     0.0000    0.0000    0.0000         2
          11     0.2500    0.3333    0.2857         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          14     0.000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### LightGBM

In [132]:
# Using hashing vectorizer
print(lgbm2_model(X_train_hash, y_train, X_test_hash, y_test))



Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.15, 'max_depth': -1, 'min_child_samples': 20, 'n_estimators': 200, 'num_leaves': 30}
Best accuracy score: 0.1
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.0000    0.0000    0.0000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.0000    0.0000    0.0000         4
           7     0.1429    1.0000    0.2500         1
           8     0.0000    0.0000    0.0000         0
           9     0.3333    1.0000    0.5000         1
          10     0.0000    0.0000    0.0000         2
          11     0.5000    0.6667    0.5714         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [133]:
# Using tfidf vectorizer
print(lgbm2_model(X_train_idf, y_train, X_test_idf, y_test))



Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 100, 'n_estimators': 100, 'num_leaves': 30}
Best accuracy score: 0.075
              precision    recall  f1-score   support

           0     0.0250    1.0000    0.0488         1
           1     0.0000    0.0000    0.0000         2
           2     0.0000    0.0000    0.0000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.0000    0.0000    0.0000         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          15     0.0000    0.0000  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [134]:
# Using fasttext
print(lgbm2_model(X_train_fast, y_train, X_test_fast, y_test))



Best Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.2, 'max_depth': -1, 'min_child_samples': 50, 'n_estimators': 100, 'num_leaves': 30}
Best accuracy score: 0.175
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.0000    0.0000    0.0000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.3333    0.2500    0.2857         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          14     0.0000    0.0000    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### MultinomialNB

In [135]:
# Using tfidf vectorizer
print(mnb2_model(X_train_idf, y_train, X_test_idf, y_test))

Best Hyperparameters: {'alpha': 0.5}
Best accuracy score: 0.21875
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.5000    0.1667    0.2500         6
           3     0.3333    0.1667    0.2222         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.6667    0.5000    0.5714         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.5000    0.3333    0.4000         3
          12     0.3333    0.5000    0.4000         2
          13     0.0000    0.0000    0.0000         1
          15     0.1364    0.7500    0.2308         4
          16     0.3333    1.0000    0.5000         1
          17     0.0000    0.0000    0.0000         2

    accuracy  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### MLPClassifier

In [136]:
# Using hashing vectorizer
print(mlp2_model(X_train_hash, y_train, X_test_hash, y_test))





Best Hyperparameters: {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50)}
Best accuracy score: 0.18125
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.0000    0.0000    0.0000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.4000    0.5000    0.4444         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          15     0.1143    1.0000    0.2051         4
          16     0.0000    0.0000    0.0000         1
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [137]:
# Using tfidf vectorizer
print(mlp2_model(X_train_idf, y_train, X_test_idf, y_test))





Best Hyperparameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50)}
Best accuracy score: 0.25
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.0000    0.0000    0.0000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.3333    0.2500    0.2857         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     1.0000    0.5000    0.6667         2
          13     0.0000    0.0000    0.0000         1
          15     0.0800    0.5000    0.1379         4
          16     0.5000    1.0000    0.6667         1
          17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [138]:
# Using fasttext
print(mlp2_model(X_train_fast, y_train, X_test_fast, y_test))



Best Hyperparameters: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50)}
Best accuracy score: 0.19375
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.0000    0.0000    0.0000         2
           2     0.0000    0.0000    0.0000         6
           3     0.0000    0.0000    0.0000         6
           4     0.0000    0.0000    0.0000         1
           5     0.0000    0.0000    0.0000         3
           6     0.1667    0.2500    0.2000         4
           7     0.0000    0.0000    0.0000         1
           9     0.0000    0.0000    0.0000         1
          10     0.0000    0.0000    0.0000         2
          11     0.0000    0.0000    0.0000         3
          12     0.0000    0.0000    0.0000         2
          13     0.0000    0.0000    0.0000         1
          15     0.1429    0.7500    0.2400         4
          16     0.0000    0.0000    0.0000         1
          17 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
