# A pre-trained BERT model for humor detection
https://github.com/Moradnejad/ColBERT-Using-BERT-Sentence-Embedding-for-Humor-Detection

In [None]:
# Load the meme-classification training table and preview it.
# NOTE(review): the path is an absolute Google Drive mount point; this cell assumes
# Drive has already been mounted in the runtime.
import pandas as pd
df_train = pd.read_csv('/content/drive/MyDrive/meme_classification_data/train.csv')
df_train.head(3)  # bare last expression -> rich display of the first 3 rows

Unnamed: 0,ID,image id,text,label,label_num
0,1,image_2455.jpg,- It is not our fight - Are we not part of thi...,troll,2
1,2,image_3701.jpg,THAT'S THE DIFFERENCE BETWEEN YOU AND ME YOU...,none,0
2,3,image_4166.png,- WHAT DO THE TITANIC AND THE SIXTH SENSE HAVE...,none,0


# Using the pre-trained model for humor classification

## Loading the model

In [None]:
# Download the model from the GitHub repo and make sure the directory structure matches
# the path below; otherwise adjust the path.
import keras
model = keras.models.load_model("/content/drive/MyDrive/models/colbert-trained/")
model.summary()  # prints the layer-by-layer architecture of the loaded model

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           [(None, 20)]         0                                            
_______________________________________________________________________________________

## Labelling the data for the model

In [None]:
#We drop the troll samples and use the classifier to test how well it performs in classification of humorous vs not humorous(none label)
def hu(label_num):
  """Translate a numeric label into the text flag stored in the 'Humor' column.

  Returns the *string* 'False' when ``label_num`` equals 0 and the *string*
  'True' for any other value (these are strings, not booleans).
  """
  return 'False' if label_num == 0 else 'True'

# Keep only the rows whose label_num is not 2 (the troll samples are dropped).
# .copy() materialises an independent frame so that adding the 'Humor' column below
# writes to this frame directly instead of to a slice of the original, which is what
# triggers pandas' SettingWithCopyWarning.
df_train = df_train[df_train['label_num'] != 2].copy()
# Derive the string flag ('True' / 'False') from the numeric label via hu().
df_train['Humor'] = df_train['label_num'].apply(hu)
df_train.shape

(1293, 6)

## Pre-processing the text for the model

In [None]:
import subprocess
from ast import literal_eval
def run(command):
    """Execute ``command`` through the shell and print what it wrote to stdout.

    Only stdout is piped; stderr is not captured. The captured bytes are decoded
    as UTF-8 and stripped of surrounding whitespace before being printed.
    """
    completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE)
    print(completed.stdout.decode('utf-8').strip())

# Print a short hardware / OS report for the current runtime. Each section is a
# heading followed by the shell commands whose output is echoed through run().
for heading, commands in (
    ('# CPU', ('cat /proc/cpuinfo | egrep -m 1 "^model name"',
               'cat /proc/cpuinfo | egrep -m 1 "^cpu MHz"',
               'cat /proc/cpuinfo | egrep -m 1 "^cpu cores"')),
    ('# RAM', ('cat /proc/meminfo | egrep "^MemTotal"',)),
    ('# GPU', ('lspci | grep VGA',)),
    ('# OS', ('uname -a',)),
):
    print(heading)
    for command in commands:
        run(command)
!pip install sentencepiece
!pip install transformers
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow import keras 

import os
from scipy.stats import spearmanr
from math import floor, ceil
from transformers import *

import seaborn as sns
import string
import re    #for regex

np.set_printoptions(suppress=True)  # print floats in fixed-point rather than scientific notation
print(tf.__version__)
# NOTE(review): these two counts are not referenced by any other cell visible in this
# notebook; presumably left over from the original ColBERT notebook -- confirm before removing.
training_sample_count = 1991 
test_count = 600

# Token budget for each per-sentence encoding (passed to return_id as `length`).
MAX_SENTENCE_LENGTH = 20
# Number of sentence slots encoded per text; missing sentences are encoded as ''.
MAX_SENTENCES = 5
# Token budget for the encoding of the full text.
MAX_LENGTH = 100



# CPU
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
cpu MHz		: 2199.998
cpu cores	: 1
# RAM
MemTotal:       13333568 kB
# GPU

# OS
Linux 5826f3470415 4.19.112+ #1 SMP Thu Jul 23 08:00:38 PDT 2020 x86_64 x86_64 x86_64 GNU/Linux
2.4.1


In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer

# Column names: the text column fed to the encoder and the label column.
input_categories = ['text']
output_categories = ['Humor']

# WordPiece tokenizer matching the uncased BERT base checkpoint.
MODEL_TYPE = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

# Sentence-splitting data required by sent_tokenize.
nltk.download('punkt')

def return_id(str1, str2, truncation_strategy, length):
    """Encode text with the module-level ``tokenizer`` and right-pad it to ``length``.

    Args:
        str1: First text to encode.
        str2: Optional second text (``None`` encodes ``str1`` alone).
        truncation_strategy: Truncation mode forwarded to the tokenizer,
            e.g. ``'longest_first'``.
        length: Maximum number of tokens kept by the tokenizer; the three
            returned lists are padded up to this length.

    Returns:
        ``[input_ids, input_masks, input_segments]`` as Python lists:
        token ids padded with ``tokenizer.pad_token_id``, an attention mask
        (1 for real tokens, 0 for padding) and token-type ids padded with 0.
    """
    # `truncation=` is the supported keyword. The legacy `truncation_strategy=` kwarg
    # is deprecated and makes the tokenizer warn that truncation "was not explicitly
    # activated" even though `max_length` is given.
    inputs = tokenizer.encode_plus(str1, str2,
        add_special_tokens=True,
        max_length=length,
        truncation=truncation_strategy)

    input_ids = inputs["input_ids"]
    input_masks = [1] * len(input_ids)
    input_segments = inputs["token_type_ids"]

    # Right-pad all three sequences so every example has the same length.
    padding_length = length - len(input_ids)
    padding_id = tokenizer.pad_token_id
    input_ids = input_ids + ([padding_id] * padding_length)
    input_masks = input_masks + ([0] * padding_length)
    input_segments = input_segments + ([0] * padding_length)

    return [input_ids, input_masks, input_segments]


def compute_input_arrays(df, columns, tokenizer):
    """Build the list of input arrays for the model from the text column of ``df``.

    For every row the text is split into sentences; the first ``MAX_SENTENCES``
    sentences (missing ones are replaced by ``''``) are each encoded with
    ``return_id`` at ``MAX_SENTENCE_LENGTH`` tokens, followed by one encoding of
    the full text at ``MAX_LENGTH`` tokens.

    Args:
        df: DataFrame containing the columns named in ``columns``; the text is
            read from the ``text`` attribute of each row.
        columns: Column names selected from ``df`` before iterating.
        tokenizer: Accepted for interface compatibility; it is not used in this
            body (encoding goes through ``return_id``).

    Returns:
        A list of ``(MAX_SENTENCES * 3) + 3`` int32 arrays ordered as
        ids, masks, segments for each sentence slot and then for the full text.
        The shape of the first array is printed as a sanity check.
    """
    n_arrays = (MAX_SENTENCES * 3) + 3
    model_input = [[] for _ in range(n_arrays)]

    # total= lets tqdm render a real progress bar instead of an indeterminate one.
    for _, row in tqdm(df[columns].iterrows(), total=len(df)):
        # Missing text (e.g. NaN read from the CSV) is not a string and would make
        # sent_tokenize raise; treat it as empty text instead.
        text = row.text if isinstance(row.text, str) else ''

        encoded = []

        # One (ids, masks, segments) triple per sentence slot.
        sentences = sent_tokenize(text)
        for xx in range(MAX_SENTENCES):
            s = sentences[xx] if xx < len(sentences) else ''
            encoded.extend(return_id(s, None, 'longest_first', MAX_SENTENCE_LENGTH))

        # One final triple for the full text.
        encoded.extend(return_id(text, None, 'longest_first', MAX_LENGTH))

        for slot, values in zip(model_input, encoded):
            slot.append(values)

    model_input = [np.asarray(values, dtype=np.int32) for values in model_input]

    print(model_input[0].shape)
    return model_input

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Encode the text column of the filtered training frame into the list of int32
# arrays returned by compute_input_arrays.
train_inputs = compute_input_arrays(df_train, input_categories, tokenizer)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



(1293, 20)


## Performance of the pre-trained model

In [None]:
from sklearn.metrics import classification_report

# Score every encoded training sample with the loaded model.
y_pred = model.predict(train_inputs)
# NOTE(review): the decision threshold is the mean of the predictions on this same
# data, not a fixed value, so the reported metrics depend on the score distribution
# of the evaluated set -- confirm this is intended.
cutoff = y_pred.mean()
y_pred = np.where(y_pred > cutoff, 1, 0)  # binarise: 1 above the cutoff, else 0
y_true = df_train['label_num'].values

print(classification_report(y_true,y_pred))


              precision    recall  f1-score   support

           0       0.47      0.16      0.24       604
           1       0.53      0.84      0.65       689

    accuracy                           0.52      1293
   macro avg       0.50      0.50      0.45      1293
weighted avg       0.51      0.52      0.46      1293



# Extracting features from the pre-trained model and training on the data for humor classification

## Extracting features

In [None]:
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop,Adam
# Build a model that shares the loaded model's inputs but stops at its
# second-to-last layer, so predict() returns that layer's activations.
# NOTE(review): this rebinds `model`, so the cell is not idempotent -- running it a
# second time truncates the already-truncated model. Re-run the model-loading cell
# before re-running this one.
model= Model(inputs=model.input, outputs=model.layers[-2].output)
model.trainable = False  # freeze the weights; the model is only used for inference here
model.summary()
# The output of the second-to-last layer is used as the feature vector for the
# classifiers trained in the following sections.

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           [(None, 20)]         0                                            
______________________________________________________________________________________________

In [None]:
# Run the truncated model on the encoded texts to obtain one feature vector per sample.
humor_pred_feats = model.predict(train_inputs)
X = humor_pred_feats
# Targets: the numeric labels of the filtered training frame.
y = df_train['label_num'].values
X.shape, y.shape

((1293, 256), (1293,))

In [None]:
# Class-balance check: number of non-zero labels next to the total number of samples.
np.count_nonzero(y), y.shape[0]
# The two classes are roughly balanced.

(689, 1293)

## Training a neural network on the features

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Take 12.5% of the remaining 80% (i.e. 10% of all samples) as the validation set,
# leaving 70% of the samples for training.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop,Adam
# Small fully connected binary classifier on top of the 256-dimensional features:
# 256 -> 20 (ReLU) -> 10 (ReLU) -> 1 (sigmoid).
model = Sequential([
    Dense(20, activation='relu', input_shape=(256,), kernel_initializer='he_normal'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 20)                5140      
_________________________________________________________________
dense_6 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 5,361
Trainable params: 5,361
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy and AUC are
# tracked during training.
model.compile(loss='binary_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy','AUC'])


# NOTE(review): no random seed is set in the cells visible here, so the fitted
# weights (and the metrics below) may differ between runs.
history = model.fit(X_train, y_train,
                    batch_size=32,
                    epochs=60,
                    verbose=1,
                    validation_data=(X_val, y_val))


In [None]:
from sklearn.metrics import classification_report
# Score the held-out test features with the trained classifier.
y_pred = model.predict(X_test, batch_size=64, verbose=1)
# NOTE(review): the threshold is the mean of the test-set predictions rather than a
# fixed value, so the decision rule depends on the evaluated data -- confirm intended.
p = y_pred.mean()
y_pred = np.where(y_pred >= p , 1 , 0)
# Report the test-set size next to the number of predicted positives. The count is
# computed on the test predictions, so it is compared with y_test, not the full y.
print(y_test.shape, np.count_nonzero(y_pred))
print(classification_report(y_test,y_pred))

(1293,) 122
              precision    recall  f1-score   support

           0       0.46      0.54      0.50       116
           1       0.57      0.48      0.52       143

    accuracy                           0.51       259
   macro avg       0.51      0.51      0.51       259
weighted avg       0.52      0.51      0.51       259



## Training SVM and XGBoost classifiers

In [None]:
from sklearn.model_selection import train_test_split
# Same arguments as the first split in the neural-network section; here the
# non-test 80% is used for training as a whole (no separate validation subset).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape,X_test.shape

((1034, 256), (259, 256))

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

### SVM

In [None]:
# Kernels tried (alternatives kept for reference):
#model = SVC(kernel = 'rbf') # 0.02 f1 on class 0
model = SVC(kernel = 'linear') # Best performance of the kernels tried
#model = SVC(kernel = 'poly', degree = 8)
# Fit on the training features and report per-class metrics on the test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.33      0.41       116
           1       0.59      0.77      0.66       143

    accuracy                           0.57       259
   macro avg       0.56      0.55      0.54       259
weighted avg       0.56      0.57      0.55       259



### XGB Classifier

In [None]:
# Gradient-boosted trees with the library's default hyperparameters.
model = XGBClassifier()  # default settings, no tuning
# Fit on the training features and report per-class metrics on the test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.47      0.49       116
           1       0.60      0.64      0.62       143

    accuracy                           0.56       259
   macro avg       0.56      0.56      0.56       259
weighted avg       0.56      0.56      0.56       259



# Saving the features

In [None]:
# Persist the extracted feature matrix, one integer-named column per feature.
# The column count is taken from X itself so the cell keeps working if a layer of a
# different width is extracted.
# NOTE(review): the rows follow the filtered df_train (troll samples removed) and no
# ID column is written, so the file cannot be joined back to train.csv by key --
# confirm downstream consumers rely on row order only.
humor_train = pd.DataFrame(X, columns = range(X.shape[1]))
humor_train.to_csv('/content/drive/MyDrive/meme_classification_data/train_humor_feats.csv', index=False)