In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import json

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from collections import defaultdict

from googletrans import Translator
from nltk import word_tokenize
import string

import json
import requests
import sys
import hashlib
import time

# Model creation functions

### Load the embedding layer

In [3]:
def load_embedding_from_disk(embedding_filename, len_vec, with_indexes=True):
    """
    Read a GloVe txt file. and return the dictionary and/or list of embedding matrix
    input: 
        glove_filename: str, the path of glove file
        len_vec: int, a number to indicate the dimension of emedding layer
        with_indexes: boolean, if it's True, it return a tuple of two dictionnaries 
                      (word_to_index_dict, index_to_embedding_array);
                      otherwise, it return only a dictionary object, mapping from a string to a numpy array
    """
    print("Loading embedding from disks...")
    if with_indexes:
        word_to_index_dict = dict()    # key: word_string; value: index
        index_to_embedding_array = []    # matrix, each row representes the embedding array of word with corresponding index
    else:
        word_to_embedding_dict = dict()   # key: word; value: embedding array
        
    with open(embedding_filename, 'r', encoding='utf-8') as glove_file:
        for (i, line) in enumerate(glove_file):
            # for each row, the values are separated by white space. The first element is word and followed by its embedding array
            split = line.split(" ")  
            
            # if we have length of split under a certain threshold, we discard the line
            if len(split) < 5:
                continue
            
            # if we have length of split more than give dimension, we cut it up to len_vec
            # plus 1 because in split it has also a element that store the word
            if len(split) > len_vec+1:
                split = split[:len_vec+1]
            
            word = split[0]
            representation = split[1:]
            representation = np.array([float(val) for val in representation])
            
            # if we have len of dimension less than given number, we add the average list unless they have same length
            if len(representation) < len_vec:
                representation = np.append(representation, [np.mean(representation)]*(len_vec-len(representation)))
                
            if with_indexes:
                word_to_index_dict[word] = i
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representatihon
                
        _WORD_NOT_FOUND = [0.0] * len(representation)    # empty representation for unknow words
        
        if with_indexes:
            _LAST_INDEX = i + 1
            word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)
            index_to_embedding_array = np.array(index_to_embedding_array + [_WORD_NOT_FOUND])  # any unknow word, it will find the last index of the this directory
            print("Embedding loaded from disks. Return word-index dictionary and embedding matrix")
            return word_to_index_dict, index_to_embedding_array
        else:
            word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND)
            print("Embedding loaded from disks. Return word-embedding array dictionary")
            return word_to_embedding_dict

In [4]:
def save_var_matrix_as_tf(matrix, ckpt_path, name='Embedding'):
    """
    save the matrix of emebedded value as Tensorflow Variable. It can be used to load it directly from the next times
    input:
        matrix: list of lists, row, the embedded array of a word 
        ckpt_path: str, the path and the name of checkpoint file
        name: str, the name of tensorflow variable that you want associate to 
    """
    print("saving the checkpoint file...")
    tf_embedding = tf.Variable(tf.constant(0.0, shape=matrix.shape),
                               trainable=False,
                               name=name)

    # now, we will store embedding in Tensorflow
    tf_embedding.assign(matrix)

    embedding_saver = tf.saved_model.save(tf_embedding, export_dir=ckpt_path)
    print("checkpoint file saved.")

In [5]:
def save_json(value, json_path):
    """
    save the word-index dictionary into json format
    input:
        value, dict, with word as key and index number as value
        json_path: str, the path and name of json file
    """
    print("saving the json file...")
    if not os.path.exists(json_path):
        os.makedirs(os.path.dirname(json_path))
    with open(json_path, 'w') as f:
        json.dump(value, f)
        
    print("json file saved")

In [6]:
def load_word_to_index(dict_word_index_filename):
    """
    Load a `word_to_index` dict mapping words to their id, with a default value
    of pointing to the last index when not found, which is the unknown word.
    input: 
        dict_word_index_filename: str, the filename and path to load the word-index dictionary
    """
    with open(dict_word_index_filename, 'r') as f:
        word_to_index = json.load(f)
    _LAST_INDEX = len(word_to_index)
    print("word_to_index dict restored from '{}'.".format(dict_word_index_filename))
    word_to_index = defaultdict(lambda: _LAST_INDEX, word_to_index)

    return word_to_index

def load_embedding_tf(tf_embeddings_file_path):
    """
    load the embedding matrix from saved variable
    """
    embedding_saver = tf.saved_model.load(export_dir=tf_embeddings_file_path)
    print("TF embeddings restored from '{}'.".format(tf_embeddings_file_path))
    
    return embedding_saver

In [7]:
def pretrained_model_to_embedding_layer(pretrained_model, mod_dim, json_path, checkpoint_path):
    """
    load the pre-trained model to the tensorflow embedding layer. 
    input:
        pretrained_model: str, the path and the name of file contains the row with word and list of numbers
        mod_dim: int, the number that indicates the dimensions (columns) of embedding matrix.
        json_path: str, the path and name of json file, it has the structure "word: index"
        checkpoint_path: str, the path and name of ckpt file, pre-trained model saved as tensorflow variable  
    """
    
    word_to_index = None
    index_to_embedding = None
    
    # if we have already the json file and the chechpoint file, then we can load directly from them
    if os.path.exists(json_path) and os.path.exists(checkpoint_path):
        print("load from json and checkpoint files")
        word_to_index = load_word_to_index(json_path)
        index_to_embedding = load_embedding_tf(checkpoint_path).numpy()
    else:
        word_to_index, index_to_embedding = load_embedding_from_disk(pretrained_model, mod_dim, with_indexes=True)
        save_json(word_to_index, json_path)
        save_var_matrix_as_tf(index_to_embedding, ckpt_path=checkpoint_path)
        
    
    embedding_layer = tf.keras.layers.Embedding(len(index_to_embedding),
                                                mod_dim,
                                                weights=[index_to_embedding],
                                                trainable=False)

    
    return embedding_layer, word_to_index, index_to_embedding

In [8]:
def create_pad_sequences(list_sentences, word_index_dic, max_sequence_length):
    sequence_matrix= np.array(list(map(lambda x: np.array([word_index_dic[i] for i in x]), list_sentences)))
    data_padded = pad_sequences(sequence_matrix, maxlen=max_sequence_length)
    
    return sequence_matrix, data_padded

###  model creation function

In [9]:
def create_model(embedding_filename, embedding_dim, embedding_json, embedding_ckpt, max_sequence_length, labels_index):
    embedding_layer, word_to_index, index_to_embedding = pretrained_model_to_embedding_layer(embedding_filename, 
                                                                                             embedding_dim, 
                                                                                             embedding_json, 
                                                                                             embedding_ckpt)    

    sequence_input = tf.keras.Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    
    convs = []
    filter_sizes = [2,3,4,5,6]
    
    for filter_size in filter_sizes:
        l_conv = tf.keras.layers.Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = tf.keras.layers.GlobalMaxPool1D()(l_conv)
        convs.append(l_pool)
        
    l_merge = tf.keras.layers.concatenate(convs, axis=1)
    
    x = tf.keras.layers.Dropout(0.1)(l_merge)  
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    preds = tf.keras.layers.Dense(labels_index, activation='softmax')(x)    # sigmoid

    """
    x = tf.keras.layers.Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = tf.keras.layers.MaxPooling1D(5)(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    """
    model = tf.keras.Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model

# English Part 

In [10]:
glove_filename = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/glove.6B/glove.txt"
embedding_ckpt = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/glove.6B/glove/var_checpoint"
embedding_json = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/glove.6B/glove/var.json"

checkpoint_path = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/checkpoint/"
model_save_path = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/model_saved"
model_name = "eng_joke.h5"

data_file_path = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/root/dataset/final/short_jokes.pickle"

embedding_dim = 50
max_sequence_length = 200

### Load data

In [15]:
df = pd.read_pickle(data_file_path)
df = df.sample(frac=1)
df.head(2)

Unnamed: 0,Joke,label,Text_Clean_Punct,Final_with_stopword,tokens_with_stopword,len_tokens_with_stopword,Final_without_stopword,tokens_without_stopword,len_tokens_without_stopword
405278,martin ellis added the attraction of shopping ...,0,martin ellis added the attraction of shopping ...,martin ellis added the attraction of shopping ...,"[martin, ellis, added, the, attraction, of, sh...",29,martin ellis added attraction shopping leisure...,"[martin, ellis, added, attraction, shopping, l...",16
261267,and today a judge in state supreme court on st...,0,and today a judge in state supreme court on st...,and today a judge in state supreme court on st...,"[and, today, a, judge, in, state, supreme, cou...",28,today judge state supreme court staten island ...,"[today, judge, state, supreme, court, staten, ...",16


#### Split in train dataset and test dataset(using Sklearn)

In [16]:
data_train, data_test = train_test_split(df, test_size=0.1, random_state=50)

In [17]:
all_training_words = [word for tokens in data_train["tokens_with_stopword"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens_with_stopword"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

10540455 words total, with a vocabulary size of 85278
Max sentence length is 268


In [18]:
all_test_words = [word for tokens in data_test["tokens_with_stopword"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens_with_stopword"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

1168836 words total, with a vocabulary size of 34989
Max sentence length is 87


### Create model

In [19]:
model = create_model(glove_filename, embedding_dim, embedding_json, embedding_ckpt, max_sequence_length, 2)

load from json and checkpoint files
word_to_index dict restored from '/Users/rosalina_chen/Desktop/humor_recognition_ver3/glove.6B/glove/var.json'.
TF embeddings restored from '/Users/rosalina_chen/Desktop/humor_recognition_ver3/glove.6B/glove/var_checpoint'.
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 50)      20000050    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 199, 200)     20200       embedding[0][0]                  
________________________________

#### create the pad sequences

In [20]:
word_to_index = load_word_to_index(embedding_json)
#index_to_embedding = load_embedding_tf(embedding_ckpt).numpy()

word_to_index dict restored from '/Users/rosalina_chen/Desktop/humor_recognition_ver3/glove.6B/glove/var.json'.


##### training dataset

In [21]:
seq_train, train_data_padded = create_pad_sequences(data_train["tokens_with_stopword"].tolist(), word_to_index, max_sequence_length)

In [22]:
train_data_padded.shape

(416982, 200)

##### test dataset

In [23]:
seq_test, test_data_padded = create_pad_sequences(data_test["tokens_with_stopword"].tolist(), word_to_index, max_sequence_length)

In [24]:
test_data_padded.shape

(46332, 200)

#### label of dataset

In [25]:
label_array = [[1, 0]if v == 1 else [0, 1] for v in data_train['label'].values]

In [26]:
label_array = np.array(label_array)
label_array.shape

(416982, 2)

### Training Model

In [27]:
num_epoches = 10
batch_size = 512

In [28]:
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)
    
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)
hist = model.fit(train_data_padded, label_array, epochs=num_epoches, validation_split=0.1, shuffle=True, batch_size=batch_size, callbacks=[cp_callback])

Train on 375283 samples, validate on 41699 samples
Epoch 1/10
Epoch 00001: saving model to /Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/checkpoint/
Epoch 2/10
Epoch 00002: saving model to /Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/checkpoint/
Epoch 3/10
Epoch 00003: saving model to /Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/checkpoint/
Epoch 4/10
Epoch 00004: saving model to /Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/checkpoint/
Epoch 5/10
Epoch 00005: saving model to /Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/checkpoint/
Epoch 6/10
Epoch 00006: saving model to /Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/checkpoint/
Epoch 7/10
Epoch 00007: saving model to /Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/eng_nlp/checkpoint/
Epoch 8/10
Epoch 00008: saving model to /Users/rosalina_chen/Deskto

In [29]:
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)
    
model.save(os.path.join(model_save_path, model_name))

### Test CNN

In [30]:
predictions = model.predict(test_data_padded, batch_size=512, verbose=0)

In [31]:
labels = [1, 0]

In [32]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [33]:
sum(data_test.label==prediction_labels)/len(prediction_labels)

0.9945178278511612

In [34]:
data_test.head()

Unnamed: 0,Joke,label,Text_Clean_Punct,Final_with_stopword,tokens_with_stopword,len_tokens_with_stopword,Final_without_stopword,tokens_without_stopword,len_tokens_without_stopword
156556,What tick likes to run? Politicks,1,What tick likes to run Politicks,what tick likes to run ? politicks,"[what, tick, likes, to, run, ?, politicks]",7,tick likes run ? politicks,"[tick, likes, run, ?, politicks]",5
19679,Instead of politely knocking on the bathroom d...,1,Instead of politely knocking on the bathroom d...,instead of politely knocking on the bathroom d...,"[instead, of, politely, knocking, on, the, bat...",23,"instead politely knocking bathroom door , kid ...","[instead, politely, knocking, bathroom, door, ...",15
136817,What's better than double-fisting a newborn? H...,1,Whats better than doublefisting a newborn HADO...,what 's better than double-fisting a newborn ?...,"[what, 's, better, than, double-fisting, a, ne...",12,'s better double-fisting newborn ? hadouken ! ! !,"['s, better, double-fisting, newborn, ?, hadou...",9
182687,The American people should elect Gabe Newell p...,1,The American people should elect Gabe Newell p...,the american people should elect gabe newell p...,"[the, american, people, should, elect, gabe, n...",32,american people elect gabe newell president 20...,"[american, people, elect, gabe, newell, presid...",20
353616,judge marilyn patel of the us district court f...,0,judge marilyn patel of the us district court f...,judge marilyn patel of the us district court f...,"[judge, marilyn, patel, of, the, us, district,...",31,judge marilyn patel us district court northern...,"[judge, marilyn, patel, us, district, court, n...",21


#### Test using other dataset

In [35]:
new_model = tf.keras.models.load_model(os.path.join(model_save_path, model_name))

new_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 50)      20000050    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 199, 200)     20200       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 198, 200)     30200       embedding[0][0]                  
______________________________________________________________________________________________

In [36]:
test_filename = "/Users/rosalina_chen/Desktop/humor_recognition_ver3/root/dataset/final/shorttext.pickle"
data_out = pd.read_pickle(test_filename)
data_out = data_out.sample(frac=1)
data_out.head()

Unnamed: 0,sentences,label,Text_Clean_Punct,Final_with_stopword,tokens_with_stopword,len_tokens_with_stopword,Final_without_stopword,tokens_without_stopword,len_tokens_without_stopword
16628,Merkel says supports some kind of no-fly zone ...,0,Merkel says supports some kind of nofly zone i...,merkel says supports some kind of nofly zone i...,"[merkel, says, supports, some, kind, of, nofly...",10,merkel says supports kind nofly zone syria,"[merkel, says, supports, kind, nofly, zone, sy...",7
17609,"They have two children , Shane and Stephanie ,...",0,They have two children Shane and Stephanie ...,they have two children shane and stephanie who...,"[they, have, two, children, shane, and, stepha...",12,two children shane stephanie work wwe,"[two, children, shane, stephanie, work, wwe]",6
6811,I've taken up speed reading. I can read 'War a...,1,Ive taken up speed reading I can read War and...,ive taken up speed reading i can read war and ...,"[ive, taken, up, speed, reading, i, can, read,...",22,ive taken speed reading read war peace 20 seco...,"[ive, taken, speed, reading, read, war, peace,...",12
1092,"Knock, Knock. Who's there? Francis. Francis wh...",1,Knock Knock Whos there Francis Francis who...,knock knock whos there francis francis who fra...,"[knock, knock, whos, there, francis, francis, ...",11,knock knock whos francis francis francis next ...,"[knock, knock, whos, francis, francis, francis...",8
11765,A year in space: Scott Kelly and Mikhail Korni...,0,A year in space Scott Kelly and Mikhail Kornie...,a year in space scott kelly and mikhail kornie...,"[a, year, in, space, scott, kelly, and, mikhai...",12,year space scott kelly mikhail kornienko retur...,"[year, space, scott, kelly, mikhail, kornienko...",8


In [37]:
seq_test2, test2_data_padded = create_pad_sequences(data_out["tokens_with_stopword"].tolist(), word_to_index)

TypeError: create_pad_sequences() missing 1 required positional argument: 'max_sequence_length'

In [None]:
predictions_test2 = model.predict(test2_data_padded, batch_size=1024, verbose=0)

labels = [1, 0]
prediction_labels=[]
for p in predictions_test2:
    prediction_labels.append(labels[np.argmax(p)])
    
sum(data_out.label==prediction_labels)/len(prediction_labels)

# Chinse part

### Chinese embedding layer

In [11]:
tencent = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"
embedding_ckpt_ch = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/Tencent_AILab_ChineseEmbedding/tencent/var.ckpt"
embedding_json_ch = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/Tencent_AILab_ChineseEmbedding/tencent/var.json"

checkpoint_path_ch = "/Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/ch_nlp/checkpoint/"
model_save_path_ch = "/Users/rosalina_chen/Desktop/humor_recognition_ver3/model_created/ch_nlp/model_saved"
model_name_ch = "chn_joke.h5"

embedding_dim_ch = 200
max_sequence_len_ch = 200

### create model

In [1]:
ch_model = create_model(tencent, embedding_dim_ch, embedding_json_ch, embedding_ckpt_ch, max_sequence_len_ch, 2)

NameError: name 'create_model' is not defined

### Load data

In [11]:
df_ch_dataset = pd.read_pickle("./data_created/chinese_dataset.pickle")
df_ch_dataset = df_ch_dataset.sample(frac=1)
df_ch_dataset

Unnamed: 0,sentences,label,no_punc,tokens
67125,博物馆奇妙夜2 1.77 1.8% ...,0,博物馆奇妙夜2 177 18 ...,"[博物, 博物馆, 奇妙, 夜, 2, , , , , , , , , , , ,..."
43595,8:女性的花季年龄上升为45岁。,1,8女性的花季年龄上升为45岁,"[8, 女性, 的, 花季, 年龄, 上升, 升为, 45, 岁]"
34629,老李：群狼可怕还是独狼可怕？ 老张：独狼吧，胃口小，吃到一半人没死，狼饱了。。。,1,老李群狼可怕还是独狼可怕 老张独狼吧胃口小吃到一半人没死狼饱了,"[老李, 群, 狼, 可怕, 还是, 独, 狼, 可怕, , , , 老张, 独, 狼, ..."
61016,曼联今年夏天以3075万英镑的高价引进了贝尔巴托夫；然后很可能会在1月份，以2200万英镑的...,0,曼联今年夏天以3075万英镑的高价引进了贝尔巴托夫然后很可能会在1月份以2200万英镑的高价...,"[曼联, 今年, 今年夏天, 夏天, 以, 3075, 万英镑, 英镑, 的, 高价, 引进..."
16589,"希望你还保留一些厚颜,领着我看了一场又一场电影,就是不让我回家。",1,希望你还保留一些厚颜领着我看了一场又一场电影就是不让我回家,"[希望, 你, 还, 保留, 一些, 厚颜, 领, 着, 我, 看, 了, 一场, 又, 一..."
...,...,...,...,...
29244,昨晚在火车上吃泡面“老坛酸菜面”吃得正香！后座的一个7、8岁的小男孩伸头过来，之后说了一句让...,1,昨晚在火车上吃泡面老坛酸菜面吃得正香后座的一个78岁的小男孩伸头过来之后说了一句让我吐血的话...,"[昨晚, 在, 火车, 车上, 吃, 泡面, 老, 坛, 酸菜, 面, 吃, 得, 正, 香..."
31041,大师兄~！二师兄被妖怪抓走了！,1,大师兄~二师兄被妖怪抓走了,"[大师, 师兄, ~, 二, 师兄, 被, 妖怪, 抓走, 了]"
108019,然而，在目前极端不稳定的市场状况下，任何单个事件都可能导致金融市场的波动。比如5月22日，西...,0,然而在目前极端不稳定的市场状况下任何单个事件都可能导致金融市场的波动比如5月22日西班牙央行...,"[然而, 在, 目前, 极端, 不稳, 稳定, 的, 市场, 状况, 下任, 任何, 单个,..."
64640,比赛结束之后，天津队后防核心李玮锋也接受了采访，李玮锋首先表示，今天能够取得这场比赛的胜利还...,0,比赛结束之后天津队后防核心李玮锋也接受了采访李玮锋首先表示今天能够取得这场比赛的胜利还是相当...,"[比赛, 结束, 之后, 后天, 天津, 天津队, 后防, 核心, 李玮锋, 也, 接受, ..."


### training and testing dataset

In [12]:
train_dataset, test_dataset = train_test_split(df_ch_dataset, test_size=0.2)

In [13]:
word_to_index_ch = load_word_to_index(embedding_json_ch)
#index_to_embedding_ch = load_embedding_tf(embedding_ckpt_ch).numpy()

word_to_index dict restored from 'C:\Users\zhany\Documents\learning\humor_recognition\data\emb_pretrained\chn\checpoint\tencent\var.json'.
TF embeddings restored from 'C:\Users\zhany\Documents\learning\humor_recognition\data\emb_pretrained\chn\checpoint\tencent\var.ckpt'.


##### training dataset

In [16]:
seq_train_ch, train_data_padded_ch = create_pad_sequences(train_dataset["tokens"].tolist(), word_to_index_ch, max_sequence_len_ch)
train_data_padded_ch.shape

(89291, 200)

##### test dataset

In [17]:
seq_test_ch, test_data_padded_ch = create_pad_sequences(test_dataset["tokens"].tolist(), word_to_index_ch, max_sequence_len_ch)
test_data_padded_ch.shape

(22323, 200)

#### label of dataset

In [19]:
label_array_ch = [[1, 0]if v == 1 else [0, 1] for v in train_dataset['label'].values]
label_array_ch = np.array(label_array_ch)
label_array_ch.shape

(89291, 2)

## Train model

In [20]:
num_epoches = 10
batch_size = 512

In [22]:
if not os.path.exists(checkpoint_path_ch):
    os.makedirs(checkpoint_path_ch)
    
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path_ch, save_weights_only=True, verbose=1)
hist = ch_model.fit(train_data_padded_ch, label_array_ch, epochs=num_epoches, validation_split=0.1, shuffle=True, batch_size=batch_size, callbacks=[cp_callback])

Train on 80361 samples, validate on 8930 samples
Epoch 1/10
Epoch 00001: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 2/10
Epoch 00002: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 3/10
Epoch 00003: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 4/10
Epoch 00004: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 5/10
Epoch 00005: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 6/10
Epoch 00006: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 7/10
Epoch 00007: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 8/10
Epoch 00008: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 9/10
Epoch 00009: saving model to ./model_created/ch_nlp/checkpoint/
Epoch 10/10
Epoch 00010: saving model to ./model_created/ch_nlp/checkpoint/


In [26]:
if not os.path.exists(model_save_path_ch):
    os.makedirs(model_save_path_ch)
    
ch_model.save(os.path.join(model_save_path_ch, model_name_ch))

## Test model

In [31]:
predictions = ch_model.predict(test_data_padded_ch, batch_size=512, verbose=0)

NameError: name 'ch_model' is not defined

In [28]:
labels = [1, 0]
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])
    
sum(test_dataset.label==prediction_labels)/len(prediction_labels)

0.9657304125789544

# predict the eastern or/and western joke (DA QUI POSSO LANCIARE)

### Google translator

In [9]:
def baidu_translate_eng_cn(sentence):
    url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'

    # non mostrare appid e key agli altri
    appid = '20200329000408142'
    key = 'riUpq41_ifBFCFB5c6NF'

    salt = '12345654321234'   #  random number
    
    sign = hashlib.md5((appid + sentence + salt + key).encode('UTF-8')).hexdigest()
    
    en_form_data = {
    'q': sentence,         
    'from': 'auto',    
    'to': "en",   
    'appid': appid,
    'salt': salt,
    'sign': sign
    }
    en_request = requests.get(url, params=en_form_data)
    en_sentence = en_request.json()['trans_result'][0]['dst']
    
    time.sleep(1)  # pausa 1 sec, standard baidu api limitation
    
    cn_form_data = {
        'q': sentence,
        'from': 'auto',
        'to': 'zh',
        'appid': appid,
        'salt': salt,
        'sign': sign
    }
    cn_request = requests.get(url, params=cn_form_data)
    cn_sentence = cn_request.json()['trans_result'][0]['dst']
    
    return en_sentence, cn_sentence

In [10]:
baidu_translate_eng_cn("试试百度翻译")

('Try Baidu translation', '试试百度翻译')

In [11]:
def google_translate_eng_cn(sentence):
    """
    return the englesh and chinese sentence by using google translate
    """
    translator = Translator()
    eng_sentence = translator.translate(sentence, dest="en").text
    cn_sentence = translator.translate(sentence, dest="zh-cn").text
    return eng_sentence, cn_sentence

In [12]:
google_translate_eng_cn("try google translator")

('try google translator', '尝试谷歌翻译')

In [13]:
def tokenizing_word_eng(sentence):
    """
    tokenizing the input sentence, include also remove punctuation, lower casing
    """
    sentence = sentence.lower().strip()
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    return [word_tokenize(sentence)]

In [21]:
def prediction(sentence, model, word_to_index, max_seq_length):
    _, sentence_padded = create_pad_sequences(tokenizing_word_eng(sentence), word_to_index, max_seq_length)
    result = model.predict(sentence_padded)
    
    label = [1, 0]
    prediction = label[np.argmax(result)]
    
    return prediction

In [28]:
def main(translator="google"):
    
    en_word_json_path = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/root/model/dizionari/en_var.json"
    cn_word_json_path = r"/Users/rosalina_chen/Desktop/humor_recognition_ver3/root/model/dizionari/cn_var.json"
    en_model_path = "../model/modelli_creati/eng_joke.h5"
    cn_model_path = "../model/modelli_creati/chn_joke.h5"

    en_seq_len = 200
    cn_seq_len = 200
    
    en_word_to_index = load_word_to_index(en_word_json_path)
   # cn_word_to_index = load_word_to_index(cn_word_json_path)
    
    en_model = tf.keras.models.load_model(en_model_path)
   # cn_model = tf.keras.models.load_model(cn_model_path)
    
    try:
        while True:
            sentence = str(input("Please insert your sentence: "))
            if sentence.lower().strip() == "end":
                break
            if translator.lower().strip() == "google":
                en_sentence, cn_sentence = google_translate_eng_cn(sentence)
            elif translator.lower().strip() == "baidu":
                en_sentence, cn_sentence = baidu_translate_eng_cn(sentence)
            else:
                print("there are not translator requested")
            en_result = prediction(en_sentence, en_model, en_word_to_index, en_seq_len)
           # cn_result = prediction(cn_sentence, cn_model, cn_word_to_index,cn_seq_len)
            
            if en_result == 1:
                print("this is an english joke")
            else:
                print("this is not an english joke")
          #  if cn_result == 1:
           #     print("this is a chinese joke")
           # else:
          #      print("this is not a chinese joke")
    except KeyboardInterrupt:
        pass

In [30]:
main("google")

word_to_index dict restored from '/Users/rosalina_chen/Desktop/humor_recognition_ver3/root/model/dizionari/en_var.json'.
Please insert your sentence: hi how are you
this is an english joke
Please insert your sentence: My mother-in-law fell down a wishing well. I was amazed – I never knew they worked
this is an english joke
Please insert your sentence: i am 23 years old
this is an english joke
Please insert your sentence:  MediaWiki helps you collect and organize knowledge and make it available to people.
this is an english joke
Please insert your sentence: God wants spiritual fruit, not religious nuts.
this is an english joke
Please insert your sentence: That day, when they came back from school, their own son said
this is not an english joke


In [24]:
main("google") #funzione per testare

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/rosalina_chen/Desktop/humor_recognition_ver3/root/model/dizionari/en_var.json'