In [174]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re

from textblob import TextBlob
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.backend import mean, sum
from keras.utils import to_categorical, plot_model
from keras.layers import Dense, Embedding, Reshape, Lambda, BatchNormalization, Dropout
from keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Flatten
from keras import optimizers, initializers

from load_glove_embeddings import load_glove_embeddings

In [None]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [2]:
def preprocess(words, to_lowercase=True, remove_punctuation=True, remove_digits=True, 
               remove_odd_chars=True, remove_stopwords=True, stem=False, spell_check=False,
               lemmatize=False):
    
    if to_lowercase:
        words = [w.lower() for w in words]
    
    if remove_punctuation:
        words = [w for w in words if not (re.match(r'^\W+$', w) != None)]
    
    if remove_digits:
        words = [w for w in words if not w.replace('.','',1).isdigit()]

    if remove_odd_chars:
        words = [re.sub(r'[^a-zA-Z0-9_]','', w) for w in words]
    
    if remove_stopwords:
        sw = set(nltk.corpus.stopwords.words("english"))
        words = [w for w in words if not w in sw]

    if spell_check:
        words = [str(TextBlob(w).correct()) for w in words]
     
    if stem:
        porter = nltk.PorterStemmer()
        words = [porter.stem(w) for w in words]

    if lemmatize:
        wordnet = nltk.WordNetLemmatizer()
        words = [wordnet.lemmatize(w) for w in words]
        
    return words

def lookup_index(docs):
    output_matrix = []
    oov = 0
    tot = 0
    for d in docs:
        indices = []
        for w in d:
            tot += 1
            if w in word2index.keys():
                indices.append(word2index[w])
            else:
                oov += 1
                indices.append(word2index['unk'])
        output_matrix.append(indices)
    return output_matrix, float(oov) / tot

In [3]:
df = pd.read_csv('C:/Users/Eanna/Documents/Masters/NLP/nlp-jcag/data/processed_data/jokes_dataset_CLEAN.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,body,id,score,title
0,0,"Now I have to say ""Leroy can you please paint ...",5tz52q,1,I hate how you cant even say black paint anymore
1,1,Pizza doesn't scream when you put it in the ov...,5tz4dd,0,What's the difference between a Jew in Nazi Ge...
2,2,...and being there really helped me learn abou...,5tz319,0,I recently went to America....
3,4,He got caught trying to sell the two books to ...,5tz1pc,0,You hear about the University book store worke...
4,5,Because the p is silent.,5tz1o1,0,Why is it unknown on how pterodactyls urinate ...


In [None]:
df.score.hist(bins=2000)
plt.xlim(0,1000)

In [4]:
docs = [str(df.title[i]) + ' ' + str(df.body[i]) for i in range(df.shape[0])]

In [6]:
word2index, embedding_matrix = load_glove_embeddings('C:/Users/Eanna/Documents/Masters/NLP/Assignment3/data/glove.6B.50d.txt', embedding_dim=50)

In [7]:
docs_cleaned = [preprocess(words.split()) for words in docs]

In [8]:
docs_encoded, oov = lookup_index(docs_cleaned)
print(oov)

0.04708820023323732


In [9]:
docs_padded = pad_sequences(docs_encoded, maxlen=60, padding='post')

In [196]:
keras.backend.clear_session()
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1], 
                            input_length=60,
                            weights=[embedding_matrix], 
                            trainable=False,
                            mask_zero=False,
                            name='embedding_layer')

model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
#model.add(Lambda(lambda x: mean(x, axis=1)))
model.add(BatchNormalization())
model.add(Dense(512, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [187]:
keras.backend.clear_session()
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1], 
                            input_length=60,
                            weights=[embedding_matrix], 
                            trainable=False,
                            mask_zero=False,
                            name='embedding_layer')

model = Sequential()
model.add(embedding_layer)
model.add(Reshape((60, 50, 1)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [197]:
model.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 60, 50)            20000050  
_________________________________________________________________
flatten_1 (Flatten)          (None, 3000)              0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 3000)              12000     
_________________________________________________________________
dense_1 (Dense)              (None, 512)               1536512   
_________________________________________________________________
dense_2 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
__________

In [60]:
#plot_model(model, 'model.png')

In [137]:
# 4 classes
df['class'] = [0 if df.score[i] == 0 \
               else 1 if (df.score[i] > 0 and df.score[i] <= 10) \
               else 2 if (df.score[i] > 10 and df.score[i] <= 30) \
               else 3 for i in range(df.shape[0])]

print(len(df[df['class']==0]) / len(df))
print(len(df[df['class']==1]) / len(df))
print(len(df[df['class']==2]) / len(df))
print(len(df[df['class']==3]) / len(df))

0.340343250526258
0.39538949952345365
0.11161470368414918
0.15265254626613914


In [181]:
# 3 classes
df['class'] = [0 if df.score[i] <= 3 \
               else 1 if (df.score[i] > 3 and df.score[i] <= 20) \
               else 2 for i in range(df.shape[0])]

print(len(df[df['class']==0]) / len(df))
print(len(df[df['class']==1]) / len(df))
print(len(df[df['class']==2]) / len(df))

0.5716910882399325
0.24087190844824158
0.1874370033118259


In [167]:
# 2 classes
df['class'] = [0 if df.score[i] < 3 \
               else 1 for i in range(df.shape[0])]

print(len(df[df['class']==0]) / len(df))
print(len(df[df['class']==1]) / len(df))

0.5199086676586145
0.48009133234138546


In [198]:
history = model.fit(docs_padded, to_categorical(df['class']), epochs=10, validation_split=0.2, batch_size=128)

Train on 116672 samples, validate on 29169 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 