In [3]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer

import contractions
import unicodedata
import re
import inflect
import pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import matthews_corrcoef, confusion_matrix

import matplotlib.pyplot as plt

import logging
logging.basicConfig(level=logging.INFO)

In [4]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("public_dev.csv")

In [5]:
def denoise_text(text):
    text = contractions.fix(text)
    return text

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize_text(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    #words = replace_numbers(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = lemmatize_verbs(words)
    return words

def tokenize(text):
    return nltk.word_tokenize(text)

def text_prepare(text):
    text = denoise_text(text)
    text = ' '.join([x for x in normalize_text(tokenize(text))])
    return text

In [12]:
df_train['text'] = [text_prepare(x) for x in df_train['text']]
df_test['text'] = [text_prepare(x) for x in df_test['text']]

In [13]:
def prepare_model_input(X_train, X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=500):
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    pickle.dump(tokenizer, open('text_tokenizer.pkl', 'wb'))
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train_Glove = text[0:len(X_train), ]
    X_test_Glove = text[len(X_train):, ]
    embeddings_dict = {}
    f = open("glove.6B.300d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_dict[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_dict))
    return (X_train_Glove, X_test_Glove, word_index, embeddings_dict)

In [14]:
def build_bilstm(word_index, embeddings_dict, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=300, dropout=0.5, hidden_layer = 3, lstm_node = 32):
    # Initialize a sequebtial model
    model = Sequential()
    # Make the embedding matrix using the embedding_dict
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
            
    # Add embedding layer
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    # Add hidden layers 
    for i in range(0,hidden_layer):
        # Add a bidirectional lstm layer
        model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))
        # Add a dropout layer after each lstm layer
        model.add(Dropout(dropout))
    model.add(Bidirectional(LSTM(lstm_node, recurrent_dropout=0.2)))
    model.add(Dropout(dropout))
    # Add the fully connected layer with 256 nurons and relu activation
    model.add(Dense(256, activation='relu'))
    # Add the output layer with softmax activation since we have 2 classes
    model.add(Dense(nclasses, activation='softmax'))
    # Compile the model using sparse_categorical_crossentropy
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model

**is_humor**

In [15]:
X_train = df_train.text
y_train = df_train.is_humor
X_test = df_test.text
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train,X_test)
print("Done!")
print("Building Model!")
model = build_bilstm(word_index, embeddings_dict, 2)
model.summary()

Preparing model input ...
Found 9625 unique tokens.
(9000, 500)
Total 340956 word vectors.
Done!
Building Model!
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 300)          2887800   
_________________________________________________________________
bidirectional (Bidirectional (None, 500, 64)           85248     
_________________________________________________________________
dropout (Dropout)            (None, 500, 64)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 500, 64)           24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 500, 64)           24832     
_________

In [16]:
def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*(precision*recall))/(precision+recall)
    return {
        "mcc": mcc,
        "true positive": tp,
        "true negative": tn,
        "false positive": fp,
        "false negative": fn,
        "pricision" : precision,
        "recall" : recall,
        "F1" : f1,
        "accuracy": (tp+tn)/(tp+tn+fp+fn)
    }
def compute_metrics(labels, preds):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

In [17]:
history = model.fit(X_train_Glove, y_train,
                           epochs=5,
                           batch_size=128,
                           verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
print("\n Evaluating Model ... \n")
predicted = model.predict_classes(X_test_Glove)


 Evaluating Model ... 

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [19]:
predicted

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,

**Humor_Contro**

In [20]:
humor = df_train['is_humor']==1
df_train_new = df_train[humor]
df_train_new.head(10)

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,ten best stat nobody ev com clos elev walk roo...,1,2.42,1.0,0.2
1,2,man insert advert class wif want next day rece...,1,2.5,1.0,1.1
2,3,many men tak op beer non op tim bring couch,1,1.95,0.0,2.4
3,4,tell mom hit 1200 twit follow point broth own ...,1,2.11,1.0,0.0
4,5,ros dead lov fak wed bas fun cak,1,2.78,0.0,0.1
7,8,origin tru on kind also hold glass whit win lo...,1,1.79,1.0,0.0
11,12,diff mormon man muslim man mormon man get 72 v...,1,2.2,0.0,2.95
12,13,stop cal 911 run toilet pap ye run toilet pap ...,1,1.5,1.0,0.0
13,14,march streets shout peopl civil disobedy drink...,1,2.16,1.0,0.2
17,18,mak send creepy ad con two con adult rid tande...,1,1.78,1.0,0.2


In [21]:
df_train = df_train_new.reset_index(drop=True)
df_train.humor_controversy = df_train.humor_controversy.astype('int64')

In [23]:
df_train.dtypes

id                     int64
text                  object
is_humor               int64
humor_rating         float64
humor_controversy      int64
offense_rating       float64
dtype: object

In [24]:
X_train = df_train.text
y_train = df_train.humor_controversy
X_test = df_test.text
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train,X_test)
print("Done!")
print("Building Model!")
model = build_bilstm(word_index, embeddings_dict, 2)
model.summary()

Preparing model input ...
Found 7193 unique tokens.
(5932, 500)
Total 400000 word vectors.
Done!
Building Model!
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 300)          2158200   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 500, 64)           85248     
_________________________________________________________________
dropout_4 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 500, 64)           24832     
_________________________________________________________________
dropout_5 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 500, 64)           24832     
_______

In [25]:
history = model.fit(X_train_Glove, y_train,
                           epochs=5,
                           batch_size=128,
                           verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
print("\n Evaluating Model ... \n")
predicted_contro = model.predict_classes(X_test_Glove)


 Evaluating Model ... 



In [27]:
predicted_contro

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,

Humor rating

In [33]:
def build_bilstm(word_index, embeddings_dict, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=300, dropout=0.5, hidden_layer = 3, lstm_node = 32):
    # Initialize a sequebtial model
    model = Sequential()
    # Make the embedding matrix using the embedding_dict
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
            
    # Add embedding layer
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    # Add hidden layers 
    # for i in range(0,hidden_layer):
    #     # Add a bidirectional lstm layer
    #     model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))
    #     # Add a dropout layer after each lstm layer
    #     model.add(Dropout(dropout))
    model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))
    model.add(Dropout(dropout))
    model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))
    model.add(Dropout(dropout))
    model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.2)))
    model.add(Dropout(dropout))
    model.add(Bidirectional(LSTM(lstm_node)))
    model.add(Dropout(dropout))
    # Add the fully connected layer with 256 nurons and relu activation
    model.add(Dense(256))
    # Add the output layer with softmax activation since we have 2 classes
    model.add(Dense(1))
    # Compile the model using sparse_categorical_crossentropy
    model.compile(loss='mse',
                      optimizer='adam')
    return model

In [34]:
X_train = df_train.text
y_train = df_train.is_humor
X_test = df_test.text
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train,X_test)
print("Done!")
print("Building Model!")
model = build_bilstm(word_index, embeddings_dict, 2)
model.summary()

Preparing model input ...
Found 7193 unique tokens.
(5932, 500)
Total 400000 word vectors.
Done!
Building Model!
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 300)          2158200   
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 500, 64)           85248     
_________________________________________________________________
dropout_10 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 500, 64)           24832     
_________________________________________________________________
dropout_11 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 500, 64)           24832     
_______

In [35]:
history = model.fit(X_train_Glove, y_train,
                           epochs=5,
                           batch_size=128,
                           verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
print("\n Evaluating Model ... \n")
predicted_humor_rating = model.predict(X_test_Glove)


 Evaluating Model ... 



In [37]:
predicted_humor_rating

array([[1.0267284 ],
       [1.0285598 ],
       [1.0281565 ],
       [1.0271419 ],
       [1.0281172 ],
       [1.0278009 ],
       [1.0239993 ],
       [1.0274472 ],
       [1.0187553 ],
       [1.0255113 ],
       [1.029445  ],
       [0.95011586],
       [1.0276228 ],
       [1.0010127 ],
       [1.0255418 ],
       [1.0312151 ],
       [1.0271887 ],
       [1.0298283 ],
       [1.0326444 ],
       [1.0287262 ],
       [1.0263591 ],
       [1.0302538 ],
       [1.0322424 ],
       [1.0306795 ],
       [1.0319101 ],
       [1.0275098 ],
       [1.0270218 ],
       [1.0322769 ],
       [1.031878  ],
       [1.0276121 ],
       [1.0322683 ],
       [1.0297045 ],
       [1.0322543 ],
       [1.0293673 ],
       [1.0324184 ],
       [1.0259067 ],
       [1.0287927 ],
       [1.0318953 ],
       [1.0231292 ],
       [1.0222936 ],
       [1.013496  ],
       [1.0252734 ],
       [1.0298206 ],
       [1.0305101 ],
       [1.0282704 ],
       [1.0299212 ],
       [1.0318307 ],
       [1.027

In [38]:
predicted_humor_rating = predicted_humor_rating.flatten()

In [39]:
predicted_humor_rating

array([1.0267284 , 1.0285598 , 1.0281565 , 1.0271419 , 1.0281172 ,
       1.0278009 , 1.0239993 , 1.0274472 , 1.0187553 , 1.0255113 ,
       1.029445  , 0.95011586, 1.0276228 , 1.0010127 , 1.0255418 ,
       1.0312151 , 1.0271887 , 1.0298283 , 1.0326444 , 1.0287262 ,
       1.0263591 , 1.0302538 , 1.0322424 , 1.0306795 , 1.0319101 ,
       1.0275098 , 1.0270218 , 1.0322769 , 1.031878  , 1.0276121 ,
       1.0322683 , 1.0297045 , 1.0322543 , 1.0293673 , 1.0324184 ,
       1.0259067 , 1.0287927 , 1.0318953 , 1.0231292 , 1.0222936 ,
       1.013496  , 1.0252734 , 1.0298206 , 1.0305101 , 1.0282704 ,
       1.0299212 , 1.0318307 , 1.027678  , 1.0223204 , 1.0262834 ,
       1.0315074 , 0.99708533, 1.0252414 , 1.0199237 , 1.027121  ,
       1.0296991 , 1.0317042 , 1.0321709 , 1.0331868 , 1.0296189 ,
       1.0272182 , 1.0249397 , 1.026282  , 1.0267897 , 1.027406  ,
       1.0303142 , 1.0116384 , 1.0240238 , 1.0313774 , 1.0291632 ,
       1.0307662 , 1.0309094 , 1.0242192 , 1.0318857 , 1.02892

offense_rating

In [40]:
X_train = df_train.text
y_train = df_train.offense_rating
X_test = df_test.text
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train,X_test)
print("Done!")
print("Building Model!")
model = build_bilstm(word_index, embeddings_dict, 2)
model.summary()

Preparing model input ...
Found 7193 unique tokens.
(5932, 500)
Total 400000 word vectors.
Done!
Building Model!
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 300)          2158200   
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 500, 64)           85248     
_________________________________________________________________
dropout_14 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 500, 64)           24832     
_________________________________________________________________
dropout_15 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 500, 64)           24832     
_______

In [41]:
history = model.fit(X_train_Glove, y_train,
                           epochs=5,
                           batch_size=128,
                           verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
print("\n Evaluating Model ... \n")
predicted_offense_rating = model.predict(X_test_Glove)


 Evaluating Model ... 



In [43]:
predicted_offense_rating

array([[3.0271165 ],
       [1.896576  ],
       [2.1201699 ],
       [3.2450128 ],
       [1.7613202 ],
       [1.7344034 ],
       [2.6720512 ],
       [1.1171514 ],
       [0.6002183 ],
       [0.5154519 ],
       [0.33436006],
       [0.12961584],
       [1.124236  ],
       [0.42192784],
       [0.7212789 ],
       [0.2723115 ],
       [0.37618116],
       [2.1258643 ],
       [0.319362  ],
       [0.15746091],
       [0.37909022],
       [2.5595894 ],
       [0.5490056 ],
       [0.22213979],
       [0.6257827 ],
       [0.23107436],
       [0.19846725],
       [0.47175846],
       [0.77722335],
       [0.64708066],
       [0.19188762],
       [1.4412674 ],
       [0.22845186],
       [0.6946441 ],
       [0.35655954],
       [0.23574781],
       [2.3164427 ],
       [2.7892802 ],
       [0.14357847],
       [0.15241799],
       [1.6423002 ],
       [0.3616983 ],
       [0.43435806],
       [0.98469675],
       [0.89692354],
       [0.15364383],
       [0.41191095],
       [0.324

In [44]:
predicted_offense_rating = predicted_offense_rating.flatten()

In [45]:
predicted_offense_rating

array([3.0271165 , 1.896576  , 2.1201699 , 3.2450128 , 1.7613202 ,
       1.7344034 , 2.6720512 , 1.1171514 , 0.6002183 , 0.5154519 ,
       0.33436006, 0.12961584, 1.124236  , 0.42192784, 0.7212789 ,
       0.2723115 , 0.37618116, 2.1258643 , 0.319362  , 0.15746091,
       0.37909022, 2.5595894 , 0.5490056 , 0.22213979, 0.6257827 ,
       0.23107436, 0.19846725, 0.47175846, 0.77722335, 0.64708066,
       0.19188762, 1.4412674 , 0.22845186, 0.6946441 , 0.35655954,
       0.23574781, 2.3164427 , 2.7892802 , 0.14357847, 0.15241799,
       1.6423002 , 0.3616983 , 0.43435806, 0.98469675, 0.89692354,
       0.15364383, 0.41191095, 0.32432058, 0.1038937 , 0.8024137 ,
       0.17779343, 0.6845093 , 0.17933166, 0.3659422 , 0.82341516,
       1.0088638 , 0.33626768, 0.49635512, 1.4815007 , 0.31727186,
       0.20426294, 0.7184721 , 2.4581704 , 0.84858996, 0.19902207,
       0.2223233 , 0.83179986, 0.1372609 , 0.44449526, 0.20384356,
       0.67849123, 0.4172039 , 1.7779578 , 0.38052392, 0.64846

In [46]:
submit=pd.DataFrame({'id':df_test['id'].values.tolist(),'is_humor':predicted, 
                     'humor_rating':predicted_humor_rating, 'humor_controversy':predicted_contro,
                     'offense_rating':predicted_offense_rating})

In [47]:
submit

Unnamed: 0,id,is_humor,humor_rating,humor_controversy,offense_rating
0,8001,1,1.026728,1,3.027117
1,8002,1,1.028560,0,1.896576
2,8003,1,1.028157,0,2.120170
3,8004,1,1.027142,0,3.245013
4,8005,1,1.028117,0,1.761320
...,...,...,...,...,...
995,8996,1,1.028061,0,0.411660
996,8997,1,1.030412,0,0.442185
997,8998,1,1.029823,0,3.048477
998,8999,1,1.020417,0,0.143420


In [48]:
submit.to_csv('submission_300_all.csv', index=False)