# Applying Convolutions to Text
## What is a CNN?
CNNs try to capture the spatial relationships in data. This makes them ideally suited to capturing patterns in images, where pixels in the same vicinity are related to one another and together contribute to making sense of an object.

## Detecting sarcasm in text using CNNs


### Loading the libraries and the dataset

#### 1) Importing the various libraries that we will use

In [1]:
import pandas as pd
import numpy as np
import re
import json
import gensim
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import KeyedVectors
import keras 
from keras.models import Sequential, Model 
from keras import layers
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D
import h5py

#### 2) Reading the data

In [2]:
def parse_data(file):
    """Lazily yield one parsed JSON object per line of a JSON Lines file.

    Parameters
    ----------
    file : str or path-like
        Path to a text file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a line is not valid JSON.
    """
    # The context manager closes the handle once the generator is exhausted or
    # discarded; the previous bare open() never closed it explicitly.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            yield json.loads(line)

In [3]:
# Materialise the parse_data generator into a list (one parsed JSON object per
# line of the file), then build a DataFrame from those records.
data = list(parse_data('Sarcasm_Headlines_Dataset_v2.json'))
df = pd.DataFrame(data)

### Performing basic data analysis and preprocessing our data
#### Basic Data Understanding

In [4]:
df.head(5)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


#### Eliminating article_link column from the data

In [5]:
# DataFrame.pop removes the column from df in place and returns it, which is
# why the removed Series is echoed as this cell's output. The column is gone
# afterwards, so the cell is not idempotent: re-running it raises KeyError.
df.pop('article_link')

0        https://www.theonion.com/thirtysomething-scien...
1        https://www.huffingtonpost.com/entry/donna-edw...
2        https://www.huffingtonpost.com/entry/eat-your-...
3        https://local.theonion.com/inclement-weather-p...
4        https://www.theonion.com/mother-comes-pretty-c...
                               ...                        
28614    https://www.theonion.com/jews-to-celebrate-ros...
28615    https://local.theonion.com/internal-affairs-in...
28616    https://www.huffingtonpost.com/entry/andrew-ah...
28617    https://www.theonion.com/mars-probe-destroyed-...
28618    https://www.theonion.com/dad-clarifies-this-no...
Name: article_link, Length: 28619, dtype: object

In [6]:
#Checking the size of the data
len(df)

28619

#### Data preprocessing using the same pipeline that we've used

In [7]:
def text_clean(corpus):
    """Lower-case every document and blank out non-alphanumeric characters.

    Each document is split on whitespace; in every word, each character
    outside ``[a-zA-Z0-9]`` is replaced by a single space, the word is
    lower-cased, and the words are re-joined with single spaces.

    Parameters
    ----------
    corpus : iterable of str
        The raw documents.

    Returns
    -------
    pandas.Series
        One cleaned string per input document, in input order, with
        ``dtype=object``. The index is the default 0..n-1 range (the earlier
        append-based version labelled every row 0).
    """
    # Collect the rows in a list and build the Series once. Series.append was
    # removed in pandas 2.0 and copied the whole Series on every iteration.
    cleaned_rows = []
    for row in corpus:
        words = [
            re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=word).lower()
            for word in row.split()
        ]
        cleaned_rows.append(' '.join(words))
    # An explicit dtype keeps the empty-corpus case free of dtype warnings.
    return pd.Series(cleaned_rows, dtype=object)

def stopwords_removal(corpus):
    """Tokenise each document on whitespace and drop English stop words.

    Returns a list containing, for every document in ``corpus``, the list of
    tokens that are not in NLTK's English stop-word list.
    """
    english_stop_words = set(stopwords.words('english'))
    filtered_documents = []
    for document in corpus:
        kept_tokens = [token for token in document.split()
                       if token not in english_stop_words]
        filtered_documents.append(kept_tokens)
    return filtered_documents

def lemmatize(corpus):
    """Lemmatize every token of every tokenised document, treating it as a verb.

    ``corpus`` is an iterable of token lists; the result is a list of token
    lists of the same shape, with each token passed through
    ``WordNetLemmatizer.lemmatize(token, pos='v')``.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_documents = []
    for tokens in corpus:
        lemmatized_documents.append(
            [lemmatizer.lemmatize(token, pos='v') for token in tokens]
        )
    return lemmatized_documents

def stem(corpus, stem_type = None):
    """Stem every token of every tokenised document.

    ``stem_type == 'snowball'`` selects the English Snowball stemmer; any other
    value (including the default ``None``) selects the Porter stemmer.
    Returns a list of token lists with the same shape as ``corpus``.
    """
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
    else:
        stemmer = PorterStemmer()
    return [[stemmer.stem(token) for token in tokens] for tokens in corpus]

def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    """Run the text-preprocessing pipeline over ``corpus``.

    The stages run in a fixed order: cleaning, tokenisation (with or without
    stop-word removal), lemmatization, stemming. The tokens of each document
    are finally re-joined with single spaces.

    Parameters
    ----------
    corpus : iterable of str
        The raw documents.
    cleaning : bool
        If truthy, pass the corpus through ``text_clean`` first.
    stemming : bool
        If truthy, stem every token with ``stem``.
    stem_type : str or None
        Forwarded to ``stem`` to choose the stemmer.
    lemmatization : bool
        If truthy, lemmatize every token with ``lemmatize``.
    remove_stopwords : bool
        If truthy, tokenise via ``stopwords_removal``; otherwise tokenise
        with a plain whitespace split.

    Returns
    -------
    list of str
        One space-joined string per input document.
    """
    # Flags are tested by truthiness rather than with ``== True``.
    if cleaning:
        corpus = text_clean(corpus)

    # Both branches leave ``corpus`` as a list of token lists.
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [document.split() for document in corpus]

    if lemmatization:
        corpus = lemmatize(corpus)

    if stemming:
        corpus = stem(corpus, stem_type)

    return [' '.join(tokens) for tokens in corpus]

In [8]:
headlines = preprocess(df['headline'], lemmatization = True, remove_stopwords = True)

  cleaned_corpus = pd.Series()


### Loading the Word2Vec model and vectorizing our data

#### 1) Loading the model

In [9]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

#### 2) Defining model parameters and performing Data Vectorization and Standardization

In [10]:
# Number of vectors kept per headline: vectorize_data looks at no more than
# this many tokens and zero-pads shorter results up to this length.
MAX_LENGTH = 10
# Length of the zero padding vector built in vectorize_data.
# NOTE(review): presumably must equal the dimensionality of the loaded
# embeddings (the file name suggests 300) — confirm.
VECTOR_SIZE = 300

In [11]:
def vectorize_data(data):
    """Turn each text into exactly ``MAX_LENGTH`` word vectors.

    Reads the module-level ``model`` (the loaded Word2Vec ``KeyedVectors``),
    ``MAX_LENGTH`` and ``VECTOR_SIZE``. It must therefore be called while
    ``model`` still refers to the word vectors; a later cell rebinds ``model``
    to the Keras network.

    Parameters
    ----------
    data : iterable of str
        Whitespace-separated, preprocessed texts.

    Returns
    -------
    list of list
        For every text, a list of ``MAX_LENGTH`` vectors. Only the first
        ``MAX_LENGTH`` tokens are considered; tokens missing from the
        vocabulary are dropped (not replaced), and the list is padded at the
        end with zero vectors of length ``VECTOR_SIZE``.
    """
    padding_vector = [0.0] * VECTOR_SIZE
    # key_to_index is a dict, so membership is a hash lookup; the previous test
    # against the index_to_key list scanned the whole vocabulary per token.
    vocabulary = model.key_to_index

    vectors = []
    for data_point in data:
        # Truncate first, then filter: out-of-vocabulary tokens still count
        # towards the MAX_LENGTH budget, exactly as before.
        considered_tokens = data_point.split()[:MAX_LENGTH]
        data_point_vectors = [model[token] for token in considered_tokens
                              if token in vocabulary]

        missing = MAX_LENGTH - len(data_point_vectors)
        data_point_vectors.extend([padding_vector] * missing)

        vectors.append(data_point_vectors)

    return vectors

In [12]:
vectorized_headlines = vectorize_data(headlines)

#### 3) Adding a validation to ensure that the 10 vectors are present for each headline

In [13]:
# Sanity check: print the position of any headline that did not end up with
# exactly MAX_LENGTH vectors. No printed output means every headline passed.
for i, vec in enumerate(vectorized_headlines):
    if len(vec) != MAX_LENGTH:
        print(i)

### Splitting our dataset into train and test sets

#### 1) Splitting data into sets

In [14]:
train_div = math.floor(0.7 * len(vectorized_headlines))
train_div

20033

In [15]:
# Positional split with no shuffling: the first train_div headlines (and the
# labels at the same positions) form the training set, the rest the test set.
X_train = vectorized_headlines[:train_div]
y_train = df['is_sarcastic'][:train_div]
X_test = vectorized_headlines[train_div:]
y_test = df['is_sarcastic'][train_div:]

# Report the four sizes so feature/label counts can be compared by eye.
print('The size of X_train is:', len(X_train), '\nThe size of y_train is:', len(y_train),
      '\nThe size of X_test is:', len(X_test), '\nThe size of y_test is:', len(y_test))

The size of X_train is: 20033 
The size of y_train is: 20033 
The size of X_test is: 8586 
The size of y_test is: 8586


#### 2) Reshaping our data in order to convert it into the form expected by our CNN model

In [16]:
# Turn the nested lists into arrays of shape (n_samples, MAX_LENGTH, VECTOR_SIZE),
# the per-sample shape the Conv1D layer is given as input_shape.
X_train = np.reshape(X_train, (len(X_train), MAX_LENGTH, VECTOR_SIZE))
X_test = np.reshape(X_test, (len(X_test), MAX_LENGTH, VECTOR_SIZE))
# Labels were pandas Series slices; convert them to plain NumPy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)

### Building the model

#### 1) Defining the hyperparameters of the network

In [17]:
# Number of convolution filters (first argument to Conv1D).
FILTERS=8
# Width of each filter, in tokens (second argument to Conv1D).
KERNEL_SIZE=3
# Units in the first and second hidden Dense layers.
HIDDEN_LAYER_1_NODES=10
HIDDEN_LAYER_2_NODES=5
# Rate passed to both Dropout layers.
DROPOUT_PROB=0.35
# Passed to model.fit as epochs and batch_size.
NUM_EPOCHS=10
BATCH_SIZE=50

#### 2) Defining the convolutional layer

In [18]:
# NOTE: this rebinds `model`, which until now referred to the Word2Vec
# KeyedVectors read by vectorize_data. After this cell runs, vectorize_data
# cannot be called again without reloading the word vectors.
model = Sequential()

# A single 1-D convolution slides over the MAX_LENGTH token positions of each
# headline; padding='same' with strides=1 keeps the sequence length unchanged.
model.add(Conv1D(FILTERS,
                 KERNEL_SIZE,
                 padding='same',
                 strides=1,
                 activation='relu', 
                 input_shape = (MAX_LENGTH, VECTOR_SIZE)))

In [19]:
# Defining the pooling layer
model.add(GlobalMaxPooling1D())

#### 3) Defining the feedforward neural network

In [20]:
# Two hidden ReLU layers, each followed by dropout, then a single sigmoid unit
# that outputs a value in (0, 1) for the binary is_sarcastic label.
model.add(Dense(HIDDEN_LAYER_1_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(HIDDEN_LAYER_2_NODES, activation='relu'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(1, activation='sigmoid'))

#### 4) Looking at the summary of our model

In [21]:
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the summary.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 10, 8)             7208      
                                                                 
 global_max_pooling1d (Globa  (None, 8)                0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                90        
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 5)                 55        
                                                                 
 dropout_1 (Dropout)         (None, 5)                 0         
                                                        

#### 5) Building our model using 'compile'

In [22]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#### 6) Training our model

In [23]:
training_history = model.fit(X_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluating and saving our model

#### 1) Evaluating our model on the test data

In [24]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.7555


#### 2) Saving our model

In [25]:
# Persist the architecture (as JSON) and the weights (as HDF5) separately.
# NOTE(review): assumes the "Output Files" directory already exists — open()
# does not create missing directories.
model_structure = model.to_json()
with open("Output Files/sarcasm_detection_model_cnn.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("Output Files/sarcasm_detection_model_cnn.h5")

# Chapter 10: Capturing Temporal Relationships in Text

## Building a text generator using LSTMs
#### 1) Importing the necessary libraries

In [26]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import re
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding

#### 2) Loading the dataset

In [27]:
data = pd.read_csv('hotel_data.csv')

handle: hotel_data.csv
hotel_data.csv


#### 3) Checking out the data

In [28]:
data.head(5)

Unnamed: 0,area,city,country,crawl_date,highlight_value,hotel_overview,hotel_star_rating,image_urls,in_your_room,is_value_plus,...,property_type,qts,query_time_stamp,room_types,site_review_count,site_review_rating,sitename,state,traveller_rating,uniq_id
0,Hardasji Ki Magri,Udaipur,India,2016-06-21,{{facility}},|Zion Home Stay is located in a city that sets...,1 star,,{{value}},no,...,Hotel,,2016-06-21 04:06:50 +0000,,/5,4.5,makemytrip,,,78ddf880bd7937d384ff278cc5b39d6e
1,Near Nai Gaon,Udaipur,India,2016-06-21,{{facility}},| Araliayas Resorts is a 3 star hotel located ...,3 star,,{{value}},no,...,Hotel,,2016-06-21 04:06:50 +0000,,/5,4.5,makemytrip,,,9f9f9cbb2f7df8089b63d5cdeb257944
2,Near Bagore Ki Haveli,Udaipur,India,2016-06-21,{{facility}},|A 2 star property is located at 24 km from Ma...,2 star,,{{value}},no,...,Hotel,,2016-06-21 04:06:50 +0000,,/5,,makemytrip,,,b314bb7fa8bfb1ed306f517be21d729e
3,Dabok,Udaipur,India,2016-06-21,Airport Transfer|Car rental|Conference Hall|Cu...,|SNP House Airport Hotel And Restaurant is loa...,1 star,//imghtlak.mmtcdn.com/images/hotels/2014071815...,,no,...,Hotel,,2016-06-21 04:06:50 +0000,`standard,/5,,makemytrip,,,e6f5bb3c2d76a78d978b9ceb0e31ec56
4,East Udaipur,Udaipur,India,2016-06-21,{{facility}},| Hotel Pichola Haveli is situated in the beau...,2 star,,{{value}},no,...,Hotel,,2016-06-21 04:06:50 +0000,,/5,3.7,makemytrip,,,63072c301427b6ca450d31eea127bcf0


#### 4) Checking how many hotels per city are available

In [29]:
data.city.value_counts()

NewDelhiAndNCR    1163
Goa               1122
Mumbai             543
Jaipur             534
Bangalore          512
                  ... 
Kollur               1
Madla                1
Jeypore              1
Jispa                1
Haldia               1
Name: city, Length: 770, dtype: int64

#### 5) Focusing on data for Mumbai

In [30]:
# Cities to keep.
array = ['Mumbai']
# Keep only the rows whose city is in the list. This rebinds `data`, so the
# unfiltered frame is no longer reachable without re-reading the CSV.
data = data.loc[data['city'].isin(array)]

#### 6) Checking filtered data

In [31]:
data.head(5)

Unnamed: 0,area,city,country,crawl_date,highlight_value,hotel_overview,hotel_star_rating,image_urls,in_your_room,is_value_plus,...,property_type,qts,query_time_stamp,room_types,site_review_count,site_review_rating,sitename,state,traveller_rating,uniq_id
294,Charai,Mumbai,India,2016-08-28,Doctor on Call|Front desk|Laundry Service|Park...,"Nestled in Mumbai, a city with strong historic...",3,,Bathroom Toiletries|Attached Bathroom|Hot & Co...,no,...,Hotel,2016-08-28 16:13:39 +0000,2016-08-28 16:13:39 +0000,Ac Superior Room|A/c Standard Rooms Double Occ...,,,makemytrip,Maharashtra,,d78fae90ef2e1b5c2dfd547c61763a25
309,Andheri (East),Mumbai,India,2016-08-28,Air Conditioned|Airport Transfer|Conference Ha...,3 km from Chhatrapati Shivaji International Ai...,2,,Bathroom Toiletries|Daily Newspaper|Kitchenett...,no,...,Hotel,2016-08-28 16:13:39 +0000,2016-08-28 16:13:39 +0000,Executive|Deluxe,,,makemytrip,Maharasta,Location:3.2/5 | Hospitality:3.1/5 | Facilitie...,030865f741982beb373efddecdc6d6c3
321,Khar,Mumbai,India,2016-08-28,Airport/Rlwy Stn Transfer|Bar|Conference Hall|...,Location Hotel Royal Garden is situated on Juh...,3,,Electronic Safe|Bathroom Toiletries|Daily News...,no,...,Hotel,2016-08-28 16:13:39 +0000,2016-08-28 16:13:39 +0000,Superior Executive,,,makemytrip,Maharashtra,Location:4.5/5 | Hospitality:3.4/5 | Facilitie...,a1ced509350038775a7700ec67796bc2
334,Andheri (East),Mumbai,India,2016-08-28,24 Hour Check in-Icon|24 hour reception|24 hou...,City Guest House is a beautiful property locat...,2,,Bathroom Toiletries|Hot/cold Water|Attached Ba...,no,...,Hotel,2016-08-28 16:13:39 +0000,2016-08-28 16:13:39 +0000,Standard Room|Deluxe Room|Triple Deluxe Room|S...,,,makemytrip,Maharashtra,Location:4.3/5 | Hospitality:3.8/5 | Facilitie...,f2820ae7707668ed6906bb227921f720
1238,Andheri (East),Mumbai,,2016-08-22,24 Hour Check in-Icon|24 hour reception|24 hou...,Sai Residency Hotel is situated in the City of...,2,,Bathroom Toiletries|Attached Bathroom|Hot & Co...,no,...,Hotel,2016-08-22 22:10:53 +0000,2016-08-22 22:10:53 +0000,Deluxe Dbl Air Cooled,,,makemytrip,MAHARASHTRA,,b4af24952027ffbcd85a91bb6fe23f5d


#### 7) Keeping only hotel_overview column

In [32]:
# From here on `data` is the hotel_overview column (a Series), not the frame.
data = data.hotel_overview
# Drop hotels with no overview text.
data = data.dropna()

#### Data preprocessing

#### 8) Preprocessing our data

In [33]:
stop = set(stopwords.words('english'))
def stopwords_removal(data_point):
    """Split ``data_point`` on whitespace and return the tokens not in ``stop``.

    ``stop`` is the module-level set of NLTK English stop words built above.
    """
    return [token for token in data_point.split() if token not in stop]

#### 9) Performing data cleansing

In [34]:
def clean_data(data):
    """Clean and tokenise every text in ``data``.

    For each entry: every non-letter character becomes a space, words of at
    most one character are blanked out, the text is lower-cased, and it is
    tokenised by ``stopwords_removal``.

    Returns
    -------
    tuple
        ``(token_lists, vocabulary_entries)``. ``token_lists`` holds one token
        list per entry. ``vocabulary_entries`` is the concatenation of each
        entry's distinct tokens, so a word appears once for every entry that
        contains it.
    """
    token_lists = []
    vocabulary_entries = []
    for description in data:
        letters_only = re.sub('[^a-zA-Z]', ' ', description)
        # Also matches empty strings at word boundaries; the extra spaces this
        # inserts are harmless because the text is split on whitespace next.
        without_short_words = re.sub(r'\b\w{0,1}\b', ' ', letters_only)
        tokens = stopwords_removal(without_short_words.lower())
        token_lists.append(tokens)
        vocabulary_entries.extend(list(set(tokens)))
    return token_lists, vocabulary_entries

#### 10) Extracting unique words in our data 

In [35]:
def unique_words(data):
    """Return the distinct items of ``data`` and how many there are.

    Parameters
    ----------
    data : iterable of hashable
        Words, possibly with repeats.

    Returns
    -------
    tuple
        ``(vocabulary, size)``, where ``vocabulary`` is a ``set`` of the
        distinct words and ``size`` is ``len(vocabulary)``.
    """
    # Use the argument. The previous body ignored ``data`` and read the global
    # ``all_unique_words_in_each_description``, so it only worked when that
    # global happened to exist.
    vocabulary = set(data)
    return vocabulary, len(vocabulary)

#### 11) Applying data cleansing and finding the unique words

In [36]:
# clean_data returns the token list of every description plus the combined
# list of each description's distinct words.
cleaned_data, all_unique_words_in_each_description = clean_data(data)
# NOTE: this rebinds the name `unique_words` from the function to the set it
# returns, so this cell cannot be re-run without first re-running the cell that
# defines unique_words().
unique_words, length_of_unique_words = unique_words(all_unique_words_in_each_description)

#### 12) Checking out the Cleaned data

In [37]:
cleaned_data[0]

['nestled',
 'mumbai',
 'city',
 'strong',
 'historical',
 'links',
 'wonderful',
 'british',
 'architecture',
 'museums',
 'beaches',
 'places',
 'worship',
 'true',
 'galaxy',
 'stars',
 'bollywood',
 'reigns',
 'supreme',
 'hotel',
 'divya',
 'international',
 'delightful',
 'leisure',
 'absolute',
 'blend',
 'service',
 'charm',
 'efficiency',
 'hotel',
 'offers',
 'facilities',
 'like',
 'front',
 'desk',
 'parking',
 'laundry',
 'doctor',
 'call',
 'many',
 'aims',
 'extend',
 'best',
 'possible',
 'hospitality',
 'experience',
 'revered',
 'customers',
 'hotel',
 'located',
 'distance',
 'workshop',
 'bus',
 'stop',
 'km',
 'chhatrapati',
 'shivaji',
 'international',
 'airport',
 'km',
 'chhatrapati',
 'shivaji',
 'terminus',
 'guests',
 'head',
 'prominent',
 'tourist',
 'attractions',
 'like',
 'sanjay',
 'gandhi',
 'national',
 'park',
 'kidzania',
 'mumbai',
 'haji',
 'ali',
 'mosque',
 'iskon',
 'temple',
 'shree',
 'siddhivinayak',
 'temple',
 'many',
 'shoppers',
 'fun',

#### 13) Total number of unique words in the data

In [38]:

length_of_unique_words

3395

#### 14)  Building a mapping of words to an index and a reverse mapping from an index to a word

In [39]:
def build_indices(unique_words):
    """Number the words in iteration order and return both lookup directions.

    Returns ``(word_to_idx, idx_to_word)``. Indices start at 0 and follow the
    iteration order of ``unique_words``.
    """
    idx_to_word = dict(enumerate(unique_words))
    word_to_idx = {word: idx for idx, word in idx_to_word.items()}
    return word_to_idx, idx_to_word

#### 15) Building our indices

In [40]:

word_to_idx, idx_to_word = build_indices(unique_words)

#### 16) Preparing our training corpus

In [41]:
def prepare_corpus(corpus, word_to_idx):
    """Expand every token list into all of its prefixes of length >= 2, as ids.

    For a line with tokens ``t0 .. tk`` this emits ``[t0, t1]``,
    ``[t0, t1, t2]``, ..., ``[t0 .. tk]``, each token mapped through
    ``word_to_idx``. Lines with fewer than two tokens contribute nothing.

    Raises
    ------
    KeyError
        If a token of an emitted prefix is missing from ``word_to_idx``.
    """
    sequences = []
    for tokens in corpus:
        for prefix_end in range(2, len(tokens) + 1):
            prefix_ids = [word_to_idx[token] for token in tokens[:prefix_end]]
            sequences.append(prefix_ids)
    return sequences

In [42]:
# Calling the prepare_corpus method
sequences = prepare_corpus(cleaned_data, word_to_idx)
max_sequence_len = max([len(x) for x in sequences])

#### 17) Validating what we built just now

In [43]:
print(sequences[0])
print(sequences[1])

[948, 1516]
[948, 1516, 13]


#### 18) Words that are mapped to these indices

In [44]:
# NOTE(review): these three indices are hard-coded and are not the ones printed
# for sequences[0] / sequences[1] (948, 1516, 13). The indices come from
# enumerating a set in build_indices, so the index-to-word pairing is
# presumably not stable across kernel sessions — confirm before relying on
# the words shown.
print(idx_to_word[1647])
print(idx_to_word[867])
print(idx_to_word[1452])

iris
channel
enterprises


#### 19) Some metadata about the sequences built

In [45]:
# Total number of sequences
len(sequences)

51836

In [46]:
# Size of the longest sequence
max_sequence_len

308

## Building the text generator

#### 1) Defining build_input_data, which pads the input samples and splits our data into independent and dependent variables

In [47]:
def build_input_data(sequences, max_sequence_len, length_of_unique_words):
    """Pad the id sequences and split them into model inputs and targets.

    Every sequence is left-padded to ``max_sequence_len``. The last column
    becomes the target, one-hot encoded over ``length_of_unique_words``
    classes; the remaining columns become the input.

    NOTE(review): pad_sequences pads with 0 by default, and build_indices also
    assigns index 0 to a real word, so that word looks identical to padding —
    confirm whether word ids should start at 1.

    Returns ``(X, y)``.
    """
    padded = np.array(pad_sequences(sequences, maxlen = max_sequence_len, padding = 'pre'))
    features, next_word_ids = padded[:, :-1], padded[:, -1]
    targets = np_utils.to_categorical(next_word_ids, length_of_unique_words)
    return features, targets

In [48]:
X, y = build_input_data(sequences, max_sequence_len, length_of_unique_words)

#### 2) Defining and building our model

In [49]:
def create_model(max_sequence_len, length_of_unique_words):
    """Build and compile the next-word prediction network.

    Layers: a 10-dimensional Embedding over ``length_of_unique_words`` ids
    with input length ``max_sequence_len - 1``, an LSTM with 128 units,
    Dropout(0.2), and a softmax Dense layer with one unit per word. Compiled
    with categorical cross-entropy and the Adam optimizer.
    """
    network = Sequential([
        Embedding(length_of_unique_words, 10, input_length=max_sequence_len - 1),
        LSTM(128),
        Dropout(0.2),
        Dense(length_of_unique_words, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

#### 3) Creating the model and checking its summary

In [50]:
model = create_model(max_sequence_len, length_of_unique_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 307, 10)           33950     
                                                                 
 lstm (LSTM)                 (None, 128)               71168     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 3395)              437955    
                                                                 
Total params: 543,073
Trainable params: 543,073
Non-trainable params: 0
_________________________________________________________________


#### 4) Training the model

In [51]:
model.fit(X, y, batch_size = 512, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100

#### 5) Testing the model

In [None]:
def generate_text(seed_text, next_words, model, max_seq_len):
    """Extend ``seed_text`` by ``next_words`` words predicted by ``model``.

    On every step the current text is cleaned with ``clean_data``, mapped to
    ids with the module-level ``word_to_idx``, left-padded to
    ``max_seq_len - 1`` and fed to the model. The highest-probability id is
    mapped back through ``idx_to_word`` and appended to the text.

    Parameters
    ----------
    seed_text : str
        Text to start from.
    next_words : int
        Number of words to append.
    model :
        Trained network whose ``predict`` returns one score per vocabulary id.
    max_seq_len : int
        The sequence length used in training; the model input is one shorter.

    Returns
    -------
    str
        The extended text in title case.
    """
    for _ in range(next_words):
        # clean_data returns (token_lists, vocabulary); the seed is its only entry.
        token_lists, _vocabulary = clean_data([seed_text])
        # Build the id sequence directly so a seed with fewer than two cleaned
        # tokens still works, and skip words never seen during training
        # instead of raising KeyError.
        token_ids = [word_to_idx[token] for token in token_lists[0] if token in word_to_idx]
        padded = pad_sequences([token_ids], maxlen=max_seq_len-1, padding='pre')
        # Sequential.predict_classes was removed from Keras; taking the argmax
        # over predict() is the documented replacement for multi-class models.
        probabilities = model.predict(padded, verbose=0)
        predicted_id = int(np.argmax(probabilities, axis=-1)[0])
        seed_text = seed_text + " " + idx_to_word[predicted_id]

    return seed_text.title()

In [None]:
print(generate_text("in Mumbai there we need", 30, model, max_sequence_len))

In [None]:
print(generate_text("The beauty of the city", 30, model, max_sequence_len))