<a href="https://colab.research.google.com/github/TanyaAggrawal/AI_NLP/blob/main/AI_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Training an AI to create poetry using NLP

In [None]:
#import all the required libraries

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
import numpy as np

#Download the Irish song lyrics dataset
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/irish-lyrics-eof.txt \
    -O /tmp/irish-lyrics-eof.txt


# Build the training corpus and the vocabulary used for the poetry model.

# 1) Get data
# Read through a context manager so the file handle is closed as soon as
# the text has been loaded instead of being left to the garbage collector.
with open("/tmp/irish-lyrics-eof.txt") as lyrics_file:
  data = lyrics_file.read()

# One corpus entry per line of the file, lower-cased.
corpus = data.lower().split('\n')

# 2) Tokenize data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One more than the number of distinct words: the extra slot is what the
# embedding / softmax layers below are sized with.
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)

# 3) Make sequences
# For each corpus line, emit every prefix of its token ids that is at least
# two tokens long: the last token of a prefix becomes the label and the
# tokens before it become the predictor.
input_sequences = [
  encoded_line[:prefix_end]
  for encoded_line in (tokenizer.texts_to_sequences([text])[0] for text in corpus)
  for prefix_end in range(2, len(encoded_line) + 1)
]

# Length of the longest prefix; every sequence is padded up to it.
max_sequence_len = max(map(len, input_sequences))

# Pad data (on the left, so the label always sits in the last column)
sequence_padding = np.array(
  pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Predictors and labels: everything but the last column vs. the last column
xs = sequence_padding[:, :-1]
labels = sequence_padding[:, -1]
# One-hot encode the labels over the whole vocabulary
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Define, compile, fit, and summarise the next-word prediction model
model = Sequential()
# Inputs are the padded predictors, i.e. one token shorter than the padded sequences.
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
# One output unit per vocabulary slot; softmax matches the one-hot labels in `ys`.
model.add(Dense(total_words, activation='softmax'))
# Use `learning_rate`: the old `lr` alias is deprecated and rejected by newer Keras releases.
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(xs, ys, epochs=100, verbose=1)
model.summary()
print(model)

# Plot accuracy and history 
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot a single recorded training metric and show the figure.

  history: object whose `.history` mapping is indexed by metric name
    (here, the value returned by `model.fit`).
  string: name of the metric to plot; also used as the y-axis label.
  """
  metric_values = history.history[string]
  plt.plot(metric_values)
  plt.ylabel(string)
  plt.xlabel("Epochs")
  plt.show()


plot_graphs(history, 'accuracy')


# Generate text: repeatedly predict the next word and append it to the seed.
seed_text = "I've got a bad feeling about this"
next_words = 100

for _ in range(next_words):
  # Encode the text generated so far and pad it to the model's input length.
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
  # `Sequential.predict_classes` was removed from TensorFlow; take the
  # arg-max over the softmax output instead (batch of one -> element 0).
  probabilities = model.predict(token_list, verbose=0)
  predicted = int(np.argmax(probabilities, axis=-1)[0])
  # Reverse lookup via the tokenizer's index->word map instead of scanning
  # the whole vocabulary; an index with no word yields an empty string,
  # as the original scan did.
  output_word = tokenizer.index_word.get(predicted, "")
  seed_text += " " + output_word
print(seed_text)


Building a sarcasm classifier using NLP

In [None]:
#Download the sarcastic headlines JSON dataset (headlines, article URLs, and labels) into /tmp
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

#import json library
import json

# Load the JSON dataset into Python objects
with open('/tmp/sarcasm.json', 'r', encoding='utf-8') as f:
  datastore = json.load(f)

# Split the records into two parallel lists: headline text and its label
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

# Settings shared by the tokenizer, the padding step and the model
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
# Number of leading records used for training; the remainder is used for testing
training_size = 20000

# Prepare the training and testing sentences and labels.
# The split point comes from `training_size` so it is defined in one place
# rather than repeated as a literal in every slice.
training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[:training_size]
testing_labels = labels[training_size:]

#import libraries

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training sentences only; the testing sentences
# are encoded with that same fitted tokenizer below.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Both splits are padded/truncated with identical settings.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, **pad_options)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **pad_options)

# Convert the padded sequences and the label lists to NumPy arrays
import numpy as np
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_labels, testing_labels = np.array(training_labels), np.array(testing_labels)


# Build the classifier: embedding -> average over the sequence -> dense -> sigmoid
model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                             tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.Dense(24, activation='relu'),
                             # Single sigmoid unit: the labels are binary (is_sarcastic)
                             tf.keras.layers.Dense(1, activation='sigmoid')])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

# Train the model. Without this step the classifier is never fitted and
# `history` would still refer to the earlier poetry model's run, which has
# no validation metrics for the plots that follow.
num_epochs = 30  # chosen value; adjust as needed
history = model.fit(training_padded, training_labels,
                    epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels),
                    verbose=2)

#Plot our loss and accuracy

import matplotlib.pyplot as plt
def plot_graphs(history, string):
  """Plot a training metric together with its validation counterpart.

  history: object whose `.history` mapping is indexed by metric name and
    contains both `string` and 'val_' + `string`.
  string: base metric name; also used as the y-axis label.
  """
  series_names = [string, 'val_' + string]
  for series in series_names:
    plt.plot(history.history[series])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(series_names)
  plt.show()


for metric_name in ("accuracy", "loss"):
  plot_graphs(history, metric_name)

# Predict using our model
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sequences = tokenizer.texts_to_sequences(sentence)
# Pad exactly as the training data was padded (same length, padding side and
# truncation side) so the model sees inputs in the layout it was trained on.
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))
