<a href="https://colab.research.google.com/github/RehanKhn/Boosting-Buddy-A-Conversational-Agent-for-Improvement-of-Mental-Health/blob/main/TextGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.embeddings import Embedding
import string
import re
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [2]:
# Authenticate the current Colab user, then build a PyDrive client that uses
# the application-default credentials. `drive` is the handle the download
# cell uses to fetch the corpus file.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# Reference the corpus on Google Drive by its file id and write its content
# to 'TestRU.txt' in the current working directory.
# NOTE(review): the Drive file id is hard-coded; presumably the notebook only
# re-runs for accounts that can read that file -- confirm sharing settings.
downloaded = drive.CreateFile({'id':'1mrhXtn3Dq_XXZ4fW_GlSOn1upXWzNsH4'})
downloaded.GetContentFile('TestRU.txt')

In [4]:
# Read the whole corpus into memory as one string.
# The context manager closes the file handle as soon as the read finishes
# (a bare open(...).read() leaves that to garbage collection), and the
# explicit encoding keeps decoding independent of the platform default.
with open("TestRU.txt", encoding="utf-8") as corpus_file:
    file = corpus_file.read()

In [5]:
def tokenize_words(input, stop_words=None):
    """Lower-case ``input`` and return its word tokens joined by single spaces.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore), so punctuation and all whitespace act as separators and are
    dropped from the result.

    Args:
        input: Text to normalise. (The name shadows the built-in ``input``;
            it is kept so existing keyword callers continue to work.)
        stop_words: Optional iterable of lower-case tokens to remove. With the
            default ``None`` (or an empty iterable) every token is kept, which
            is what the previous hard-coded empty stop-word list did.

    Returns:
        A single string containing the remaining tokens separated by one
        space each; the empty string when ``input`` contains no tokens.
    """
    # re.findall with this pattern yields the same tokens as
    # RegexpTokenizer(r'\w+').tokenize, without building a tokenizer per call.
    tokens = re.findall(r'\w+', input.lower())
    if stop_words:
        # A set gives constant-time membership tests while filtering.
        excluded = set(stop_words)
        tokens = [token for token in tokens if token not in excluded]
    return " ".join(tokens)

In [6]:
# Normalise the raw corpus: lower-cased word tokens joined by single spaces.
# Everything downstream works on the characters of this string.
processed_inputs = tokenize_words(file)

In [7]:
# Character vocabulary: each distinct character of the cleaned corpus, in
# sorted order, together with a lookup from character to its position.
chars = sorted(set(processed_inputs))
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

In [8]:
# Corpus size in characters and number of distinct characters; both are
# reused below when building training windows and scaling the inputs.
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 20017
Total vocab: 29


In [9]:
# Number of characters in each input window.
seq_length = 100
# Containers for the encoded input windows (x_data) and for the encoded
# character that follows each window (y_data).
x_data = []
y_data = []

In [10]:
# Slide a window of `seq_length` characters over the corpus one position at a
# time. Each window is one input sample and the character immediately after
# it is the target; both are stored as integer codes via `char_to_num`.
#
# The lists are rebuilt from scratch instead of appended to, so re-running
# this cell cannot duplicate samples left over from an earlier execution.
x_data = [
    [char_to_num[char] for char in processed_inputs[i:i + seq_length]]
    for i in range(input_len - seq_length)
]
y_data = [
    char_to_num[processed_inputs[i + seq_length]]
    for i in range(input_len - seq_length)
]

In [11]:
# Number of (window, next-character) training samples.
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 19917


In [12]:
# Arrange the encoded windows as (samples, timesteps, 1 feature) and divide
# the integer codes by the vocabulary size, which scales them into [0, 1).
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [13]:
# One-hot encode the target characters.
# `num_classes` is pinned to the vocabulary size so the width of `y` always
# matches `chars`; when it is omitted the width is inferred from
# max(y_data) + 1, which comes out narrower if the highest-numbered character
# never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [14]:
# Three stacked LSTM layers (256, 256, 128 units), each followed by 20%
# dropout. The first two return the full sequence so the next LSTM receives
# one vector per timestep; the last returns only its final state, which the
# softmax layer turns into a distribution over the classes in `y`.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [15]:
# Categorical cross-entropy against the one-hot targets, optimised with Adam.
# No `metrics` argument is passed, so only the loss is configured here.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [16]:
# Checkpoint callback: watch the training loss and write to `filepath` only
# on epochs where that loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [17]:
# Train for 20 epochs in batches of 128 samples; the checkpoint callback
# saves to disk whenever the training loss improves.
model.fit(X, y, epochs=20, batch_size=128, callbacks=desired_callbacks)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.87046, saving model to model_weights_saved.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.87046 to 2.82707, saving model to model_weights_saved.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.82707 to 2.82148, saving model to model_weights_saved.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.82148 to 2.81848, saving model to model_weights_saved.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.81848 to 2.80800, saving model to model_weights_saved.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.80800 to 2.75418, saving model to model_weights_saved.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.75418 to 2.62809, saving model to model_weights_saved.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.62809 to 2.50653, saving model to model_weights_saved.hdf5
Epoch 9/20

Epoch 00009: loss improved from 2.50653 to 2.40394, saving model to model_weights_saved.hdf5
Epoch 10/20

Epoch 00010: loss improved from 2.40394 to 2.3

<tensorflow.python.keras.callbacks.History at 0x7f9f9023c710>

In [18]:
# Evaluate on the same data the model was trained on.
# NOTE(review): the model is compiled without `metrics`, so `evaluate` should
# return the loss here -- despite its name, `train_acc` is presumably the
# training loss, not an accuracy. Confirm before reporting it.
train_acc = model.evaluate(X, y, verbose=1)



In [19]:
# Restore the weights from the checkpoint file written during training (the
# same path the checkpoint callback saves to), then compile again with the
# same loss and optimizer as before.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [20]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [21]:
# Pick a random training window to prime text generation.
# The upper bound of `randint` is exclusive, so passing `len(x_data)` (rather
# than `len(x_data) - 1`) lets every window, including the last, be chosen.
start = numpy.random.randint(0, len(x_data))
# Take a copy: `pattern` is modified while generating text, and sharing the
# list object with `x_data` would let those edits corrupt the training sample.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
"  tha aur mjhy sab kuch khud seekhna tha anger meri summer job mein meri kesi ny madad nahi ki aur mu "


In [22]:
# Greedy character-by-character generation of 1000 characters.
# At every step the current window is shaped and scaled exactly like the
# training inputs, the most probable next character is written to stdout,
# and the window moves forward by one character.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Plain int (not a numpy scalar) so the window stays a list of ints.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Drop the oldest character and add the prediction by building a new
    # list; an in-place append would also modify any list object that
    # `pattern` is shared with (e.g. the sample it was taken from).
    pattern = pattern[1:] + [index]

jhe pars kr lila jis me me ne acha naii ka ha tha fear me pe kari sa mara tha fear me pe kisi sa milna hona joy jb mere bacha tha joy jb mere bacha tha joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb mere bacha paida hoa joy jb m