In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tqdm
from tqdm.keras import TqdmCallback
import re
import unicodedata

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from string import punctuation

In [3]:
# Location of the constitution dataset CSV.
FILEPATH = '/content/Constitution Dataset.csv'

# Read the CSV, discard every row that has a missing value in any column,
# and renumber the surviving rows from 0.
df = (
    pd.read_csv(FILEPATH)
    .dropna()
    .reset_index(drop=True)
)

# clean the text
def clean(txt):
    """Normalise a piece of text for character-level modelling.

    Steps, in order: lower-case, strip accents, delete ASCII punctuation,
    then collapse every run of non-word characters into a single space.

    Args:
        txt: the raw text (a ``str``).

    Returns:
        The cleaned string. Leading and trailing spaces are not trimmed.
    """
    txt = txt.lower()  # lower caps
    # Strip accents *before* touching non-word characters: a combining mark
    # (category 'Mn') is itself a non-word character, so running the \W+
    # substitution first would turn a mark in the middle of a word into a
    # space and split the word in two.
    txt = ''.join(c for c in unicodedata.normalize('NFD', txt)
                  if unicodedata.category(c) != 'Mn')
    txt = txt.translate(str.maketrans("", "", punctuation))  # remove punctuations
    # Raw string so that \W reaches the regex engine as written instead of
    # being parsed as an (invalid) string escape.
    txt = re.sub(r'\W+', ' ', txt)  # remove special chars
    return txt

# Run every preamble through clean() and store the result in a new column.
df['Preamble_clean'] = df['Preamble'].apply(clean)

# Preview the first rows to eyeball the cleaned text next to the original.
df.head()

Unnamed: 0,Country,Year Enacted,Scope,Length (in Words),Executive Power,Legislative Power,Judicial Independence,Number of Rights,Preamble,Preamble_clean
0,Afghanistan,2004,0.67,10227,6,0.38,2,37,"In the name of Allah, the Most Beneficent, the...",in the name of allah the most beneficent the m...
1,Albania,1998,0.61,13826,5,0.43,5,77,"We, the people of Albania, proud and aware of ...",we the people of albania proud and aware of ou...
2,Algeria,1996,0.61,10038,7,0.29,1,36,The Algerian people are a free people; and the...,the algerian people are a free people and they...
3,Andorra,1993,0.51,8740,6,0.19,3,51,"The Andorran People, with full liberty and ind...",the andorran people with full liberty and inde...
4,Angola,2010,0.8,27181,7,0.19,2,80,"We, the people of Angola, through its lawful r...",we the people of angola through its lawful rep...


In [5]:
# Hyper-parameters. sequence_length is the window size used by split_sample
# and the time dimension of the model input; BATCH_SIZE and EPOCHS are
# presumably consumed by a training cell that is not shown here.
sequence_length = 100
BATCH_SIZE = 512
EPOCHS = 20

# One long string: every cleaned preamble, trimmed, joined by a single space.
corpus = ' '.join(map(str.strip, df['Preamble_clean']))

# Corpus statistics: total length and the sorted set of distinct characters.
n_chars = len(corpus)
vocab = ''.join(sorted(set(corpus)))
n_unique_chars = len(vocab)

print("unique_chars:", vocab)
print("Number of characters:", n_chars)
print("Number of unique characters:", n_unique_chars)

unique_chars:  0123456789abcdefghijklmnopqrstuvwxyz
Number of characters: 345016
Number of unique characters: 37


In [9]:
def split_sample(sample):
    """Turn one chunk of encoded text into a dataset of (input, target) pairs.

    A window of ``sequence_length`` items (module-level constant) slides over
    ``sample`` one position at a time; each window is an input and the item
    immediately after it is the target.

    Example with sequence_length = 10 and sample = "python is a great pro"
    (21 items), shown as text for readability:
        ('python is ', 'a'), ('ython is a', ' '), ('thon is a ', 'g'),
        ('hon is a g', 'r'), ..., ('a great pr', 'o')

    Args:
        sample: a sequence that supports ``len()``, indexing and slicing
            (presumably a 1-D tensor of integer-encoded characters; confirm
            against the caller). It must hold at least
            ``sequence_length + 1`` items.

    Returns:
        A ``tf.data.Dataset`` with one ``(input, target)`` element per
        window position.
    """
    # Number of window positions that still have a target item after them.
    # Deriving the bound from the window arithmetic (instead of from
    # len(sample) // 2) includes the final window and never indexes past the
    # end, whatever the sample length is.
    n_windows = len(sample) - sequence_length

    # Window at position 0 seeds the dataset.
    ds = tf.data.Dataset.from_tensors((sample[:sequence_length], sample[sequence_length]))
    for i in range(1, n_windows):
        input_ = sample[i: i+sequence_length]
        target = sample[i+sequence_length]
        # Each pair becomes a one-element dataset appended to the running one.
        other_ds = tf.data.Dataset.from_tensors((input_, target))
        ds = ds.concatenate(other_ds)
    return ds

In [10]:
# Character-level network: two stacked LSTM layers with dropout in between,
# followed by a softmax over the vocabulary.
model = Sequential()
# The first LSTM takes (sequence_length, n_unique_chars) inputs and returns
# the full sequence so the second LSTM can consume it.
model.add(LSTM(256, input_shape=(sequence_length, n_unique_chars), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(256))
model.add(Dense(n_unique_chars, activation="softmax"))

# File name for the model, tagged with the window length.
model_file = f"textgen-{sequence_length}.h5"

# Show the architecture, then configure the loss, optimizer and metric.
model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 100, 256)          301056    
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 37)                9509      
                                                                 
Total params: 835,877
Trainable params: 835,877
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Prints three fixed lines; no seed or generated-text variable is
# interpolated here.
# NOTE(review): this looks like a placeholder for the real generation
# output - confirm.
print("Seed:")
print("Generated text:", "generated", sep="\n")

Seed:
Generated text:
generated
