In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping

In [90]:
df = pd.read_csv("/home/suhas/Desktop/office/Data Science/Notes/Data Analysis/Data/ArticlesApril2017.csv")

In [91]:
df.head()

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,58def1347c459f24986d7c80,716,By STEPHEN HILTNER and SUSAN LEHMAN,article,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",3,Insider,2,2017-04-01 00:15:41,Unknown,One of the largest photo displays in Times his...,The New York Times,News,https://www.nytimes.com/2017/03/31/insider/nig...
1,,58def3237c459f24986d7c84,823,By GAIL COLLINS,article,"And Now, the Dreaded Trump Curse","['United States Politics and Government', 'Tru...",3,OpEd,23,2017-04-01 00:23:58,Unknown,Meet the gang from under the bus.,The New York Times,Op-Ed,https://www.nytimes.com/2017/03/31/opinion/and...
2,,58def9f57c459f24986d7c90,575,By THE EDITORIAL BOARD,article,Venezuela’s Descent Into Dictatorship,"['Venezuela', 'Politics and Government', 'Madu...",3,Editorial,22,2017-04-01 00:53:06,Unknown,A court ruling annulling the legislature’s aut...,The New York Times,Editorial,https://www.nytimes.com/2017/03/31/opinion/ven...
3,,58defd317c459f24986d7c95,1374,By MICHAEL POWELL,article,Stain Permeates Basketball Blue Blood,"['Basketball (College)', 'University of North ...",3,Sports,1,2017-04-01 01:06:52,College Basketball,"For two decades, until 2013, North Carolina en...",The New York Times,News,https://www.nytimes.com/2017/03/31/sports/ncaa...
4,,58df09b77c459f24986d7ca7,708,By DEB AMLEN,article,Taking Things for Granted,['Crossword Puzzles'],3,Games,0,2017-04-01 02:00:14,Unknown,In which Howard Barkin and Will Shortz teach u...,The New York Times,News,https://www.nytimes.com/2017/03/31/crosswords/...


In [92]:
# Step 2: Extract non-null snippets
texts = df['snippet'].dropna().tolist()

In [93]:
texts

['One of the largest photo displays in Times history, of people who have fled Boko Haram, required an effort just as immense.',
 'Meet the gang from under the bus.',
 'A court ruling annulling the legislature’s authority represents a dangerous turning point for Venezuela.',
 'For two decades, until 2013, North Carolina engaged in academic improprieties that benefited its student-athletes. The repercussions are still being felt.',
 'In which Howard Barkin and Will Shortz teach us a lesson.',
 'In this visually ravishing production, Bobby Cannavale steps into a part that has been waiting for him for decades.',
 'For the author, this gleaming, complex city in British Columbia is “a rolling, improvisatory work in progress.”',
 'About $13 million has been paid out over the years to address complaints from women about Mr. O’Reilly’s behavior. He denies the claims have merit.',
 'Getting rid of mice isn’t easy, but it’s a co-op board’s responsibility if a building becomes infested.',
 'A prop

In [94]:
# Combine all snippets into one corpus with spaces
corpus = " ".join(texts)
print(f"corpus length:{len(corpus)} characters.")

corpus length:101940 characters.


In [95]:
# Step 3: Get unique characters and create mappings

chars = sorted(list(set(corpus)))
char_to_index = {ch:i for i,ch in enumerate(chars)}
index_to_char = {i:ch for i,ch in enumerate(chars)}
num_chars = len(chars)
print(f"Total unique characters:{num_chars}")

Total unique characters:85


In [96]:
#Step 4: Define sequence length and prepare training data
max_len = 40  # Number of characters in each input sequence
X = []
y = []

In [97]:
# Create sequences of max_len characters and their next character
for i in range(len(corpus)- max_len):
    seq = corpus[i:i + max_len]
    target = corpus[i + max_len]
    X.append([char_to_index[ch] for ch in seq])
    y.append(char_to_index[target])

In [98]:
X

[[38,
  63,
  54,
  0,
  64,
  55,
  0,
  69,
  57,
  54,
  0,
  61,
  50,
  67,
  56,
  54,
  68,
  69,
  0,
  65,
  57,
  64,
  69,
  64,
  0,
  53,
  58,
  68,
  65,
  61,
  50,
  74,
  68,
  0,
  58,
  63,
  0,
  43,
  58,
  62],
 [63,
  54,
  0,
  64,
  55,
  0,
  69,
  57,
  54,
  0,
  61,
  50,
  67,
  56,
  54,
  68,
  69,
  0,
  65,
  57,
  64,
  69,
  64,
  0,
  53,
  58,
  68,
  65,
  61,
  50,
  74,
  68,
  0,
  58,
  63,
  0,
  43,
  58,
  62,
  54],
 [54,
  0,
  64,
  55,
  0,
  69,
  57,
  54,
  0,
  61,
  50,
  67,
  56,
  54,
  68,
  69,
  0,
  65,
  57,
  64,
  69,
  64,
  0,
  53,
  58,
  68,
  65,
  61,
  50,
  74,
  68,
  0,
  58,
  63,
  0,
  43,
  58,
  62,
  54,
  68],
 [0,
  64,
  55,
  0,
  69,
  57,
  54,
  0,
  61,
  50,
  67,
  56,
  54,
  68,
  69,
  0,
  65,
  57,
  64,
  69,
  64,
  0,
  53,
  58,
  68,
  65,
  61,
  50,
  74,
  68,
  0,
  58,
  63,
  0,
  43,
  58,
  62,
  54,
  68,
  0],
 [64,
  55,
  0,
  69,
  57,
  54,
  0,
  61,
  50,
  67,
  56,
 

In [99]:
y

[54,
 68,
 0,
 57,
 58,
 68,
 69,
 64,
 67,
 74,
 8,
 0,
 64,
 55,
 0,
 65,
 54,
 64,
 65,
 61,
 54,
 0,
 72,
 57,
 64,
 0,
 57,
 50,
 71,
 54,
 0,
 55,
 61,
 54,
 53,
 0,
 25,
 64,
 60,
 64,
 0,
 31,
 50,
 67,
 50,
 62,
 8,
 0,
 67,
 54,
 66,
 70,
 58,
 67,
 54,
 53,
 0,
 50,
 63,
 0,
 54,
 55,
 55,
 64,
 67,
 69,
 0,
 59,
 70,
 68,
 69,
 0,
 50,
 68,
 0,
 58,
 62,
 62,
 54,
 63,
 68,
 54,
 10,
 0,
 36,
 54,
 54,
 69,
 0,
 69,
 57,
 54,
 0,
 56,
 50,
 63,
 56,
 0,
 55,
 67,
 64,
 62,
 0,
 70,
 63,
 53,
 54,
 67,
 0,
 69,
 57,
 54,
 0,
 51,
 70,
 68,
 10,
 0,
 24,
 0,
 52,
 64,
 70,
 67,
 69,
 0,
 67,
 70,
 61,
 58,
 63,
 56,
 0,
 50,
 63,
 63,
 70,
 61,
 61,
 58,
 63,
 56,
 0,
 69,
 57,
 54,
 0,
 61,
 54,
 56,
 58,
 68,
 61,
 50,
 69,
 70,
 67,
 54,
 82,
 68,
 0,
 50,
 70,
 69,
 57,
 64,
 67,
 58,
 69,
 74,
 0,
 67,
 54,
 65,
 67,
 54,
 68,
 54,
 63,
 69,
 68,
 0,
 50,
 0,
 53,
 50,
 63,
 56,
 54,
 67,
 64,
 70,
 68,
 0,
 69,
 70,
 67,
 63,
 58,
 63,
 56,
 0,
 65,
 64,
 58,
 63,
 69,


In [100]:
X = np.array(X)
y = np.array(y)

In [101]:
X

array([[38, 63, 54, ..., 43, 58, 62],
       [63, 54,  0, ..., 58, 62, 54],
       [54,  0, 64, ..., 62, 54, 68],
       ...,
       [51, 50, 68, ..., 50, 68, 68],
       [50, 68, 54, ..., 68, 68, 58],
       [68, 54, 53, ..., 68, 58, 52]])

In [102]:
y

array([54, 68,  0, ..., 58, 52, 10])

In [103]:
# One-hot encode the target variable
y = to_categorical(y,num_classes=num_chars)

In [104]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [105]:
# Define model parameters
embedding_dim = 10

In [106]:
model = Sequential([
    Embedding(input_dim=num_chars,output_dim=embedding_dim,input_length=max_len),
    Bidirectional(LSTM(128,dropout=0.2,recurrent_dropout=0.2)),
    Dense(num_chars,activation='softmax')
    
])

In [107]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 40, 10)            850       
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               142336    
_________________________________________________________________
dense_2 (Dense)              (None, 85)                21845     
Total params: 165,031
Trainable params: 165,031
Non-trainable params: 0
_________________________________________________________________


In [108]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [109]:
early_stopping = EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

In [110]:
model.fit(X,y,batch_size=128,epochs=20,validation_split=0.2,callbacks=[early_stopping],verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fdd2c3558d0>

In [113]:
# Function to sample from predictions with temperature

def sample(preds,temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-10)/temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1,preds,1)
    return np.argmax(probas)

In [115]:
# Function to generate text
def generate_text(model, seed_text, num_chars_to_generate, temperature=0.5):
    if len(seed_text) < max_len:
        seed_text = seed_text.rjust(max_len)  # Pad with spaces if too short
    generated = seed_text
    for _ in range(num_chars_to_generate):
        # Prepare input sequence
        x_pred = np.zeros((1, max_len), dtype=np.int32)
        for t, char in enumerate(seed_text[-max_len:]):
            x_pred[0, t] = char_to_index.get(char, 0)  # Use 0 for unknown chars
        
        # Predict next character
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = index_to_char[next_index]
        
        # Update generated text and seed
        generated += next_char
        seed_text = seed_text[1:] + next_char
    return generated

# Generate text using the first 40 characters of the corpus as the seed
seed_text = corpus[:max_len]
print(f"\nSeed text: '{seed_text}'")
generated_text = generate_text(model, seed_text, num_chars_to_generate=1000, temperature=0.5)
print("\nGenerated Text:")
print(generated_text)


Seed text: 'One of the largest photo displays in Tim'

Generated Text:
One of the largest photo displays in Times states to sead and the presidents. Whe Stated and work the fricks and defures the porcent in a stute and the president Trump to is the conter the stare of the pourtand a lemection of the president Trump and the mone be new a polical as the reconal can tear goudd probles the vare the strees the polical the president the president the to say be one the president Trump with the recention and the string streats and the carsitions at a contenter his presidents. The Will with “tars a president in the proverce distle it the president the president Trump a buck a compunical the pacies for the rolical mone the is dare wall a ling the convered a new York the fights in the couted to warger ade and and planing and the cresident they and the says linger that stanger for most new York in a toush of a stape emerican reporic. A will pare that staring the Gear of the proting with pore an a