In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
import json

In [None]:
# Path to the JSON file holding the training records.
json_file_path = '/content/drive/MyDrive/nytimes_train.json'

# Parse the whole file into memory. The encoding is passed explicitly so the
# read does not depend on the platform's default locale encoding.
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Preview the first few records to see which fields each one carries.
num_samples_to_print = 5
for i, article in enumerate(data[:num_samples_to_print]):
    print(f"Article {i+1}: {article}")

# A bare `len(data)` in the middle of a cell displays nothing (only the last
# expression of a cell is echoed), so print the record count explicitly.
print(len(data))
print(type(data))

Article 1: {'section': 'Theater', 'headline': "Before 'Moonlight' and 'The Walking Dead,' a Friendship Born in the Classroom", 'article_url': 'https://www.nytimes.com/2017/02/21/theater/danai-gurira-andre-holland-walking-dead.html', 'article': 'Danai Gurira and Andre Holland in a theater at New York University, where they met in the Tisch Graduate Acting Program.\n\nBehind every successful person are relationships that helped forge a path. But the stories of these friendships, collaborations, alliances, romances or rivalries often are lost in the glow of achievement. In this new feature, we explore a personal connection that made a difference in the lives of two artists. Andre Holland never thought much about writing his own monologues when he attended the Tisch graduate acting program at New York University. But one day, early in his first semester in 2003, he watched another African-American student, Danai Gurira, forgo the usual speeches by white characters and perform her own mater

In [None]:
# Display the 'article' field (the body text) of the record at index 1.
data[1]['article']

'CRIP CAMP: A DISABILITY REVOLUTION (2020) Stream on Netflix. This documentary, the latest offering from Barack Obama and Michelle Obama\'s production company, draws a direct line between a Catskills summer camp and the American disability rights movement of the 1970s. Directed by Jim LeBrecht and Nicole Newnham, the film begins by focusing on Camp Jened, which was founded in the early 1950s and served as a community for campers with disabilities. But it eventually shifts focus to look at the adult lives of some of the camp\'s alumni, several of whom became prominent activists. In his review for The New York Times, Ben Kenigsberg wrote that the film "unfolds from a perspective of lived experience." Newnham and LeBrecht, he added, "deftly juggle a large cast of characters past and present, accomplishing the not-so-easy task of making all the personalities distinct."\n\nDARK PHOENIX (2019) 9 p.m. on HBO. In an interview with The Times last year, the actress Sophie Turner discussed the mo

# 1. Data Processing:

--------------------------------------------------------------------------------------------------------

In [None]:
# Pull the two text fields out of every record, keeping the order of `data`:
# 'article' is the body text and 'abstract' is its summary. A record that
# lacks either key raises KeyError here.
articles = [record['article'] for record in data]
abstracts = [record['abstract'] for record in data]

In [None]:
# Sanity check: display how many article bodies and abstracts were extracted.
# Both lists are built from the same pass over `data`, so equal counts are
# expected; note this does not detect abstracts that are present but empty.
len(articles),len(abstracts)

(48988, 48988)

In [None]:
# Fit a single vocabulary over articles and abstracts together so that the
# same integer id refers to the same word on both sides of the model.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(articles + abstracts)

# Map every text to its sequence of integer word ids.
articles_seqs = tokenizer.texts_to_sequences(articles)
Abstracts_seqs = tokenizer.texts_to_sequences(abstracts)

# Pad/truncate at the end ('post') so every row has the same width.
#
# Articles: only the first 200 tokens are ever consumed (each later use of this
# matrix slices `[:, :200]`, the encoder's input length). Padding to 15000
# would allocate a 48988 x 15000 int32 matrix (~2.9 GB) whose extra columns are
# never read, so truncate to 200 here. With truncating='post' the kept tokens
# are exactly the ones the later `[:, :200]` slice would have selected.
# Keep this value in step with the encoder input length if that ever changes.
articles_seqs_padded = pad_sequences(articles_seqs, maxlen=200, padding='post', truncating='post')

# Abstracts: deliberately kept wider than the 200-token decoder window, because
# the training targets are the abstract shifted left by one token and so need
# the token at column 200 as well.
Abstracts_seqs_padded = pad_sequences(Abstracts_seqs, maxlen=250, padding='post', truncating='post')

For the model architecture I followed Seq2Seq with Attention. This architecture consists of an encoder-decoder framework: the encoder processes the input sequence (the article body), and the decoder generates the output sequence (the abstract). The attention mechanism helps the decoder focus on the relevant parts of the input sequence during generation.

# 2. Model Architecture

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, Concatenate

# Model hyperparameters.
embedding_dim = 100
# +1 because Tokenizer word ids start at 1; id 0 is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1
# Number of tokens fed to both the encoder and the decoder.
max_len=200

# ---- Encoder ----------------------------------------------------------------
# Embeds the article tokens and runs them through an LSTM. The per-step outputs
# feed the attention layer; the final hidden/cell states initialise the decoder.
encoder_inputs = Input(shape=(max_len,))
# mask_zero=True makes id 0 (padding) a masked position for downstream layers.
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# ---- Decoder ----------------------------------------------------------------
# Embeds the abstract tokens and runs an LSTM that starts from the encoder's
# final states.
decoder_inputs = Input(shape=(max_len,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Dot-product attention: decoder steps are the queries, encoder steps the values.
attention_layer = Attention()
attention_out = attention_layer([decoder_lstm_outputs, encoder_outputs])

# Join each decoder step with its attention context along the feature axis.
# A Keras `Concatenate` layer is used instead of `tf.concat`, so the functional
# graph is built purely from layers; raw TF ops on symbolic Keras tensors are
# rejected by newer Keras versions.
decoder_concat_input = Concatenate(axis=-1)([decoder_lstm_outputs, attention_out])

# Per-step probability distribution over the whole vocabulary.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

# Full model: (article tokens, abstract tokens) -> next-token distributions.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


# 3. Compile Optimization and loss function


In [None]:

# Configure training: Adam optimiser with sparse categorical cross-entropy,
# which takes integer token ids as targets rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
)


# 4. Model Training

In [None]:

# Training hyperparameters.
batch_size = 32
epochs = 10

# The model's Input layers accept exactly `max_len` tokens. Trim both padded
# matrices once, up front, instead of re-slicing every batch; shuffling the
# trimmed matrices also avoids copying the full-width article matrix each epoch.
encoder_input_all = articles_seqs_padded[:, :max_len]
decoder_input_all = Abstracts_seqs_padded[:, :max_len]

# Teacher forcing: the target at step t is the abstract token at step t + 1.
# Build it as an explicit left shift that is zero-padded at the end. Unlike
# `np.roll`, this can never wrap the first token round into the last target
# column. While the padded abstracts are wider than `max_len`, the values are
# identical to rolling and then trimming.
shifted_tokens = Abstracts_seqs_padded[:, 1:max_len + 1]
decoder_target_all = np.zeros_like(decoder_input_all)
decoder_target_all[:, :shifted_tokens.shape[1]] = shifted_tokens

num_samples = len(encoder_input_all)

# Training loop
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')

    # Draw a fresh sample order each epoch and apply it to all three arrays so
    # that inputs and targets stay aligned row by row.
    indices = np.random.permutation(num_samples)
    encoder_input_data = encoder_input_all[indices]
    decoder_input_data = decoder_input_all[indices]
    decoder_target_data = decoder_target_all[indices]

    # One gradient step per mini-batch; the final batch may be smaller.
    for i in range(0, num_samples, batch_size):
        batch = slice(i, i + batch_size)
        model.train_on_batch(
            [encoder_input_data[batch], decoder_input_data[batch]],
            decoder_target_data[batch],
        )

# Evaluate the model once training has finished. This measures loss on the
# training data itself (in its last shuffled order), not on held-out data.
loss = model.evaluate([encoder_input_data, decoder_input_data], decoder_target_data, verbose=0)
print(f' - Loss: {loss}')




Epoch 1/10


# Conclusion:

Due to the high computational cost, my laptop could not complete the task: it stalled at the training stage, where the runtime exceeded one hour before the kernel crashed. Without a completed training step, it is difficult to go further and finish the whole approach.

the next steps should be:
  ### <small>5.Validation stage.</small>
* A separate validation dataset will be used to evaluate the model's performance during training.
* After each epoch, the model's performance on the validation dataset will be assessed using the evaluate method to compute the loss.


 ### <small>6.Adjustments and Tuning.</small>
 * Batch size, sequence length, and the number of epochs will be adjusted.
 * Hyperparameter tuning and experimentation will continue until the best-performing model is found.

   ### <small>7.Evaluation stage.</small>
* Try the model on a new, held-out dataset (the test set).