In [14]:
import pandas as pd
import re 

df = pd.read_csv('processed_news_data.csv')
df = df.iloc[:, 1:]

In [15]:
df['processed_description']

0      uk prime minister sir keir starmer signal deci...
1      hour prime minister taken motorcade white hous...
2      men come forward bbc accusing former chief exe...
3      around pensioner england wale set lose winter ...
4      donald trump said mass deport migrant small oh...
                             ...                        
349    one british tv unusual soap plot twist action ...
350                                                  NaN
351                                                  NaN
352                                                  NaN
353                                                  NaN
Name: processed_description, Length: 354, dtype: object

In [16]:
data = df
data = data.rename(columns={'Description': 'description'})
data = data.iloc[:, 4:5]

In [17]:
data

Unnamed: 0,processed_description
0,uk prime minister sir keir starmer signal deci...
1,hour prime minister taken motorcade white hous...
2,men come forward bbc accusing former chief exe...
3,around pensioner england wale set lose winter ...
4,donald trump said mass deport migrant small oh...
...,...
349,one british tv unusual soap plot twist action ...
350,
351,
352,


In [18]:
data = data.rename(columns={'processed_description': 'description'})


In [19]:
data

Unnamed: 0,description
0,uk prime minister sir keir starmer signal deci...
1,hour prime minister taken motorcade white hous...
2,men come forward bbc accusing former chief exe...
3,around pensioner england wale set lose winter ...
4,donald trump said mass deport migrant small oh...
...,...
349,one british tv unusual soap plot twist action ...
350,
351,
352,


In [20]:
def clean_text_no_numbers(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)  # This removes all digits
    # Remove special characters and keep only alphabetic characters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra spaces
    text = ' '.join(text.split())
    return text.lower()

In [23]:
data['description'].isnull().sum()

47

In [25]:
data = data.dropna()

In [26]:
data['description'] = data['description'].apply(clean_text_no_numbers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['description'] = data['description'].apply(clean_text_no_numbers)


In [42]:
word_source_train = []
for i in data['description']:
    word_source_train.extend(i.split(' '))

print("all the words in the corpus ", len(word_source_train))

all the words in the corpus  135976


In [27]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load dataset
# data = pd.read_csv('your_dataset.csv')  # Change this to your file path

# Clean text (optional but recommended)
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.replace("\n", " ").replace("\r", " ").strip()  # Remove newlines
    return text

data['description'] = data['description'].apply(clean_text)

# Tokenize descriptions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['description'])
vocab_size = len(tokenizer.word_index) + 1  # Plus one for padding token

# Convert descriptions to sequences
encoder_input_data = tokenizer.texts_to_sequences(data['description'])

# Find the maximum sequence length
max_len = max([len(seq) for seq in encoder_input_data])

# Pad sequences for consistent input length
encoder_input_data = pad_sequences(encoder_input_data, maxlen=max_len, padding='post')

print(f"Input data shape: {encoder_input_data.shape}")


Input data shape: (307, 1167)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['description'] = data['description'].apply(clean_text)


In [28]:
encoder_input_data

array([[  13,  167,   32, ...,    0,    0,    0],
       [ 197,  167,   32, ...,    0,    0,    0],
       [ 239,   30,  573, ..., 1259, 1834,  217],
       ...,
       [1428, 2679, 2710, ...,    0,    0,    0],
       [ 232,    3, 9000, ...,    0,    0,    0],
       [   8,  289,  319, ...,    0,    0,    0]])

In [29]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

latent_dim = 256  # Dimensionality of the latent space

# Encoder
encoder_inputs = Input(shape=(max_len,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


In [30]:
# Prepare the decoder input data (shifted by one timestep)
decoder_input_data = np.zeros_like(encoder_input_data)
decoder_input_data[:, 1:] = encoder_input_data[:, :-1]  # Shift sequences

# Prepare the decoder target data (shifted by one timestep)
decoder_target_data = np.expand_dims(encoder_input_data, -1)  # Add an extra dimension


In [31]:
decoder_input_data

array([[   0,   13,  167, ...,    0,    0,    0],
       [   0,  197,  167, ...,    0,    0,    0],
       [   0,  239,   30, ...,   45, 1259, 1834],
       ...,
       [   0, 1428, 2679, ...,    0,    0,    0],
       [   0,  232,    3, ...,    0,    0,    0],
       [   0,    8,  289, ...,    0,    0,    0]])

In [43]:
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=4,
    epochs=10,
    validation_split=0.2
)


Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 3s/step - accuracy: 0.6183 - loss: 3.1510 - val_accuracy: 0.5813 - val_loss: 3.6829
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 2s/step - accuracy: 0.6438 - loss: 2.9420 - val_accuracy: 0.5818 - val_loss: 3.6918
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 1s/step - accuracy: 0.6336 - loss: 3.0184 - val_accuracy: 0.5823 - val_loss: 3.6992
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 2s/step - accuracy: 0.6360 - loss: 2.9814 - val_accuracy: 0.5838 - val_loss: 3.6964
Epoch 5/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 2s/step - accuracy: 0.6187 - loss: 3.0911 - val_accuracy: 0.5840 - val_loss: 3.7025
Epoch 6/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 1s/step - accuracy: 0.6438 - loss: 2.8927 - val_accuracy: 0.5845 - val_loss: 3.7002
Epoch 7/10
[1m62/62[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x204e6c9b0b0>

In [44]:
# Encoder inference model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_inference = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs_inference, state_h_inference, state_c_inference = decoder_lstm_inference(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_outputs_inference = decoder_dense(decoder_outputs_inference)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs_inference, state_h_inference, state_c_inference])


In [45]:
def generate_summary(input_seq):
    # Get encoder states
    states_value = encoder_model.predict(input_seq)

    # Start with a "start" token
    target_seq = np.zeros((1, 1))
    summary = []

    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')
        if sampled_word == 'end' or len(summary) > max_len:
            break

        summary.append(sampled_word)

        # Update target sequence for next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(summary)

# Example: Generate summary for a new description
new_description = "Your new news description here"
new_seq = tokenizer.texts_to_sequences([new_description])
new_seq = pad_sequences(new_seq, maxlen=max_len, padding='post')
print(generate_summary(new_seq))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [46]:
# Example: Generate summary for a new description
new_description = '''The British economy is poised for strong growth into 2005, raising the possibility of an interest rate hike early in the new year, according to a study published Monday. 
 Yasser Arafat cannot be buried in Jerusalem, Israeli Prime Minister Ariel Sharon told his Cabinet, while affirming his pledge to let the ailing Palestinian leader return to the West Bank if he recovers. 
 Declaring Royal Dutch/Shell Group "an enemy of the Nigerian people," unions have called a Nov. 16 nationwide strike that they said would target oil exports in Africa's oil giant. 
 Sudan's government has expressed optimism on reaching a long-term political solution to crisis in its Darfur region after listening to a rebel group's proposals. 
 Militants in Afghanistan released a video showing three frightened foreign U.N. hostages pleading for their release, and threatened to kill them unless U.N. and British troops leave the country and Muslim prisoners are freed from U.S. jails. 
 Rugby fans hoping to watch the British and Irish Lions play in New Zealand next year began to stake their claims to tickets Monday under a ballot system. 
 Demonstrators burned incense and beat drums on Sunday at the ancient ruins of Teotihuacan in an effort to ward off the opening of a Wal-Mart-owned store nearby. 
 From the deserts of the south and west to the outskirts of Baghdad, Iraq is awash in weapons sites _ some large, others small; some guarded, others not. Even after the U.S. military secured some 400,000 tons of munitions, as many as 250,000 tons remain unaccounted for. 
 Arms experts say there are scores of weapons sites across Iraq. Key sites include: 
 The U.S. dollar was trading at 105.78 yen on the Tokyo foreign exchange market at 9 a.m. (0000 GMT) Monday, down 0.08 yen from late Friday.'''


In [47]:
import re
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Remove URLs
text = re.sub(r'http\S+', '', new_description)
# Remove special characters and digits
text = re.sub(r'[^a-zA-Z\s]', '', new_description)
# Remove extra spaces
text = ' '.join(new_description.split())
text = text.lower()

In [48]:
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(text)
text = ' '.join([word for word in tokens if word not in stop_words])


In [49]:
text

"british economy poised strong growth 2005 , raising possibility interest rate hike early new year , according study published monday . yasser arafat buried jerusalem , israeli prime minister ariel sharon told cabinet , affirming pledge let ailing palestinian leader return west bank recovers . declaring royal dutch/shell group `` enemy nigerian people , '' unions called nov. 16 nationwide strike said would target oil exports africa 's oil giant . sudan 's government expressed optimism reaching long-term political solution crisis darfur region listening rebel group 's proposals . militants afghanistan released video showing three frightened foreign u.n. hostages pleading release , threatened kill unless u.n. british troops leave country muslim prisoners freed u.s. jails . rugby fans hoping watch british irish lions play new zealand next year began stake claims tickets monday ballot system . demonstrators burned incense beat drums sunday ancient ruins teotihuacan effort ward opening wal-

In [None]:
new_seq = tokenizer.texts_to_sequences([new_description])
new_seq = pad_sequences(new_seq, maxlen=max_len, padding='post')
print(generate_summary(new_seq))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16