In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [7]:
# Load the articles dataset from a CSV file in the working directory.
data = pd.read_csv('ArticlesMarch2018.csv')


# Summarise the dataset: column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called bare; wrapping it in print() would emit a stray "None" line.
print("Dataset summary (columns, non-null counts, dtypes):")
data.info()



First few rows of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1385 entries, 0 to 1384
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   articleID         1385 non-null   object
 1   byline            1385 non-null   object
 2   documentType      1385 non-null   object
 3   headline          1385 non-null   object
 4   keywords          1385 non-null   object
 5   multimedia        1385 non-null   int64 
 6   newDesk           1385 non-null   object
 7   printPage         1385 non-null   int64 
 8   pubDate           1385 non-null   object
 9   sectionName       1385 non-null   object
 10  snippet           1385 non-null   object
 11  source            1385 non-null   object
 12  typeOfMaterial    1385 non-null   object
 13  webURL            1385 non-null   object
 14  articleWordCount  1385 non-null   int64 
dtypes: int64(3), object(12)
memory usage: 162.4+ KB
None


In [8]:




# Prepare the headline -> sectionName classification data, then build, train
# and evaluate a single-layer SimpleRNN baseline.

# Hyperparameters used by the tokenizer, the padding step and the embedding.
VOCAB_SIZE = 5000   # passed to Tokenizer(num_words=...) and Embedding(input_dim=...)
MAX_LEN = 100       # every headline is padded/truncated to this many tokens
EMBED_DIM = 64      # size of each learned word vector
SEED = 42           # used for the train/test split and for weight initialisation

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable across Restart-and-Run-All executions.
tf.keras.utils.set_random_seed(SEED)

# Model input is the headline text; the target is the section name.
texts = data['headline'].astype(str).values
labels = data['sectionName'].astype(str).values

# Encode section names as integer class ids (required by the sparse
# categorical cross-entropy loss used below).
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Turn headlines into fixed-length integer sequences.
# NOTE(review): the tokenizer is fitted on all headlines before the split, so
# the vocabulary also reflects test-set text — confirm this is acceptable.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=SEED
)

# Baseline: embedding -> one SimpleRNN layer -> dense head with a softmax over
# the section classes. The deprecated `input_length` argument is omitted; the
# sequence length is taken from the data on the first call to fit().
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the held-out test split doubles as validation data here, so
# the per-epoch val_* metrics and the final test metrics use the same rows.
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")




Epoch 1/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 65ms/step - accuracy: 0.3311 - loss: 3.2332 - val_accuracy: 0.6968 - val_loss: 1.8935
Epoch 2/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.6657 - loss: 1.8675 - val_accuracy: 0.6823 - val_loss: 2.4469
Epoch 3/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - accuracy: 0.6763 - loss: 1.6424 - val_accuracy: 0.6968 - val_loss: 1.5157
Epoch 4/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 66ms/step - accuracy: 0.6693 - loss: 1.3510 - val_accuracy: 0.6968 - val_loss: 1.4663
Epoch 5/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 62ms/step - accuracy: 0.7031 - loss: 1.1637 - val_accuracy: 0.6895 - val_loss: 1.4692
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.6979 - loss: 1.4596
Test Accuracy: 0.69


In [9]:
# Build a stacked RNN model: two SimpleRNN layers on top of the embedding.
# The vocabulary size is read from the fitted tokenizer so the embedding
# cannot drift out of sync with the preprocessing, and the deprecated
# `input_length` argument is omitted (the sequence length is taken from the
# training data on the first call to fit()).
stacked_rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=tokenizer.num_words, output_dim=64),
    # return_sequences=True emits the output at every timestep so that the
    # second recurrent layer receives a sequence rather than a single vector.
    tf.keras.layers.SimpleRNN(64, return_sequences=True),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the stacked RNN model (integer class ids -> sparse cross-entropy)
stacked_rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model, reporting metrics on the held-out split after each epoch
stacked_rnn_history = stacked_rnn_model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

# Evaluate the model on the held-out split
stacked_rnn_loss, stacked_rnn_accuracy = stacked_rnn_model.evaluate(X_test, y_test)
print(f"Stacked RNN Test Accuracy: {stacked_rnn_accuracy:.2f}")


Epoch 1/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 277ms/step - accuracy: 0.4450 - loss: 2.9968 - val_accuracy: 0.6968 - val_loss: 1.7596
Epoch 2/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 167ms/step - accuracy: 0.6631 - loss: 1.8139 - val_accuracy: 0.6968 - val_loss: 1.5456
Epoch 3/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 207ms/step - accuracy: 0.6655 - loss: 1.6057 - val_accuracy: 0.6968 - val_loss: 1.5105
Epoch 4/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 128ms/step - accuracy: 0.6887 - loss: 1.4546 - val_accuracy: 0.6968 - val_loss: 1.4666
Epoch 5/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 91ms/step - accuracy: 0.6948 - loss: 1.2740 - val_accuracy: 0.6679 - val_loss: 1.5412
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.6787 - loss: 1.4694
Stacked RNN Test Accuracy: 0.67


In [10]:
# Build a bi-directional RNN model: one SimpleRNN wrapped in Bidirectional so
# each headline is read both forwards and backwards.
# The vocabulary size is read from the fitted tokenizer so the embedding
# cannot drift out of sync with the preprocessing, and the deprecated
# `input_length` argument is omitted (the sequence length is taken from the
# training data on the first call to fit()).
bidirectional_rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=tokenizer.num_words, output_dim=64),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the bi-directional RNN model (integer class ids -> sparse cross-entropy)
bidirectional_rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model, reporting metrics on the held-out split after each epoch
bidirectional_rnn_history = bidirectional_rnn_model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

# Evaluate the model on the held-out split
bidirectional_rnn_loss, bidirectional_rnn_accuracy = bidirectional_rnn_model.evaluate(X_test, y_test)
print(f"Bi-Directional RNN Test Accuracy: {bidirectional_rnn_accuracy:.2f}")


Epoch 1/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 129ms/step - accuracy: 0.5882 - loss: 2.4914 - val_accuracy: 0.6968 - val_loss: 1.5372
Epoch 2/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 104ms/step - accuracy: 0.6727 - loss: 1.5492 - val_accuracy: 0.6968 - val_loss: 1.4996
Epoch 3/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 77ms/step - accuracy: 0.6771 - loss: 1.4061 - val_accuracy: 0.6968 - val_loss: 1.4754
Epoch 4/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step - accuracy: 0.6918 - loss: 1.2137 - val_accuracy: 0.6787 - val_loss: 1.4877
Epoch 5/5
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 90ms/step - accuracy: 0.7639 - loss: 0.9574 - val_accuracy: 0.6534 - val_loss: 1.5512
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.6693 - loss: 1.5175
Bi-Directional RNN Test Accuracy: 0.65


In [None]:
# Unseen headlines to classify with the trained basic RNN
new_headlines = [
    "Trump Proclaims Tariffs On Steel and Aluminum, And Stocks Sag in Reply",
    "U.S. military prepares for North Korean threat",
    "China's government extends term limits for president",
    "Getting a Taste of a Difficult Year"
]

# Reuse the fitted tokenizer and the training-time padding length so the new
# inputs have the same form as the sequences the model was trained on
new_sequences = tokenizer.texts_to_sequences(new_headlines)
new_padded_sequences = pad_sequences(new_sequences, maxlen=100)

# One row of softmax scores per headline; the highest-scoring column is the
# predicted class id
predictions_basic_rnn = model.predict(new_padded_sequences)
predicted_classes_basic_rnn = predictions_basic_rnn.argmax(axis=1)

# Translate integer class ids back into section names
predicted_section_names_basic_rnn = label_encoder.inverse_transform(predicted_classes_basic_rnn)

# Show each headline next to its predicted section
for headline, section in zip(new_headlines, predicted_section_names_basic_rnn):
    print(f"Headline: '{headline}'")
    print(f"Predicted Section (Basic RNN): {section}")
    print("----")





