## 1.1 - Getting Dataset

In [1]:
import nltk
import pandas as pd
# Make sure the Reuters corpus is available locally before importing its reader
nltk.download('reuters')
from nltk.corpus import reuters

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/umerriaz/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [2]:
# Partition the Reuters file ids into the train and test subsets,
# selected by the prefix of each id.
train_docs = list(filter(lambda fid: fid.startswith("train"), reuters.fileids()))
test_docs = list(filter(lambda fid: fid.startswith("test"), reuters.fileids()))

In [3]:
# Load the dataset: raw document text plus one label per document
# (only the first category returned for each document is kept).
training_data = list(map(reuters.raw, train_docs))
training_labels = [cats[0] for cats in map(reuters.categories, train_docs)]

testing_data = list(map(reuters.raw, test_docs))
testing_labels = [cats[0] for cats in map(reuters.categories, test_docs)]

### Merging the training and testing data

In [4]:
# Pool the train and test documents into one list (train documents first)
Complete_training_data = [*training_data, *testing_data]

In [5]:
# Pool the labels in the same order as the documents (train labels first)
Complete_training_labels = [*training_labels, *testing_labels]

In [6]:
# Sanity check: number of documents after merging
len(Complete_training_data)

10788

In [7]:
# Sanity check: number of labels after merging (should match the document count)
len(Complete_training_labels)

10788

In [8]:
# Pair every document with its label and build a two-column frame from the pairs
text_label_pairs = list(zip(Complete_training_data, Complete_training_labels))
df = pd.DataFrame(text_label_pairs, columns=['Text', 'Label'])
df

Unnamed: 0,Text,Label
0,BAHIA COCOA REVIEW\n Showers continued throug...,cocoa
1,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...,acq
2,N.Z. TRADING BANK DEPOSIT GROWTH RISES SLIGHTL...,money-supply
3,NATIONAL AMUSEMENTS AGAIN UPS VIACOM &lt;VIA> ...,acq
4,ROGERS &lt;ROG> SEES 1ST QTR NET UP SIGNIFICAN...,earn
...,...,...
10783,N.Z.'S CHASE CORP MAKES OFFER FOR ENTREGROWTH\...,acq
10784,TOKYO DEALERS SEE DOLLAR POISED TO BREACH 140 ...,dlr
10785,JAPAN/INDIA CONFERENCE CUTS GULF WAR RISK CHAR...,ship
10786,SOVIET INDUSTRIAL GROWTH/TRADE SLOWER IN 1987\...,ipi


## 1.2 Dataset Preprocessing

In [9]:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK resources used by the preprocessing step
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared preprocessing objects: stemmer, lemmatizer and the English stop-word set
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Name of the DataFrame column that holds the text to preprocess
column_name = 'Text'




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umerriaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/umerriaz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Function to preprocess each row of the specified column
def preprocess_text(row):
    """Return a copy of ``row`` whose ``column_name`` entry holds cleaned text.

    The text goes through, in order: tokenization, stop-word removal
    (case-insensitive match against ``stop_words``), lemmatization, stemming,
    and re-joining of the surviving tokens with single spaces.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row, as passed by ``DataFrame.apply(..., axis=1)``.
        It must contain the key stored in the notebook-level ``column_name``.

    Returns
    -------
    pandas.Series
        A new row with the processed text. The row that was passed in is
        left untouched.

    Notes
    -----
    Relies on the notebook-level objects ``column_name``, ``stop_words``,
    ``lemmatizer`` and ``stemmer`` being defined before it is called.
    """
    # Tokenize the text into individual words
    tokens = nltk.word_tokenize(row[column_name])

    # Remove stop words
    tokens = [tok for tok in tokens if tok.lower() not in stop_words]

    # Lemmatize, then stem, each remaining token
    tokens = [stemmer.stem(lemmatizer.lemmatize(tok)) for tok in tokens]

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    result = row.copy()
    result[column_name] = ' '.join(tokens)

    return result


In [11]:
# Apply the preprocessing function to each row of the specified column.
# NOTE: `df` is reassigned here, so re-running this cell preprocesses the
# already-preprocessed text again.
df = df.apply(preprocess_text, axis=1)

In [12]:
# Inspect the frame after preprocessing
df

Unnamed: 0,Text,Label
0,bahia cocoa review shower continu throughout w...,cocoa
1,comput termin system & lt ; cpml > complet sal...,acq
2,n.z . trade bank deposit growth rise slightli ...,money-supply
3,nation amus up viacom & lt ; via > bid viacom ...,acq
4,roger & lt ; rog > see 1st qtr net significant...,earn
...,...,...
10783,n.z . 's chase corp make offer entregrowth cha...,acq
10784,tokyo dealer see dollar pois breach 140 yen to...,dlr
10785,japan/india confer cut gulf war risk charg jap...,ship
10786,soviet industri growth/trad slower 1987 soviet...,ipi


## 1.3 Splitting the dataset into training and testing

In [13]:
from sklearn.model_selection import train_test_split

# Features are the preprocessed text; targets are the category labels
X, y = df['Text'], df['Label']

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

## 2. Feature extraction

### Using word embedding 

In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN,Dense,Embedding


In [15]:
# Keras text tokenizer with default settings
tokenizer = Tokenizer()

In [16]:
# Build the vocabulary from every document in X (the full corpus, not only X_train)
tokenizer.fit_on_texts(X)

In [17]:
# Encode each document as a list of integer token ids and record the longest one
sequences = tokenizer.texts_to_sequences(X)
max_sequence_length = max(map(len, sequences))

In [18]:
# Pad all sequences to a common length. No maxlen is passed here;
# per the Keras docs the longest sequence then sets the length — TODO confirm.
padded_sequences = pad_sequences(sequences)

In [19]:
# Number of distinct tokens plus one; presumably the extra slot is index 0,
# which the padding uses — confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1

In [20]:
# Define the Word Embedding model: an Embedding lookup followed by a Flatten
# layer, so each document becomes one long feature vector.
embedding_dim = 10
model = tf.keras.models.Sequential()
model.add(
    tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=padded_sequences.shape[1],
    )
)
model.add(tf.keras.layers.Flatten())

In [21]:
# Run every padded document through the embedding model.
# NOTE(review): `model` is not compiled or fitted anywhere above, so these
# vectors appear to come from freshly initialised weights — confirm this is intended.
word_embeddings = model.predict(padded_sequences)



In [22]:
# Convert labels to numerical format
import numpy as np

# One integer id per distinct label, numbered in the order y.unique() returns them
label_mapping = {name: idx for idx, name in enumerate(y.unique())}
# Look up the id of every label in y
numeric_labels = list(map(label_mapping.__getitem__, y))

In [23]:
# Single-column frame holding the integer label ids
dff = pd.DataFrame({'Numeric_labels': numeric_labels})

In [24]:
# Place the embedding features and the label column side by side (column-wise concat)
concatenated_data = pd.concat([pd.DataFrame(word_embeddings), dff], axis=1)

In [25]:
# Inspect the combined feature/label frame
concatenated_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22881,22882,22883,22884,22885,22886,22887,22888,22889,Numeric_labels
0,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,0.049302,0.028924,0.032746,0.030447,-0.016130,-0.020134,-0.022369,-0.006564,0.008197,0
1,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,-0.039339,-0.039835,0.046602,0.019698,0.037573,-0.037152,-0.033291,-0.034618,0.006462,1
2,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,-0.000351,0.004230,-0.012508,-0.048204,0.010063,-0.034915,0.043576,-0.019707,-0.030057,2
3,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,-0.010336,-0.023971,-0.004670,0.036734,-0.021561,-0.017622,-0.023365,-0.028015,-0.004436,1
4,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,-0.028957,0.049113,-0.037259,0.004704,0.034617,0.039967,0.034210,0.043840,0.036190,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10783,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,-0.000532,-0.040034,-0.038340,0.019956,-0.020824,-0.045036,0.003691,0.039150,0.046967,1
10784,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,-0.026717,0.022293,0.039936,-0.021248,-0.003175,0.028122,0.000609,0.042086,0.028770,10
10785,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,-0.045825,-0.046434,0.003262,0.006246,0.012565,-0.045121,0.030795,-0.020941,-0.014373,28
10786,-0.030067,-0.021527,-0.036057,0.038567,-0.035065,-0.040354,-0.035469,-0.003351,0.046508,-0.005816,...,-0.011692,0.049926,0.037447,-0.024279,-0.034484,0.001343,0.033883,0.000656,0.004335,22


## 3. Model Implementation

## 3.1 Simple RNN

In [None]:
# Tokenize the text data (the tokenizer is refit on the full corpus X)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to ensure equal length
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert labels to numerical format (one integer id per distinct label)
label_mapping = {label: i for i, label in enumerate(y.unique())}
numeric_labels = np.array([label_mapping[label] for label in y])

# Split the dataset into training and testing sets.
# This replaces the text-level X_train/X_test/y_train/y_test created earlier
# with padded integer sequences and integer labels.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, numeric_labels, test_size=0.2, random_state=42)

# Define the RNN model: Embedding -> SimpleRNN -> softmax over all labels
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 10
hidden_units = 32

model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
model1.add(SimpleRNN(hidden_units))
model1.add(Dense(len(label_mapping), activation='softmax'))

# Compile the model (integer targets, hence the sparse categorical loss)
model1.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model1.fit(X_train, y_train, epochs=2, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model.
# This must be `model1` (the RNN trained above). At this point `model` still
# refers to the un-compiled embedding extractor defined in the feature
# extraction section, not to the RNN.
loss, accuracy = model1.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

Epoch 1/2


## 3.2 LSTM

In [None]:
# Define the LSTM model
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Hyper-parameters, kept identical to the SimpleRNN cell
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 10
hidden_units = 32

# Embedding -> LSTM -> softmax over all labels.
# Note that this rebinds `model` to the LSTM classifier.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    LSTM(hidden_units),
    Dense(len(label_mapping), activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=64,
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

## 4. Model Comparison

## 4.1 Comparing the performance

**As we can observe, the LSTM model achieves higher accuracy than the RNN model, which can be attributed to several reasons:**

1. LSTM models, as a specific type of Recurrent Neural Network (RNN), can capture long-term dependencies because they incorporate an additional mechanism for retaining information over extended periods of time.

2. LSTMs have a larger parameter space than basic RNNs because of their more complex architecture. This expanded capacity enables LSTMs to capture intricate patterns in the data, potentially leading to higher accuracy.

## 4.2 Plotting the graph

In [None]:
import matplotlib.pyplot as plt

# NOTE: `model1` and `model` were already fitted in their own cells above, so
# the two calls below train them for three further epochs; the curves describe
# those additional epochs only.
fit_options = dict(epochs=3, batch_size=64, validation_data=(X_test, y_test))

# Train the RNN model and store the history
rnn_history = model1.fit(X_train, y_train, **fit_options)

# Train the LSTM model and store the history
lstm_history = model.fit(X_train, y_train, **fit_options)

# Plot the accuracy curves
accuracy_curves = [
    (rnn_history, 'accuracy', 'RNN Training Accuracy'),
    (rnn_history, 'val_accuracy', 'RNN Validation Accuracy'),
    (lstm_history, 'accuracy', 'LSTM Training Accuracy'),
    (lstm_history, 'val_accuracy', 'LSTM Validation Accuracy'),
]
plt.figure(figsize=(12, 6))
for run, metric, curve_label in accuracy_curves:
    plt.plot(run.history[metric], label=curve_label)
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plot the loss curves
loss_curves = [
    (rnn_history, 'loss', 'RNN Training Loss'),
    (rnn_history, 'val_loss', 'RNN Validation Loss'),
    (lstm_history, 'loss', 'LSTM Training Loss'),
    (lstm_history, 'val_loss', 'LSTM Validation Loss'),
]
plt.figure(figsize=(12, 6))
for run, metric, curve_label in loss_curves:
    plt.plot(run.history[metric], label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()