<a href="https://colab.research.google.com/github/RodrigoRoman/ml_ai_portafolio/blob/main/long_short_term_memory/LSTM_with_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>LSTM</h1>

In [1]:
import numpy as np
import random

In [2]:
from google.colab import drive

In [3]:
# Location of the raw news dump on Google Drive (the same path is assigned again in the data-loading cell).
news_s_path = "/content/drive/MyDrive/newsSpace"

<h2>Data preprocessing functions</h2>

In [4]:
import re
def is_url(s):
    """Return True when ``s`` starts with ``http://`` or ``https://``.

    Only the scheme prefix is checked; the rest of the string is not validated.
    """
    scheme_match = re.match(r'https?://', s)
    return scheme_match is not None
def tokenize_article(line):
  """Tokenize everything that follows the first URL field of ``line``.

  ``line`` is a sequence of string fields. The fields after the first one
  recognised by ``is_url`` are joined with single spaces and split on runs of
  spaces and the punctuation characters ``,.;:!?()``.

  Returns the list of tokens, or None when no field is a URL.
  """
  for position, field in enumerate(line):
    if is_url(field):
      remainder = ' '.join(line[position + 1:])
      return re.split(r'[ ,.;:!?()]+', remainder)
  return None


def process_file(filepath, num_articles):
  """Extract up to ``num_articles`` tokenized articles from a news dump.

  The file is read as ISO-8859-1. An article is the text found between a
  ``(Reuters)`` or ``(AP)`` tag and a following ``<digits>`` field and
  ``YYYY-MM-DD`` date, with the fields separated by tabs/newlines. Each
  article is lower-cased and reduced to alphabetic words (one optional
  apostrophe allowed inside a word).

  Args:
    filepath: path of the dump file.
    num_articles: maximum number of articles to return.

  Returns:
    ``(articles, vocabulary)`` where ``articles`` is a list of word lists and
    ``vocabulary`` is the set of all words in them. ``([], set())`` is returned
    (after printing a message) when the file cannot be opened or read.
  """
  article_pattern = re.compile(r"\((Reuters|AP)\)[\t\n]+(.*?)[\t\n]+\d+[\t\n]+[0-9]{4}-[0-9]{2}-[0-9]{2}", re.DOTALL)
  word_pattern = re.compile(r"\b[A-Za-z]+'?[A-Za-z]*(?=\s|\b)")

  # Only the read can raise IOError, so the handle is released before parsing.
  try:
    with open(filepath, encoding='ISO-8859-1') as file:
      data = file.read()
  except IOError:
    print("Error opening or reading the file")
    return [], set()

  raw_articles = article_pattern.findall(data)
  print("amount of articles")
  print(len(raw_articles))

  articles = []
  vocabulary = set()
  # findall yields (source_tag, body) tuples; only the body is tokenized.
  for _source_tag, body in raw_articles:
    if len(articles) >= num_articles:
      # Quota reached: stop instead of walking the remaining matches.
      break
    words = word_pattern.findall(body.strip().lower())
    articles.append(words)
    vocabulary.update(words)
  return articles, vocabulary

In [5]:
# Mount Google Drive at /content/drive; the dataset path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<h2>Data loading</h2>

In [6]:
# Keep the experiment small: process_file stops collecting after this many articles.
num_articles = 10
news_s_path = "/content/drive/MyDrive/newsSpace"

# data_articles: list of word lists (one per article); vocabulary: set of every word seen in them.
data_articles, vocabulary = process_file(news_s_path, num_articles)
print(data_articles)

amount of articles
57469
[['none', 'business', 'reuters', 'wall', "street's", 'long', 'playing', 'drama', 'waiting', 'for', 'google', 'is', 'about', 'to', 'reach', 'its', 'final', 'act', 'but', 'its', 'stock', 'market', 'debut', 'is', 'ending', 'up', 'as', 'more', 'of', 'a', 'nostalgia', 'event', 'than', 'the', 'catalyst', 'for', 'a', 'new', 'era'], ['none', 'business', 'reuters', 'short', 'sellers', 'wall', "street's", 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], ['none', 'business', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'group', 'which', 'has', 'a', 'reputation', 'for', 'making', 'well', 'timed', 'and', 'occasionally', 'controversial', 'plays', 'in', 'the', 'defense', 'industry', 'has', 'quietly', 'placed', 'its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market'], ['none', 'business', 'reuters', 'soaring', 'crude', 'prices', 'plus', 'worries', 'about', 'the', 'economy', 'and', 'the', 'outlook', 'for', 'earnings', 'are', 'exp

<h3>Split data into train and test</h3>
<p>Split the data into training and test sets, and derive a target sequence for each article in which the target at every position is the next word.</p>

In [7]:

# Preprocess articles to create input-target pairs
def create_input_target(articles):
  """Build the next-word target sequence for every article.

  For an article ``[w0, w1, ..., wn]`` the target sequence is
  ``[w1, ..., wn]``, i.e. the article shifted left by one position. Only the
  targets are returned; the inputs are the articles themselves.

  Returns a list with one target list per article (empty for articles with
  fewer than two words).
  """
  return [list(article[1:]) for article in articles]

# Split data into training and test sets
def split_data(data, test_percentage):
  """Split ``data`` into ``(training_set, test_set)`` without shuffling.

  The first ``int(len(data) * test_percentage)`` items form the test set and
  the remaining items form the training set.
  """
  cut = int(len(data) * test_percentage)
  return data[cut:], data[:cut]


# Example usage
test_percentage = 0.2

# Next-word targets: each article shifted left by one token
article_targets  = create_input_target(data_articles)
print("Input target pairs")
print(article_targets)

# Vocabulary lookups.
# The words are sorted first so that every word keeps the same index on every
# kernel run; iterating the raw set would follow string hashing, which Python
# randomizes per process, making the indices (and any saved weights) irreproducible.
vocab_words = sorted(vocabulary)
word_to_idx = {word: idx for idx, word in enumerate(vocab_words)}
idx_to_word = {idx: word for idx, word in enumerate(vocab_words)}

# Take input-target as x and y (both are split at the same position, so they stay aligned)
x_train_words, x_test_words = split_data(data_articles,test_percentage)
y_train_words, y_test_words = split_data(article_targets,test_percentage)

# Change the data to their index versions
x_train = [[word_to_idx[word] for word in article if word in word_to_idx] for article in x_train_words]
y_train = [[word_to_idx[word] for word in article if word in word_to_idx] for article in y_train_words]
x_test = [[word_to_idx[word] for word in article if word in word_to_idx] for article in x_test_words]
y_test = [[word_to_idx[word] for word in article if word in word_to_idx] for article in y_test_words]

Input target pairs
[['business', 'reuters', 'wall', "street's", 'long', 'playing', 'drama', 'waiting', 'for', 'google', 'is', 'about', 'to', 'reach', 'its', 'final', 'act', 'but', 'its', 'stock', 'market', 'debut', 'is', 'ending', 'up', 'as', 'more', 'of', 'a', 'nostalgia', 'event', 'than', 'the', 'catalyst', 'for', 'a', 'new', 'era'], ['business', 'reuters', 'short', 'sellers', 'wall', "street's", 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], ['business', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'group', 'which', 'has', 'a', 'reputation', 'for', 'making', 'well', 'timed', 'and', 'occasionally', 'controversial', 'plays', 'in', 'the', 'defense', 'industry', 'has', 'quietly', 'placed', 'its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market'], ['business', 'reuters', 'soaring', 'crude', 'prices', 'plus', 'worries', 'about', 'the', 'economy', 'and', 'the', 'outlook', 'for', 'earnings', 'are', 'expected', 'to', 'hang', 'over', 'the', '

<h2>LSTM</h2>

In [10]:
import numpy as np
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise.

    The previous body returned ``f * (1 - f)``, which is the derivative of the
    logistic function, not the function itself.

    The value is computed through the identity
    ``sigmoid(x) = 0.5 * (1 + tanh(x / 2))`` so that large ``|x|`` saturates to
    0 or 1 without overflowing ``exp``.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(x, dtype=float)))

class LSTM:
    """Single-layer LSTM next-token model trained with plain SGD, in NumPy.

    Every time step takes a one-hot column vector of length ``vocab_size`` and
    produces a softmax distribution over the vocabulary.

    ``forward`` returns one cache tuple per time step, laid out as
    ``(h, c, f, i, o, c_bar, z, y, p, c_prev)``:
    hidden state, cell state, forget/input/output gate activations, candidate
    cell, stacked ``[h_prev; x]`` input, logits, softmax probabilities and the
    cell state that entered the step.
    """

    # Position of the softmax probabilities ``p`` inside a cache tuple.
    _PROBS = 8

    def xavier_init(self, fan_in, fan_out):
        """Return a ``(fan_in, fan_out)`` matrix sampled uniformly from
        ``[-limit, limit]`` with ``limit = sqrt(6 / (fan_in + fan_out))``."""
        limit = np.sqrt(6 / (fan_in + fan_out))
        return np.random.uniform(-limit, limit, (fan_in, fan_out))

    def __init__(self, hidden_size, vocab_size, learning_rate):
        """Create the parameters.

        Args:
            hidden_size: length of the hidden and cell state vectors.
            vocab_size: number of distinct tokens (input and output width).
            learning_rate: SGD step size used by ``update_parameters``.
        """
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.learning_rate = learning_rate

        # Gates read the previous hidden state stacked on top of the one-hot input.
        input_size = hidden_size + vocab_size

        # Gate / candidate weights have shape (hidden_size, hidden_size + vocab_size).
        self.Wf = self.xavier_init(hidden_size, input_size)  # forget gate
        self.Wi = self.xavier_init(hidden_size, input_size)  # input gate
        self.Wo = self.xavier_init(hidden_size, input_size)  # output gate
        self.Wc = self.xavier_init(hidden_size, input_size)  # candidate cell
        self.Wy = self.xavier_init(vocab_size, hidden_size)  # hidden -> logits
        self.bf = np.zeros((hidden_size, 1))
        self.bi = np.zeros((hidden_size, 1))
        self.bo = np.zeros((hidden_size, 1))
        self.bc = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))

    @staticmethod
    def _sigmoid(x):
        """Logistic function 1 / (1 + exp(-x)).

        Kept inside the class because the backward pass differentiates the
        gates as ``g * (1 - g)``, which is only valid when the gate really is
        the logistic value. The tanh identity avoids overflow for large |x|.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * x))

    def softmax(self, x):
        """Column-wise softmax; the maximum is subtracted first for stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def cross_entropy(self, probs, targets):
        """Summed negative log-likelihood of ``targets``.

        Args:
            probs: per-step probability vectors; ``probs[t]`` is indexed by token id.
            targets: token ids; only the first ``len(targets)`` steps are scored.

        Returns:
            The summed loss as a float. Probabilities are floored at 1e-9
            before taking the logarithm.
        """
        loss = 0.0
        epsilon = 1e-9
        for t in range(len(targets)):
            # ravel() lets both (vocab, 1) columns and flat vectors be indexed by token id.
            prob = max(float(np.ravel(probs[t])[targets[t]]), epsilon)
            loss += -np.log(prob)
        return loss

    def forward(self, inputs, h_prev, c_prev):
        """Run the LSTM over ``inputs`` starting from ``(h_prev, c_prev)``.

        Args:
            inputs: sequence of one-hot column vectors, one per time step.
            h_prev: initial hidden state, shape ``(hidden_size, 1)``.
            c_prev: initial cell state, shape ``(hidden_size, 1)``.

        Returns:
            A list with one cache tuple per step (layout in the class docstring).
        """
        caches = []
        h_t, c_t = h_prev, c_prev
        for x_t in inputs:
            # The backward pass needs the cell state that ENTERED this step
            # (d c_t / d f_t = c_{t-1}), so capture it before it is overwritten.
            c_in = c_t
            z = np.vstack((h_t, x_t))
            forget_gate = self._sigmoid(np.dot(self.Wf, z) + self.bf)
            input_gate = self._sigmoid(np.dot(self.Wi, z) + self.bi)
            output_gate = self._sigmoid(np.dot(self.Wo, z) + self.bo)
            c_bar = np.tanh(np.dot(self.Wc, z) + self.bc)
            c_t = forget_gate * c_in + input_gate * c_bar
            h_t = output_gate * np.tanh(c_t)
            y = np.dot(self.Wy, h_t) + self.by
            p = self.softmax(y)
            caches.append((h_t, c_t, forget_gate, input_gate, output_gate, c_bar, z, y, p, c_in))
        return caches

    def backward(self, caches, targets):
        """Backpropagation through time for the summed cross-entropy loss.

        Only the first ``len(targets)`` steps contribute to the loss; later
        cached steps are ignored.

        Returns:
            ``(dWf, dWi, dWo, dWc, dWy, dbf, dbi, dbo, dbc, dby)``.
        """
        dWf, dWi, dWo, dWc, dWy = (np.zeros_like(self.Wf), np.zeros_like(self.Wi),
                                   np.zeros_like(self.Wo), np.zeros_like(self.Wc),
                                   np.zeros_like(self.Wy))
        dbf, dbi, dbo, dbc, dby = (np.zeros_like(self.bf), np.zeros_like(self.bi),
                                   np.zeros_like(self.bo), np.zeros_like(self.bc),
                                   np.zeros_like(self.by))
        if caches:
            dh_next, dc_next = np.zeros_like(caches[0][0]), np.zeros_like(caches[0][1])
        else:
            # Nothing was run forward; keep well-formed zero gradients.
            dh_next = np.zeros((self.hidden_size, 1))
            dc_next = np.zeros((self.hidden_size, 1))

        for t in reversed(range(len(targets))):
            (dh_next, dc_next,
             dWf_t, dWi_t, dWo_t, dWc_t,
             dbf_t, dbi_t, dbo_t, dbc_t,
             dWy_t, dby_t) = self.lstm_step_backward(dh_next, dc_next, caches[t], targets[t])
            dWf += dWf_t
            dWi += dWi_t
            dWo += dWo_t
            dWc += dWc_t
            dbf += dbf_t
            dbi += dbi_t
            dbo += dbo_t
            dbc += dbc_t
            dWy += dWy_t
            dby += dby_t
        return dWf, dWi, dWo, dWc, dWy, dbf, dbi, dbo, dbc, dby

    def lstm_step_backward(self, dh_next, dc_next, cache, target):
        """Gradients of one time step.

        Args:
            dh_next: gradient flowing into this step's hidden state from step t+1.
            dc_next: gradient flowing into this step's cell state from step t+1.
            cache: the tuple stored by ``forward`` for this step.
            target: id of the correct next token for this step.

        Returns:
            ``(dh_prev, dc_prev, dWf, dWi, dWo, dWc, dbf, dbi, dbo, dbc, dWy, dby)``.
        """
        h_next, c_next, f, i, o, c_bar, z, y, p, c_prev = cache

        # Softmax + cross-entropy: d loss / d logits = p - onehot(target)
        dy = np.copy(p)
        dy[target] -= 1

        dWy = np.dot(dy, h_next.T)
        dby = dy

        # Hidden state receives gradient from the logits and from the next step.
        dh = np.dot(self.Wy.T, dy) + dh_next

        tanh_c = np.tanh(c_next)

        # Output gate (pre-activation gradient)
        do = dh * tanh_c * o * (1 - o)

        # Cell state: through h = o * tanh(c) and through the next step's forget path
        dc = dh * o * (1 - tanh_c ** 2) + dc_next

        # Input gate, candidate cell and forget gate (pre-activation gradients)
        di = dc * c_bar * i * (1 - i)
        dc_bar = dc * i * (1 - c_bar ** 2)
        df = dc * c_prev * f * (1 - f)

        # Gradient w.r.t. the stacked input z = [h_prev; x]
        dz = (np.dot(self.Wf.T, df)
              + np.dot(self.Wi.T, di)
              + np.dot(self.Wo.T, do)
              + np.dot(self.Wc.T, dc_bar))

        dWf = np.dot(df, z.T)
        dWi = np.dot(di, z.T)
        dWo = np.dot(do, z.T)
        dWc = np.dot(dc_bar, z.T)
        dbf = df
        dbi = di
        dbo = do
        dbc = dc_bar

        # The top hidden_size rows of z are h_prev.
        dh_prev = dz[:self.hidden_size, :]
        dc_prev = f * dc

        return dh_prev, dc_prev, dWf, dWi, dWo, dWc, dbf, dbi, dbo, dbc, dWy, dby

    def update_parameters(self, dWf, dWi, dWo, dWc, dWy, dbf, dbi, dbo, dbc, dby):
        """Apply one in-place SGD step: ``param -= learning_rate * grad``."""
        self.Wf -= self.learning_rate * dWf
        self.Wi -= self.learning_rate * dWi
        self.Wo -= self.learning_rate * dWo
        self.Wc -= self.learning_rate * dWc
        self.Wy -= self.learning_rate * dWy
        self.bf -= self.learning_rate * dbf
        self.bi -= self.learning_rate * dbi
        self.bo -= self.learning_rate * dbo
        self.bc -= self.learning_rate * dbc
        self.by -= self.learning_rate * dby

    def train(self, x_train, y_train, epochs, index_to_word=None):
        """Train with one SGD update per sequence and print progress.

        Args:
            x_train: list of token-id sequences.
            y_train: list of next-token-id sequences aligned with ``x_train``
                (``y_train[k][t]`` is the token expected after ``x_train[k][t]``).
            epochs: number of passes over ``x_train``.
            index_to_word: optional mapping from token id to word, used only for
                the sample printout. When omitted, a module-level ``idx_to_word``
                is used if one exists; otherwise raw ids are printed.

        The hidden and cell states are reset at the start of every epoch and
        carried over from one sequence to the next inside an epoch.
        """
        if index_to_word is None:
            index_to_word = globals().get("idx_to_word")

        def decode(indices):
            if index_to_word is None:
                return [int(idx) for idx in indices]
            return [index_to_word[idx] for idx in indices]

        # Built once; column k is the one-hot vector of token k.
        identity = np.eye(self.vocab_size)

        print("inside train")
        print(len(x_train))
        for epoch in range(epochs):
            print("Inside epoch")
            h_prev = np.zeros((self.hidden_size, 1))
            c_prev = np.zeros((self.hidden_size, 1))
            total_loss = 0.0
            correct_predictions = 0
            total_predictions = 0
            # Sequence whose predictions are printed this epoch. randrange covers
            # index 0 and also works when there is a single sequence.
            rand_print = random.randrange(len(x_train)) if len(x_train) > 0 else None

            for batch_idx, inputs in enumerate(x_train):
                print("inside batch")
                if len(inputs) == 0:
                    # Nothing to run forward; skip instead of indexing an empty cache list.
                    continue
                targets = y_train[batch_idx]

                # Convert inputs to one-hot encoded column vectors
                inputs_one_hot = [identity[:, x].reshape(-1, 1) for x in inputs]

                # Forward pass
                caches = self.forward(inputs_one_hot, h_prev, c_prev)

                # Backward pass and parameter update
                gradients = self.backward(caches, targets)
                self.update_parameters(*gradients)

                # Carry the final hidden and cell states into the next sequence
                h_prev, c_prev = caches[-1][0], caches[-1][1]

                # Predictions and loss come from the softmax output of each step
                # (cache index _PROBS), not from the last tuple entry, which is c_prev.
                probs = [cache[self._PROBS] for cache in caches]
                predictions = [int(np.argmax(p)) for p in probs][:len(targets)]
                for t, prediction in enumerate(predictions):
                    correct_predictions += int(prediction == targets[t])
                    total_predictions += 1

                loss = self.cross_entropy(probs, targets)
                total_loss += loss

                if batch_idx == rand_print:
                    print("Input sentence:")
                    print(decode(inputs))
                    print("Predicted:")
                    print(decode(predictions))
                    print("Amount of correct predictions")
                    print(correct_predictions)
                    accuracy = correct_predictions / total_predictions if total_predictions else 0.0
                    print("Accuracy -> ", accuracy)
                    print("Loss -> ", loss)

            # Print epoch statistics
            epoch_loss = total_loss / len(x_train) if len(x_train) > 0 else 0.0
            epoch_accuracy = correct_predictions / total_predictions if total_predictions else 0.0
            print(f'Epoch {epoch}, Loss: {epoch_loss}, Accuracy: {epoch_accuracy}')

<h2>Training process</h2>

In [11]:
epochs = 80

# One input/output unit per distinct token collected during preprocessing
vocabulary_size = len(vocabulary)
hidden_size = 500
# LSTM.train performs one parameter update per article and prints loss/accuracy after each epoch
lstm = LSTM(hidden_size=hidden_size, vocab_size=vocabulary_size,learning_rate=0.001)
lstm.train(x_train,y_train,epochs)

inside train
8
Inside epoch
inside batch
lengths --
predictions
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
targets
[5, 182, 49, 102, 31, 119, 74, 206, 10, 193, 95, 62, 103, 176, 59, 194, 154, 11, 195, 151, 73, 0, 152, 10, 47, 20, 70, 57, 199, 113, 143, 135, 73, 187]
vocabulary length
216
inside batch
lengths --
predictions
[215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215]
targets
[5, 182, 215, 1, 7, 18, 92, 25, 73, 66, 194, 73, 141, 62, 144, 117, 114, 191, 101, 99, 73, 165, 187, 96, 211, 21, 73, 163, 135, 73, 204, 155]
vocabulary length
216
inside batch
lengths --
predictions
[215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215]
targets
[5, 182, 12, 112, 17, 85, 32, 197, 82, 73, 174, 115, 151, 192, 87, 13, 120, 210, 193, 1

KeyboardInterrupt: ignored