Importing my libraries

In [69]:
#!python -m spacy download en_core_web_md
#!python -m spacy download fr_core_news_md
#!pip install cupy

import tarfile
import numpy as np
import spacy
import cupy as cp
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

#Creating helper functions to process the data

In [None]:


def load_data(file_path):
    with open(file_path, encoding='utf-8') as file:
        lines = [line.strip().lower() for line in file if line.strip()]
    return lines

def preprocess(lang_sentences, percentages, lang_models, device="CPU"):
    # in this preprocess function the data is added such that
    # there is a small randomness added to it for training in the GANS
    # i dont know why i added this but to test it might work like a bias value
    data = []
    if device == "GPU":
      for sentence_0, sentence_1 in zip(*lang_sentences):
          embedded_0 = lang_models[0](sentence_0).vector
          embedded_1 = lang_models[1](sentence_1).vector
          max_len = max(len(embedded_0), len(embedded_1))
          # this is the gpu based code
          pad_embedded_0 = cp.pad(cp.asarray(embedded_0), (0, max_len - len(embedded_0)), 'constant')
          pad_embedded_1 = cp.pad(cp.asarray(embedded_1), (0, max_len - len(embedded_1)), 'constant')
          random_mat = cp.random.rand(*embedded_0.shape)
          full = pad_embedded_0 * percentages[0] + pad_embedded_1 * percentages[1] + random_mat
          data.append(full)
      return cp.array(data)
    else:
      # this is the cpu based code
      for sentence_0, sentence_1 in zip(*lang_sentences):
          embedded_0 = lang_models[0](sentence_0).vector
          embedded_1 = lang_models[1](sentence_1).vector
          max_len = max(len(embedded_0), len(embedded_1))
          # this is the cpu based code
          pad_embedded_0 = np.pad(embedded_0, (0, max_len - len(embedded_0)), 'constant')
          pad_embedded_1 = np.pad(embedded_1, (0, max_len - len(embedded_1)), 'constant')
          random_mat = np.random.rand(*embedded_0.shape)
          full = pad_embedded_0 * percentages[0] + pad_embedded_1 * percentages[1] + random_mat
          data.append(full)
      return np.array(data)

#PreProccessing and creating data

In [None]:

# Load English and French data
english_sentences = load_data('europarl-v7.fr-en.en')
french_sentences = load_data('europarl-v7.fr-en.fr')
print(len(english_sentences))
# load the spacy word embeddings for french and english
spacy.require_gpu()
spacy_embedding_en = spacy.load('en_core_web_md')
spacy_embedding_fr = spacy.load('fr_core_news_md')

# **mish mash** with 0.5 and 0.5 percentage points
mashed_sentences = preprocess([english_sentences, french_sentences],
                          [0.5, 0.5],
                          [spacy_embedding_en, spacy_embedding_fr],
                          "GPU")
np.save('mashed_sentences_fr_en_50_50.npy', mashed_sentences) # saving data so doesnt haev to be loaded again
# printing to see what the data looks like
print(english_sentences[0])
print(french_sentences[0])
print(mashed_sentences[0])

392744


KeyboardInterrupt: 

#Here after processing the data it can be loaded with pytorch data loader object

In [129]:
from torch.utils.data import Dataset, DataLoader

class MashedDataset(Dataset):
  def __init__(self, data):
        self.data = data

  def __len__(self):
      return len(self.data)

  def __getitem__(self, idx):
      return self.data[idx]

mashed_sentences = np.load('mashed_sentences_fr_en_50_50_random_bias.npy')
print(mashed_sentences.shape)

mashed_sentences = mashed_sentences[:10000]
# only using first 100
print(mashed_sentences.shape)

mashed_sentences_dataset = MashedDataset(torch.from_numpy(mashed_sentences))
# make sure to from numpy it

batch_size = 32
mashed_sentences_data_loader = DataLoader(mashed_sentences_dataset,
                                   batch_size=batch_size,
                                   shuffle=True,
                                    drop_last=True)

(36172, 300)
(10000, 300)


#Here is the onehot outputs for characters and their encodings

In [119]:
one_hot_characters = ['a', 'à','â', 'æ', 'b', 'c', 'ç', 'd', 'e', 'é', 'è', 'ê',
                      'ë', 'œ', 'f', 'g', 'h', 'i', 'î', 'ï', 'j', 'k', 'l',
                      'm', 'n', 'o', 'ô', 'p', 'q', 'r', 's', 't', 'u', 'ù','û',
                      'ü', 'v', 'w', 'x', 'y', 'ÿ' 'z', "'", ' ']
                      # space character and ' included
# this is both english and french characters discluding the overlap
# capitals are **banned** and arent used
# helper dictionaries for conversions
char_to_index = {char: index for index, char in enumerate(one_hot_characters)}
index_to_char = {index: char for index, char in enumerate(one_hot_characters)}


#Generator Architecture

The way this works is it will take in some length vector and then from it it will create the one hot matrix which represents the generated mish mashed sentence


In [120]:
class Generator(nn.Module):
    def __init__(self, input_size=100, seq_length=60):
        super(Generator, self).__init__()
        self.seq_length = seq_length
        self.gru = nn.GRU(input_size, 256, num_layers=1, batch_first=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(256, len(one_hot_characters))
        self.softmax = nn.Softmax(dim=2)

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.GRU):
                nn.init.xavier_uniform_(m.weight_ih_l0)
                nn.init.xavier_uniform_(m.weight_hh_l0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, x, temperature=1.0):
    # Repeat the input noise vector seq_length times to create a sequence
      x = x.repeat(1, self.seq_length, 1)
      out, _ = self.gru(x)  # Only take the output, ignore the hidden state
      out = self.relu(out)
      out = self.linear(out)
      out = out / temperature  # Apply the temperature parameter
      out = self.softmax(out)
      return out


#Call the generator to see what it outputs untrained

In [122]:
def generate_sequence(generator, noise):
    # Initialize an empty list to store the sequences
    sequences = []
    # Iterate over the batch dimension of the noise tensor
    for i in range(noise.size(0)):
        # Generate a sequence of one-hot vectors for each noise vector
        one_hot_sequence = generator(noise[i].unsqueeze(0).to('cuda'), 1)
        # Convert the one-hot vectors to character indices
        char_indices = torch.argmax(one_hot_sequence, dim=2)
        # Convert the character indices to characters
        sequence = ''.join(index_to_char[index.item()] for index in char_indices[0])
        sequences.append(sequence)
    return sequences

# this is just for testing not important
noise = torch.randn(1, 100)
generator = Generator().to('cuda')
sequence = generate_sequence(generator, noise)
print(sequence)

# ill embed this into english and then french and combine it to see what should
# happen from it
spacy.require_gpu()
spacy_embedding_en = spacy.load('en_core_web_md')
spacy_embedding_fr = spacy.load('fr_core_news_md')

def sequence_to_mash_embed(sequences, DEVICE="cuda"):
    # Initialize an empty list to store the embeddings
    mashed_embeddings = []

    # Use the pipe method to process the sequences as a stream
    for doc_fr, doc_en in zip(spacy_embedding_fr.pipe(sequences), spacy_embedding_en.pipe(sequences)):
        sequence_fr_embedding = doc_fr.vector
        sequence_en_embedding = doc_en.vector
        mashed_embedding = sequence_fr_embedding * 0.5 + sequence_en_embedding * 0.5
        # Add an extra dimension to make it 2D
        mashed_embedding_2d = np.expand_dims(mashed_embedding, axis=0)
        mashed_embeddings.append(mashed_embedding_2d)

    # Convert the list of embeddings to a 3D tensor
    mashed_embeddings_tensor = torch.stack([torch.tensor(embedding).float().to(DEVICE) for embedding in mashed_embeddings])
    return mashed_embeddings_tensor

mashed_embedding = sequence_to_mash_embed(sequence, "cpu")
print(mashed_embedding) # this might work
# otherwise some other embedding scheme needs to be defined for the mashed language


['xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx']
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.,

#Discriminator Architecture

This has to figure out if something is real or fake

In [112]:
class Discriminator(nn.Module):
    def __init__(self, input_size=300, hidden_size=128):
        super(Discriminator, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers=1, batch_first=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.gru(x)  # Only take the output, ignore the hidden state
        out = self.relu(out)
        out = self.linear(out[:, -1, :])  # Only take the last output of the sequence
        out = self.sigmoid(out)
        return out

#Testing the untrained discriminator on the previous embeddings

In [123]:
# Convert the numpy array to a PyTorch tensor

# Add an extra dimension for the batch size and sequence length if necessary
if len(mashed_embedding.shape) == 1:
    mashed_embedding = mashed_embedding.view(1, 1, -1)

print(mashed_embedding.shape)
# Instantiate the discriminator
discriminator = Discriminator(input_size=mashed_embedding.shape[-1])

# Pass the embeddings through the discriminator
prob = discriminator(mashed_embedding)

print(prob.item())


torch.Size([1, 1, 300])
0.49309638142585754


#Training Setup

In [130]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)
D = Discriminator().to(DEVICE)
G = Generator(seq_length=32).to(DEVICE)
# shoving hopefully everything to the gpu

max_epoch = 50 # going for 50 epochs
step = 0 # step through the data
n_noise = 100 # size of noise vector

criterion = nn.BCELoss()
D_opt = torch.optim.Adam(D.parameters(), lr=10e-4, betas=(0.5, 0.999))
G_opt = torch.optim.Adam(G.parameters(), lr=10e-4, betas=(0.5, 0.999))
# optimizers for both the discriminator and generator alongside a
# binary cross entropy loss

# We will denote real images as 1s and fake images as 0s
# This is why we needed to drop the last batch of the data loader
D_labels = torch.ones([batch_size, 1]).to(DEVICE) # Discriminator label: real
D_fakes = torch.zeros([batch_size, 1]).to(DEVICE) # Discriminator Label: fake

cuda


#Training Loop

In [132]:
import matplotlib.pyplot as plt
import time
# import pyplot to plot images
from google.colab import drive
drive.mount('/content/gdrive')

start_time = time.time()
for epoch in range(max_epoch):
    for idx, word_embeddings in enumerate(mashed_sentences_data_loader):
        # Training Discriminator
        x = word_embeddings.float().to(DEVICE)
        # rehsaping x to have the x dimension in it for the vectores
        x = x.view(batch_size, 1, 300)
        x_outputs = D(x) # input includes labels
        D_x_loss = criterion(x_outputs, D_labels) # Discriminator loss for real images

        z = torch.randn(batch_size, n_noise).to(DEVICE)

        # the generator outputs a sequence that sequence must then be converted into
        # embeddings that are passsed to the dsicriminator
         # the generator and the randomness to make a sequence
        sequences = generate_sequence(G, z)
        z_outputs = D(sequence_to_mash_embed(sequences, DEVICE)) # input to both generator and discriminator includes labels
        D_z_loss = criterion(z_outputs, D_fakes) # Discriminator loss for fake images
        D_loss = D_x_loss + D_z_loss # Total Discriminator loss

        D.zero_grad()
        D_loss.backward()
        D_opt.step()
        # updating the discriminator model

        # Training Generator
        z = torch.randn(batch_size, n_noise).to(DEVICE) # creating the random vector alongside the batch proper
        train_sequences = generate_sequence(G, z)
        z_outputs = D(sequence_to_mash_embed(train_sequences, DEVICE))
        G_loss = -1 * criterion(z_outputs, D_fakes) # Generator loss is negative disciminator loss

        G.zero_grad()
        G_loss.backward()
        G_opt.step()
        # updating the generator model

        if step % 500 == 0:
            print('Epoch: {}/{}, Step: {}, D Loss: {}, G Loss: {} time: '.format(epoch, max_epoch, step, D_loss.item(), G_loss.item(), time.time() - start_time))
            # done to view teh loss
        step += 1

    if epoch+1 in [1, 5, 10, 15, 20, 25, 30, 50]:
      # if in the 1st (done for making sure everything is good)
      # or the 10th or 30th or 50th epoch then display what the
      # generator has so far
      model_save_name = 'discriminator.pt'
      path = F"/content/gdrive/My Drive/{model_save_name}"
      torch.save(D.state_dict(), path)

      model_save_name = 'generator.pt'
      path = F"/content/gdrive/My Drive/{model_save_name}"
      torch.save(G.state_dict(), path)

      print(f"on epoch {epoch + 1}")
      noise = torch.randn(1, 100).to(DEVICE)
      G.eval()  # eval mode
      sequences = generate_sequence(generator, noise)
      print(sequences)
      # show the plot from get sample images
      G.train()
      # back to trianing the genrator



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
on epoch 1
['ÿzÿzÿzÿzpppppppppppppppppppppppppppppppppppppppppppppppppppppppp']
Epoch: 1/50, Step: 1000, D Loss: 4.247164270054782e-06, G Loss: -1.0334605393680363e-10 time: 
Epoch: 2/50, Step: 1500, D Loss: 1.3800340070702077e-07, G Loss: -1.6729084784117276e-10 time: 
Epoch: 4/50, Step: 2000, D Loss: 5.590388951759451e-08, G Loss: -2.5562990960126264e-11 time: 
on epoch 5
['kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk']
Epoch: 5/50, Step: 2500, D Loss: 1.1203083793986934e-08, G Loss: -2.8842159563446934e-11 time: 
Epoch: 7/50, Step: 3000, D Loss: 1.865594789762781e-08, G Loss: -2.77068489679122e-11 time: 
Epoch: 9/50, Step: 3500, D Loss: 3.696201678060618e-11, G Loss: -3.143474369693422e-11 time: 
on epoch 10
['jjjjüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüüü']
Epoch: 10/50, Step: 4000, D Loss: 2.825482543741897e-17, G Loss: -5

KeyboardInterrupt: 