In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.stats import skew, kurtosis

In [25]:
df = pd.read_csv('path')

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def preprocess_text(text):
    return text.lower().strip()

train_df['Processed_Review'] = train_df['Review'].apply(preprocess_text)
test_df['Processed_Review'] = test_df['Review'].apply(preprocess_text)

rating_min = train_df['Rating'].min()
rating_max = train_df['Rating'].max()
train_df['Normalized_Rating'] = (train_df['Rating'] - rating_min) / (rating_max - rating_min)
test_df['Normalized_Rating'] = (test_df['Rating'] - rating_min) / (rating_max - rating_min)

# loading the pre-trained  model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# encode text using the transformer and maintain the output shape
def encode_text(text, max_length=128):
    inputs = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()  # MAKIGN SURE TO MAINTAIN THE SHAPE (768)

# encoding the preprocessed reviews
train_df['Encoded_Review'] = train_df['Processed_Review'].apply(lambda x: encode_text(x))
test_df['Encoded_Review'] = test_df['Processed_Review'].apply(lambda x: encode_text(x))

print("no errors (yet)")


no errors (yet)


In [27]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# adding an attention layer to improve the performance of the model
class AttentionLayer(layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        et = tf.squeeze(tf.tanh(tf.matmul(x, self.W) + self.b), axis=-1)
        at = tf.nn.softmax(et)
        at = tf.expand_dims(at, axis=-1)
        output = x * at
        return tf.reduce_sum(output, axis=1)

# creating the multimodal class that combines the numerical and text data with modalitiy specific encoders and decoders
class MultimodalVAE(keras.Model):
    def __init__(self, numerical_dim, text_dim, latent_dim):
        super(MultimodalVAE, self).__init__()
        self.numerical_encoder = self.build_numerical_encoder(numerical_dim, latent_dim)
        self.text_encoder = self.build_text_encoder(text_dim, latent_dim)
        self.numerical_decoder = self.build_numerical_decoder(latent_dim, numerical_dim)
        self.text_decoder = self.build_text_decoder(latent_dim, text_dim)
        self.sampling = layers.Lambda(self.sample, output_shape=(latent_dim,))
        self.latent_dim = latent_dim

    def build_numerical_encoder(self, numerical_dim, latent_dim):
        inputs = keras.Input(shape=(numerical_dim,), dtype=tf.float32)
        x = layers.Dense(64, activation="relu")(inputs)
        x = layers.Dense(32, activation="relu")(x)
        z_mean = layers.Dense(latent_dim)(x)
        z_log_var = layers.Dense(latent_dim)(x)
        return keras.Model(inputs, [z_mean, z_log_var])

    def build_text_encoder(self, text_dim, latent_dim):
        inputs = keras.Input(shape=(text_dim,), dtype=tf.float32)
        x = layers.Dense(128, activation="relu")(inputs)
        x = layers.Reshape((128, -1))(x)
        x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
        x = layers.GlobalAveragePooling1D()(x)
        x = layers.Dense(32, activation="relu")(x)
        z_mean = layers.Dense(latent_dim)(x)
        z_log_var = layers.Dense(latent_dim)(x)
        return keras.Model(inputs, [z_mean, z_log_var])

    def build_numerical_decoder(self, latent_dim, numerical_dim):
        inputs = keras.Input(shape=(latent_dim,))
        x = layers.Dense(32, activation="relu")(inputs)
        x = layers.Dense(64, activation="relu")(x)
        outputs = layers.Dense(numerical_dim)(x)
        return keras.Model(inputs, outputs)

    def build_text_decoder(self, latent_dim, text_dim):
        inputs = keras.Input(shape=(latent_dim,))
        x = layers.RepeatVector(128)(inputs)  # adjusting to 128 timesteps
        x = layers.LSTM(128, return_sequences=True)(x)
        x = AttentionLayer()(x)
        x = layers.Dense(64, activation="relu")(x)
        x = layers.RepeatVector(text_dim)(x)
        outputs = layers.TimeDistributed(layers.Dense(10000, activation="softmax"))(x)
        return keras.Model(inputs, outputs)

    def sample(self, args):
        z_mean, z_log_var = args
        epsilon = tf.random.normal(shape=tf.shape(z_mean))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    def call(self, inputs):
        numerical_input, text_input = inputs
        numerical_z_mean, numerical_z_log_var = self.numerical_encoder(tf.cast(numerical_input, tf.float32))
        text_z_mean, text_z_log_var = self.text_encoder(tf.cast(text_input, tf.float32))

        z_mean = (numerical_z_mean + text_z_mean) / 2
        z_log_var = (numerical_z_log_var + text_z_log_var) / 2
        z = self.sampling((z_mean, z_log_var))

        numerical_output = self.numerical_decoder(z)
        text_output = self.text_decoder(z)

        return numerical_output, text_output, z_mean, z_log_var

    def decode_numerical(self, z):
        return self.numerical_decoder(z)

    def decode_text(self, z):
        return self.text_decoder(z)

# compiling the model with the Adam optimizer
numerical_dim = 1
text_dim = 768
latent_dim = 64
vae = MultimodalVAE(numerical_dim, text_dim, latent_dim)

vae.compile(optimizer=keras.optimizers.Adam())


In [28]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#creating the evolutionary algorithm class 
class Individual:
    def __init__(self, latent_vector):
        self.latent_vector = latent_vector
        self.fitness = None


def improved_genetic_algorithm(original_ratings, population_size=200, num_generations=70, crossover_rate=0.7, mutation_rate=0.01): #changing number of gens and stuff
    # compute the statistical properties of the original data
    original_stats = {
        'mean': original_ratings.mean(),
        'variance': original_ratings.var(),
        'skewness': skew(original_ratings),
        'kurtosis': kurtosis(original_ratings)
    }

    # generate the initial population of synthetic datasets
    initial_population = [np.random.normal(loc=original_stats['mean'], scale=np.sqrt(original_stats['variance']), size=len(original_ratings)) for _ in range(population_size)]

    # Fitness function
    def fitness_function(synthetic_data, original_stats):
        mean_diff = (synthetic_data.mean() - original_stats['mean']) ** 2
        variance_diff = (synthetic_data.var() - original_stats['variance']) ** 2
        skewness_diff = (skew(synthetic_data) - original_stats['skewness']) ** 2
        kurtosis_diff = (kurtosis(synthetic_data) - original_stats['kurtosis']) ** 2
        return mean_diff + variance_diff + skewness_diff + kurtosis_diff

    # evaluate the fitness of the initial population
    fitness_scores = [fitness_function(synthetic_data, original_stats) for synthetic_data in initial_population]







    # MAIN LOOP  - EVOLUTIONARY ALGORITHM
    for generation in range(num_generations):
        # Selection (using tournament selection)
        selected_population = []
        for _ in range(population_size):
            competitors = np.random.choice(population_size, size=3, replace=False)
            winner = min(competitors, key=lambda i: fitness_scores[i])
            selected_population.append(initial_population[winner])

        # Crossover
        new_population = []
        for i in range(0, population_size, 2):
            parent1 = selected_population[i]
            parent2 = selected_population[i + 1]
            if np.random.rand() < crossover_rate:
                crossover_point = np.random.randint(1, len(parent1) - 1)
                child1 = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
                child2 = np.concatenate((parent2[:crossover_point], parent1[crossover_point:]))
            else:
                child1, child2 = parent1, parent2
            new_population.extend([child1, child2])

        # Mutation
        for i in range(population_size):
            if np.random.rand() < mutation_rate:
                mutation_point = np.random.randint(len(new_population[i]))
                new_population[i][mutation_point] += np.random.normal()

        # evaluating the new population
        fitness_scores = [fitness_function(synthetic_data, original_stats) for synthetic_data in new_population]
        initial_population = new_population

        # printing progress
        best_fitness = min(fitness_scores)
        print(f"Generation {generation + 1}/{num_generations}, Best Fitness: {best_fitness:.4f}")

    # selecting the best synthetic dataset
    best_synthetic_data = initial_population[np.argmin(fitness_scores)]

    return best_synthetic_data

# prepare original data for the EA
original_ratings = train_df['Rating'].values

print("done")




# 12 minutes with 100 population size

Improved Genetic Algorithm defined and original data prepared.
everything ran so far well done


In [29]:
import tempfile
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# using gpt2 for THE the text generation
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# preparation fro fine tuning on data above
def prepare_data_for_gpt2(df, text_column):
    texts = df[text_column].tolist()

    # using a temp file - making sure to use utf 8
    with tempfile.NamedTemporaryFile(mode='w+', delete=False, encoding='utf-8') as f:
        for text in texts:
            f.write(f"{text}\n")

    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=f.name,
        block_size=128
    )

    # remove the temp file
    os.unlink(f.name)

    return dataset

train_dataset = prepare_data_for_gpt2(train_df, 'Processed_Review')

# playing with batch size helps with perf
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30, # changed from 3 to 30
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=train_dataset,
)

trainer.train()

# making sure to move the model to my GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)







# MAIN GENERATION FUNCTION

def generate_multiple_texts_and_ratings(best_latent_vector, vae, tokenizer, model, device, original_ratings, num_reviews=100, max_length=100):
    synthetic_reviews = []

    # generating the synthetic ratings using the improved EA
    synthetic_ratings = improved_genetic_algorithm(original_ratings, population_size=200, num_generations=100)

    for i in range(num_reviews):
        # perturb the best latent vector slightly
        perturbed_vector = best_latent_vector + np.random.normal(0, 0.1, vae.latent_dim).astype(np.float32)
        perturbed_vector = np.array([perturbed_vector])

        input_ids = tokenizer.encode(tokenizer.bos_token, return_tensors='pt').to(device)
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)

        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )

        review = tokenizer.decode(output[0], skip_special_tokens=True)
        synthetic_reviews.append(review)

    return synthetic_reviews, synthetic_ratings



print("generating synthetic data")

# generating the synthetic data
synthetic_ratings = improved_genetic_algorithm(original_ratings, population_size=100, num_generations=50)

# generating synthetic reviews using the now fine tuned GPT2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

synthetic_reviews = []
for _ in range(len(synthetic_ratings)):
    input_ids = tokenizer.encode(tokenizer.bos_token, return_tensors='pt').to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)

    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=100,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7
    )

    review = tokenizer.decode(output[0], skip_special_tokens=True)
    synthetic_reviews.append(review)

# new df with synthetic data
synthetic_df = pd.DataFrame({
    'Review': synthetic_reviews,
    'Rating': synthetic_ratings
})

print("done")
print(synthetic_df.head())

# ENTIRE SCRIPT RUNTIME - SUB 5 MINUTES WITH GPU

# 8 minutes with 20 epochs and 16 batch size

# 11 minutes with 30 epochs and 16 batch size



Step,Training Loss


Generating synthetic data...
Generation 1/50, Best Fitness: 0.0031
Generation 2/50, Best Fitness: 0.0061
Generation 3/50, Best Fitness: 0.0055
Generation 4/50, Best Fitness: 0.0013
Generation 5/50, Best Fitness: 0.0013
Generation 6/50, Best Fitness: 0.0013
Generation 7/50, Best Fitness: 0.0013
Generation 8/50, Best Fitness: 0.0012
Generation 9/50, Best Fitness: 0.0012
Generation 10/50, Best Fitness: 0.0012
Generation 11/50, Best Fitness: 0.0012
Generation 12/50, Best Fitness: 0.0012
Generation 13/50, Best Fitness: 0.0012
Generation 14/50, Best Fitness: 0.0012
Generation 15/50, Best Fitness: 0.0012
Generation 16/50, Best Fitness: 0.0012
Generation 17/50, Best Fitness: 0.0012
Generation 18/50, Best Fitness: 0.0012
Generation 19/50, Best Fitness: 0.0012
Generation 20/50, Best Fitness: 0.0012
Generation 21/50, Best Fitness: 0.0012
Generation 22/50, Best Fitness: 0.0012
Generation 23/50, Best Fitness: 0.0012
Generation 24/50, Best Fitness: 0.0012
Generation 25/50, Best Fitness: 0.0012
Gener

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generation 49/50, Best Fitness: 0.0011
Generation 50/50, Best Fitness: 0.0011


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Synthetic data generated.
                                              Review    Rating
0  \n"the last of the lambs", the last lamban, is...  0.664372
1  the king of kings, a tyrant is visited by the ...  5.311334
2  the film is based on the bestselling novel by ...  5.000716
3  the place where a young woman is murdered and ...  6.294413
4  theresa chiau, a young woman with special need...  8.658176


In [30]:
synthetic_df.head()

Unnamed: 0,Review,Rating
0,"\n""the last of the lambs"", the last lamban, is...",0.664372
1,"the king of kings, a tyrant is visited by the ...",5.311334
2,the film is based on the bestselling novel by ...,5.000716
3,the place where a young woman is murdered and ...,6.294413
4,"theresa chiau, a young woman with special need...",8.658176


In [34]:
synthetic_df.to_csv('path')

In [31]:
df.describe()

Unnamed: 0,Rating
count,404.0
mean,5.429208
std,1.759889
min,1.0
25%,4.5
50%,5.6
75%,6.5
max,10.0


In [33]:
synthetic_df.describe()

Unnamed: 0,Rating
count,323.0
mean,5.417642
std,1.768667
min,-0.5291
25%,4.363274
50%,5.349215
75%,6.600756
max,10.180781


In [8]:
df.head()

Unnamed: 0,Review,Rating
0,Mismatched travellers are stranded overnight a...,8.0
1,"In 1900, strong-willed widow Lucy Muir goes to...",7.7
2,"In 1867, a gang led by James ""Stretch"" Dawson ...",6.9
3,Reclusive Dr. Zorba has died and left his mans...,5.8
4,The twisted Richard III is haunted by the ghos...,6.2
