Set up basics:
1. Keras Library
2. Mixed precision training to save time

In [None]:
!pip install keras-nlp -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/584.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/584.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.5/584.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m109.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
import keras_nlp
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
import time

from tensorflow import keras

# Create the "mixed_float16" dtype policy and make it the global Keras policy
# (the notebook intro lists mixed precision training as a way to save time).
# Note: the recorded output warns that this policy may run slowly without a GPU.
policy = keras.mixed_precision.Policy("mixed_float16")
keras.mixed_precision.set_global_policy(policy)

Using TensorFlow backend


The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


Hyperparameters

In [None]:
# General hyperparameters
BATCH_SIZE = 32  # Number of articles per batch when the dataset is batched
NUM_BATCHES = 500  # Upper bound on the number of batches taken for training
EPOCHS = 1  # Can be set to a higher value for better results
MAX_SEQUENCE_LENGTH = 128  # `sequence_length` given to the GPT-2 preprocessor
MAX_GENERATION_LENGTH = 200  # `max_length` used when generating with the LoRA model

GPT2_PRESET = "gpt2_base_en"  # KerasNLP preset name of the GPT-2 model

# LoRA-specific hyperparameters
RANK = 4  # Rank passed to each LoRA layer
ALPHA = 32.0  # LoRA alpha; each LoRA layer scales its update by alpha / rank



Load in the left, right, neutral dataset

In [None]:
# Colab only: mount Google Drive at /content/drive so the project folder
# (and its CSV) can be reached from the notebook.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Modify the local path to the project

In [None]:
cd drive/MyDrive/cs182-final-proj

/content/drive/.shortcut-targets-by-id/1fHTJEwQpdw9d5-ck7bXjejartUco15Ex/cs182-final-proj


In [None]:
import tensorflow as tf

# Read the CSV and drop every row that has a missing value in any column.
df = pd.read_csv("combined_data.csv")
df = df.dropna()


def _articles_of_type(frame, label):
    """Return the `text` values of the rows whose `type` equals `label`.

    Args:
        frame: DataFrame with at least the columns `type` and `text`.
        label: Value of the `type` column to select.

    Returns:
        A list with the selected `text` values, in row order.
    """
    return frame.loc[frame["type"] == label, "text"].tolist()


# Split the articles by their `type` label with vectorised selection instead of
# a row-by-row `iterrows()` loop. Rows with any other label are ignored.
left_articles = _articles_of_type(df, "left")
right_articles = _articles_of_type(df, "right")
neu_articles = _articles_of_type(df, "center")

# Convert lists to tf.data.Dataset objects
left_data = tf.data.Dataset.from_tensor_slices(left_articles)
right_data = tf.data.Dataset.from_tensor_slices(right_articles)
neu_data = tf.data.Dataset.from_tensor_slices(neu_articles)

# BATCH_SIZE is intentionally NOT redefined here: it comes from the
# hyperparameter cell, and re-assigning it would silently override that value.
AUTOTUNE = tf.data.AUTOTUNE



As a test, the model is only trained on batched left_data

In [None]:
# Truncate to NUM_BATCHES *before* caching. A tf.data cache is only finalised
# once the cached dataset has been iterated to the end; with `take` placed after
# `cache`, a dataset longer than NUM_BATCHES batches is never read completely,
# so the cache would be discarded and rebuilt on every epoch.
# `prefetch` stays last so input preparation overlaps with training.
train_ds = (
    left_data
    .batch(BATCH_SIZE)
    .take(NUM_BATCHES)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

Text generation function

In [None]:

def generate_text(model, input_text, max_length=200):
    """Generate text from a prompt, print it, and print the elapsed wall time.

    Args:
        model: Object exposing `generate(text, max_length=...)`.
        input_text: Prompt forwarded to `model.generate`.
        max_length: Forwarded to `model.generate` as `max_length`.

    Returns:
        None. The generated output and the timing line are printed.
    """
    t_begin = time.time()

    generated = model.generate(input_text, max_length=max_length)
    # A single print with a newline separator emits: blank line, header, output.
    print("\nOutput:", generated, sep="\n")

    # The timer is stopped after printing, so the printing is part of the span.
    elapsed = time.time() - t_begin
    print(f"Total Time Elapsed: {elapsed:.2f}s")




We will use AdamW optimizer and cross-entropy loss for training both models.

In [None]:

def get_optimizer_and_loss():
    """Build the optimizer and loss used for fine-tuning.

    Returns:
        A tuple `(optimizer, loss)`:
        - optimizer: `keras.optimizers.AdamW` with learning rate 5e-5, weight
          decay 0.01, epsilon 1e-6 and global gradient-norm clipping at 1.0.
        - loss: `keras.losses.SparseCategoricalCrossentropy(from_logits=True)`.
    """
    optimizer = keras.optimizers.AdamW(
        learning_rate=5e-5,
        weight_decay=0.01,
        epsilon=1e-6,
        global_clipnorm=1.0,  # Gradient clipping.
    )
    # Exclude layernorm and bias terms from weight decay.
    # All name patterns must go into ONE call: every call to
    # `exclude_from_weight_decay` replaces the previously stored exclusions, so
    # three separate calls would leave only the last pattern ("beta") excluded.
    optimizer.exclude_from_weight_decay(var_names=["bias", "gamma", "beta"])

    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return optimizer, loss


In [None]:
# Build the preprocessor and the causal LM from the preset named in the
# hyperparameter cell (GPT2_PRESET) instead of repeating the literal string, so
# the model choice lives in a single place.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    GPT2_PRESET,
    sequence_length=MAX_SEQUENCE_LENGTH,
)
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    GPT2_PRESET, preprocessor=preprocessor
)

gpt2_lm.summary()

Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/vocab.json
Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/merges.txt
Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/model.h5


Training and text generation:

In [None]:
# Compile the baseline with the AdamW optimizer and cross-entropy loss from
# `get_optimizer_and_loss()`, matching the statement that both models are trained
# with them. This also creates the `optimizer` and `loss` names that the later
# "delete previous model" cell removes with `del`.
optimizer, loss = get_optimizer_and_loss()

gpt2_lm.compile(
    optimizer=optimizer,
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(train_ds, epochs=EPOCHS)



<keras.src.callbacks.History at 0x7dee446692a0>

In [None]:
# Generate from the model trained above with the prompt "Trump", with
# max_length=150 passed through to `model.generate`.
generate_text(gpt2_lm, "Trump", max_length=150)


Output:
Trump, who has said he is committed to a "total and complete shutdown of Muslims entering the United States until our country's representatives can figure out what is going on," is now trying to make a political case to the American people that the Muslim ban is not an attack.
Total Time Elapsed: 35.19s


In [None]:
import math


class LoraLayer(keras.layers.Layer):
    """Wrap an existing projection layer and add a low-rank (LoRA) update path.

    The wrapped layer is frozen. Two extra layers are created: `A` (a bias-free
    `Dense` with `rank` units) and `B` (an `EinsumDense` built with the wrapped
    layer's `equation` and `output_shape`, zero-initialised). While this layer
    is trainable, `call` returns `original(x) + B(A(x)) * (alpha / rank)`;
    otherwise it returns only `original(x)`.

    Args:
        original_layer: Layer to wrap. Its config must provide `name`,
            `equation` and `output_shape`.
        rank: Number of units of `A`.
        alpha: Numerator of the scaling factor `alpha / rank`.
        trainable: Initial trainable flag for this layer and for `A` / `B`.
        **kwargs: Forwarded to `keras.layers.Layer`; a `name` entry is ignored.
    """

    def __init__(
        self,
        original_layer,
        rank=8,
        alpha=32,
        trainable=False,
        **kwargs,
    ):
        base_config = original_layer.get_config()

        # This layer takes over the wrapped layer's name, so any caller-supplied
        # `name` is discarded.
        kwargs.pop("name", None)
        super().__init__(name=base_config["name"], trainable=trainable, **kwargs)

        self.rank = rank
        self.alpha = alpha
        self._scale = alpha / rank

        projection_shape = base_config["output_shape"]
        self._num_heads = projection_shape[-2]
        self._hidden_dim = self._num_heads * projection_shape[-1]

        # The wrapped layer is frozen regardless of this layer's own flag.
        self.original_layer = original_layer
        self.original_layer.trainable = False

        # Note: the original paper mentions that normal distribution was used
        # for initialization. However, the official LoRA implementation uses
        # "Kaiming/He Initialization".
        a_initializer = keras.initializers.VarianceScaling(
            scale=math.sqrt(5), mode="fan_in", distribution="uniform"
        )
        self.A = keras.layers.Dense(
            units=rank,
            use_bias=False,
            kernel_initializer=a_initializer,
            trainable=trainable,
            name="lora_A",
        )

        # B reuses the wrapped layer's `equation` and `output_shape`
        # (`abc,cde->abde`: a = batch, b = sequence length, d = num_heads,
        # e = hidden_dim // num_heads); for B the contracted axis `c` is `rank`
        # rather than `hidden_dim`.
        self.B = keras.layers.EinsumDense(
            equation=base_config["equation"],
            output_shape=projection_shape,
            kernel_initializer="zeros",
            trainable=trainable,
            name="lora_B",
        )

    def call(self, inputs):
        """Return the wrapped layer's output, plus the scaled LoRA term if trainable."""
        base_output = self.original_layer(inputs)

        # Non-trainable: only the wrapped layer contributes.
        if not self.trainable:
            return base_output

        # Trainable: add the scaled low-rank update to the frozen projection.
        low_rank_update = self.B(self.A(inputs)) * self._scale
        return base_output + low_rank_update


Insert LoRA layer into the old model:

- Delete previous model;
- Load a new GPT-2 model.

In [None]:
# Drop the baseline model before loading a fresh copy.
del gpt2_lm
# `optimizer` and `loss` may not exist at this point (for example on a fresh
# Restart & Run All), so remove them only if present instead of raising
# NameError with a bare `del`.
for _stale_name in ("optimizer", "loss"):
    globals().pop(_stale_name, None)

# Load the original model, using the preset and sequence length from the
# hyperparameter cell rather than repeated literals.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    GPT2_PRESET,
    sequence_length=MAX_SEQUENCE_LENGTH,
)
lora_model = keras_nlp.models.GPT2CausalLM.from_preset(
    GPT2_PRESET,
    preprocessor=preprocessor,
)

Override the original query/value projection matrices with
new LoRA layers.

In [None]:
for layer_idx in range(lora_model.backbone.num_layers):
    # Look up this transformer block and its self-attention sub-layer.
    decoder_layer = lora_model.backbone.get_layer(f"transformer_layer_{layer_idx}")
    self_attention_layer = decoder_layer._self_attention_layer

    # Replace the query projection first, then the value projection, each with
    # a trainable LoraLayer wrapping the existing dense layer.
    for projection_attr in ("_query_dense", "_value_dense"):
        wrapped_projection = LoraLayer(
            getattr(self_attention_layer, projection_attr),
            rank=RANK,
            alpha=ALPHA,
            trainable=True,
        )
        setattr(self_attention_layer, projection_attr, wrapped_projection)

Freeze the entire LLM, only the LoRA layers should be trainable.

In [None]:
for layer in lora_model._flatten_layers():
    # "Leaves of the model": layers whose own flattened layer list has one entry.
    is_leaf = len(list(layer._flatten_layers())) == 1
    if not is_leaf:
        continue

    # Only the LoRA A/B layers stay trainable; every other leaf is frozen.
    layer.trainable = layer.name in ["lora_A", "lora_B"]

Print the model's summary and see if the number of non-trainable parameters and
total parameters are correct.

Each LoRA layer adds `768 * RANK + RANK * 12 * 64 = 3,072 + 3,072 = 6,144`
parameters for `RANK = 4` (matrix `A` has shape `(768, RANK)` and matrix `B` has
shape `(RANK, 12, 64)`). The total trainable parameters in the model should
be `num_layers * (query, value) * 6,144 = 12 * 2 * 6,144 = 147,456`. The
number of non-trainable parameters should be the same as the total number of
parameters in the original GPT-2 model, which is `124,439,808`.

In [None]:
# Print the model summary to check the trainable / non-trainable parameter
# counts after the freezing step.
lora_model.summary()

In [None]:
# Fresh optimizer / loss pair for the LoRA run.
optimizer, loss = get_optimizer_and_loss()

# Track accuracy as a weighted metric alongside the loss.
lora_model.compile(optimizer=optimizer, loss=loss, weighted_metrics=["accuracy"])

# Train on the same dataset and for the same number of epochs as the baseline.
lora_model.fit(train_ds, epochs=EPOCHS)



<keras.src.callbacks.History at 0x7dee2fc2bc40>

In [None]:
def _merge_lora_into_original(lora_layer):
    """Fold a LoraLayer's low-rank update into its wrapped kernel, once.

    Adds `einsum("ab,bcd->acd", A, B) * scale` to the wrapped layer's kernel and
    then marks the LoraLayer as non-trainable. `LoraLayer.call` adds the LoRA
    branch whenever the layer is trainable, so leaving it trainable after the
    merge would apply the update twice during generation. The trainable flag
    also serves as the "already merged" marker, which makes re-running this
    cell a no-op instead of merging again.

    Args:
        lora_layer: A LoraLayer with `A`, `B`, `original_layer` and `_scale`.
    """
    if not lora_layer.trainable:
        # LoRA path already disabled: nothing left to merge.
        return

    a_kernel = lora_layer.A.kernel  # (hidden_dim, rank) -> (a, b)
    b_kernel = lora_layer.B.kernel  # (rank, num_heads, head_dim) -> (b, c, d)
    # Use the layer's own scale (alpha / rank) so the merged update matches
    # exactly what `call` applied during training.
    increment_weights = tf.einsum("ab,bcd->acd", a_kernel, b_kernel) * lora_layer._scale
    lora_layer.original_layer.kernel.assign_add(increment_weights)

    # Disable the LoRA branch now that its effect lives in the wrapped kernel.
    lora_layer.trainable = False


for layer_idx in range(lora_model.backbone.num_layers):
    self_attention_layer = lora_model.backbone.get_layer(
        f"transformer_layer_{layer_idx}"
    )._self_attention_layer

    # Merge query dense layer.
    _merge_lora_into_original(self_attention_layer._query_dense)

    # Merge value dense layer.
    _merge_lora_into_original(self_attention_layer._value_dense)

Text generation with LoRA:

In [None]:
# Generate three times from the same prompt with the LoRA model.
for sample_idx in range(3):
    generate_text(lora_model, "Trump 2024", max_length=MAX_GENERATION_LENGTH)


Output:
Trump 2024 campaign: 'We're going to build it' — as Trump says

The president-elect is now taking a more aggressive stance on infrastructure, calling on the federal government to invest in infrastructure to help pay for infrastructure.

Trump, who has been criticized for failing to deliver on his campaign promises, has pledged to invest at least $100 billion in infrastructure in the next 10 years, including $10 billion on highways, bridges and airports, as well as $5 billion in new public transit and $2 billion to build new airports.

The president-elect's campaign has been trying to drum up support for infrastructure investments for years.

Trump recently said he wants to build a wall along the southern border with Mexico, but he has yet to announce any plans for that wall.

The White House has said it would build a wall along the southern border with Mexico, but has yet to announce any plans for the wall.

On Monday, the president
Total Time Elapsed: 71.76s

Output:
Trump 20