# GAN Generative Model

### Install Necessary Libraries, Define Helper Functions, and Define the Encoder+LSTM Model Architecture

In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


In [2]:
import numpy as np
import pandas as pd
import re

import torch
import torch.optim as optim
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.nn.functional import softmax, log_softmax
from torch.utils.data import DataLoader, TensorDataset

from tqdm import tqdm
import warnings
from sentence_transformers import SentenceTransformer, util

In [3]:
def clean_text(text):
    """Normalize a tweet for display and filtering.

    Every URL (a run of non-whitespace characters starting with ``http``) is
    replaced by the placeholder ``[URL]``, every ``@handle`` by ``[MENTION]``,
    and all newline characters are removed.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text as a new string.
    """
    text = re.sub(r'http\S+', '[URL]', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '[MENTION]', text)
    # str.replace returns a new string; without reassigning the result the
    # newlines would silently stay in the text.
    text = text.replace("\n", "")

    return text

In [4]:
from transformers import BertTokenizer

# BERT tokenizer used by preprocess_text to encode text for the discriminator.
b_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Every encoded text is padded or truncated to this many tokens.
b_max_length = 128

def preprocess_text(text):
    """Encode text into fixed-length BERT inputs.

    URLs and @mentions are first replaced by the ``[URL]`` / ``[MENTION]``
    placeholders; the result is tokenized with ``b_tokenizer`` and padded or
    truncated to ``b_max_length`` tokens.

    Args:
        text: Text to encode.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of PyTorch tensors.
    """
    placeholder_rules = (
        (r'http\S+', '[URL]', re.MULTILINE),
        (r'@\w+', '[MENTION]', 0),
    )
    for pattern, placeholder, flags in placeholder_rules:
        text = re.sub(pattern, placeholder, text, flags=flags)

    encoded = b_tokenizer(
        text,
        max_length=b_max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    return encoded['input_ids'], encoded['attention_mask']

# Register the placeholder strings produced by the cleaning step as special
# tokens of the BERT tokenizer.
special_tokens_dict = {'additional_special_tokens': ['[URL]', '[MENTION]']}
# NOTE(review): presumably the discriminator's embedding matrix was resized for
# these extra tokens when it was trained - confirm against the training notebook.
num_added_toks = b_tokenizer.add_special_tokens(special_tokens_dict)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
class MyModel2(nn.Module):
    """BERT encoder followed by an LSTM and a small fully connected head.

    BERT is run without gradient tracking, so only the LSTM and the classifier
    receive gradients. The class must keep this name and these attribute names
    (``bert``, ``lstm``, ``classifier``) so previously saved models still load.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        lstm_hidden_size: Hidden size of the LSTM.
        lstm_layers: Number of stacked LSTM layers.
        num_classes: Size of the output vector.
    """

    def __init__(self, bert_model_name, lstm_hidden_size, lstm_layers, num_classes):
        # The notebook's import cells only bring in BertTokenizer; import the
        # encoder class here so constructing the model does not raise NameError.
        from transformers import BertModel

        super().__init__()
        # BERT Model
        self.bert = BertModel.from_pretrained(bert_model_name)
        bert_output_size = self.bert.config.hidden_size
        # LSTM layer
        self.lstm = nn.LSTM(input_size=bert_output_size, hidden_size=lstm_hidden_size,
                            num_layers=lstm_layers, batch_first=True)
        # Fully connected layers applied to the LSTM output
        self.classifier = nn.Sequential(
            nn.Linear(lstm_hidden_size, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a batch of encoded texts.

        Args:
            input_ids: Token ids, batch first.
            attention_mask: Attention mask matching ``input_ids``.

        Returns:
            Tensor with one row of ``num_classes`` values per input row.
        """
        # BERT is used as a fixed feature extractor: no gradients flow into it.
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_output.last_hidden_state

        # Passing BERT embeddings to LSTM
        lstm_output, _ = self.lstm(bert_embeddings)
        # Use only the LSTM output at the last time step.
        # NOTE(review): inputs padded to a fixed length end in padding tokens, so
        # this is the output at a padded position - confirm that is intended.
        # Left unchanged because saved weights were trained with this behavior.
        lstm_output = lstm_output[:, -1, :]

        # Classifier
        output = self.classifier(lstm_output)
        return output

### Importing Encoder+LSTM Model as Discriminator

In [7]:
# Run on the GPU when one is available; fall back to CPU so the notebook can
# still be executed on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# SECURITY: torch.load unpickles a whole Python object and can execute code
# from the file - only load checkpoints you trust.
# NOTE(review): presumably a pickled MyModel2 instance, which requires that class
# to be defined before this cell runs. Recent PyTorch releases default to
# weights_only=True and need weights_only=False for full-model pickles.
lstm_model = torch.load('/kaggle/input/twitter-data-virality/model.pth', map_location=device)
# The discriminator is only used for scoring: keep it on the target device and
# switch to evaluation mode so its Dropout layer does not add noise to scores.
lstm_model = lstm_model.to(device).eval()

In [8]:
def score_text(text_in):
    """Score text with the discriminator model.

    The text is encoded with ``preprocess_text``, moved to ``device`` and passed
    through ``lstm_model``.

    Args:
        text_in: Text to score.

    Returns:
        A list of Python floats: the first output value of every row returned
        by the model.
    """
    input_id, mask = preprocess_text(text_in)
    input_id, mask = input_id.to(device), mask.to(device)
    # Scoring never back-propagates through the discriminator; skipping autograd
    # avoids building (and holding in memory) a graph on every call.
    with torch.no_grad():
        output = lstm_model(input_id, mask)
    torch.cuda.empty_cache()

    return [out[0].item() for out in output]

### Import Processed Twitter data with text and virality

In [10]:
# Load the tweet dataset, coerce the two count columns to numbers, drop rows
# where either could not be parsed, then shuffle the rows and renumber the index.
df = (
    pd.read_csv("/kaggle/input/twitter-data-virality/dataset_viralscore_3.csv", low_memory=False)
    .assign(UserFavouritesCount=lambda frame: pd.to_numeric(frame['UserFavouritesCount'], errors='coerce'))
    .dropna(subset=['UserFavouritesCount'])
    .assign(mediaCount=lambda frame: pd.to_numeric(frame['mediaCount'], errors='coerce'))
    .dropna(subset=['mediaCount'])
    .sample(frac=1)
    .reset_index(drop=True)
)

In [11]:
# Keep only tweets with at least 10 whitespace-separated words and add a cleaned
# copy of each. Missing content is dropped first so counting words cannot fail
# on NaN values.
tweet_content = df['content'].dropna()
long_enough = tweet_content.str.split().str.len() >= 10
tweet_content = tweet_content[long_enough]
# Build the frame in one step instead of assigning a new column onto a filtered
# slice, which triggers pandas' SettingWithCopyWarning.
df_content = pd.DataFrame({
    'content': tweet_content,
    'cleaned': tweet_content.apply(clean_text),
})

In [12]:
# Boolean Series: True where the cleaned text contains the literal "[URL]" placeholder.
mask = df_content['cleaned'].str.contains(r'\[URL\]')
# Keep only the tweets without a URL placeholder.
df_filtered = df_content[~mask]

## Create the GPT-2 Generator Model and Define Functions to Generate New Text from an Input

In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Tokenizer shared by every GPT-2 model in this notebook.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Pretrained GPT-2 used as the generator that is fine-tuned below.
model = GPT2LMHeadModel.from_pretrained('gpt2')
# When the tokenizer has no padding token, reuse the end-of-sequence token for
# padding and resize the model's embeddings to the tokenizer's length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer)) 
# Last expression of the cell: moves the model to `device` and displays its structure.
model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [14]:
def alter_tweet(model, tokenizer, tweet, prompt_text):
    """Sample a rewritten version of ``tweet`` from a causal language model.

    The prompt ``prompt_text + tweet + ": "`` is tokenized and passed to
    ``model.generate`` with sampling enabled. Only the newly generated
    continuation is returned.

    Args:
        model: Language model exposing ``generate`` and ``device``
            (e.g. ``GPT2LMHeadModel``).
        tokenizer: Tokenizer matching ``model``.
        tweet: Text to rewrite.
        prompt_text: Instruction placed in front of the tweet.

    Returns:
        The generated continuation with surrounding whitespace stripped, or the
        full decoded sequence (prompt included) when the continuation is empty.
    """
    inputs = tokenizer(prompt_text + tweet + ": ", padding=True, return_tensors='pt')
    # Follow the model's own device rather than a notebook-level global so the
    # inputs always end up where the weights are.
    inputs = inputs.to(model.device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    prompt_length = input_ids.shape[1]

    # Allow as many new tokens as the prompt has, but never more than the
    # model's context size when the config exposes it.
    max_length = prompt_length * 2
    max_positions = getattr(getattr(model, 'config', None), 'n_positions', None)
    if max_positions is not None:
        max_length = min(max_length, max_positions)

    # Generate new text
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,
        top_k=55,
        top_p=0.95,
        no_repeat_ngram_size=2,
        num_return_sequences=1
    )

    # The returned sequence starts with the prompt tokens. Cut the prompt off by
    # token position: slicing the decoded string by character count can drop or
    # keep characters when decoding alters spacing at the boundary.
    continuation_ids = outputs[0][prompt_length:]
    altered_text = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()
    torch.cuda.empty_cache()
    if altered_text:
        return altered_text
    # Nothing new was generated: fall back to the full decoded sequence.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [15]:
# A second, untouched copy of pretrained GPT-2, used as the "untrained" baseline
# when comparing generations.
base_model = GPT2LMHeadModel.from_pretrained('gpt2')
# Same padding-token setup as for the generator; only runs when the shared
# tokenizer has no padding token yet.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    base_model.resize_token_embeddings(len(tokenizer)) 
# Last expression of the cell: moves the model to `device` and displays its structure.
base_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### Test Model Text Generation

In [19]:
# Put the generator in evaluation mode for this qualitative check.
model.eval()
# Instruction prepended to every tweet; also used by the training loop.
prompt_text = "Rewrite this phrase, without changing the meaning: "
# Draw one random cleaned tweet and remove any newline characters.
sample_tweet = df_filtered['cleaned'].sample(n=1).iloc[0].replace("\n","")
#sample_tweet = "CS6120 by Raj is the best class ever! Highly Recommend!"
print("Original Tweet:", sample_tweet)

Original Tweet: Plan to travel over the mountains to [MENTION] for the #AppleCup2022? Be planning ahead, leave for Pullman earlier on Saturday if possible and be patient. Snow is expected. Make sure to follow traction requirements and drive for conditions.


In [23]:
# Rewrite the sample tweet with the current generator and with the untouched
# pretrained baseline.
altered_tweet = alter_tweet(model, tokenizer, sample_tweet, prompt_text)
#altered2_tweet = alter_tweet(model2, tokenizer, sample_tweet, prompt_text)
untrained_tweet = alter_tweet(base_model, tokenizer, sample_tweet, prompt_text)
# Show the texts; generated text is cleaned and stripped of newlines for display only.
print("Original Tweet:", sample_tweet)
print("Altered Tweet:", clean_text(altered_tweet).replace("\n",""))
#print("Altered2 Tweet:", clean_text(altered2_tweet).replace("\n",""))
print("Untrained Tweet:", clean_text(untrained_tweet).replace("\n",""))
# Discriminator scores; the generated texts are scored as generated, not in their
# cleaned display form.
print(f"Original Tweet Score: {score_text(sample_tweet)[0]:0.4f}")
print(f'Altered Tweet Score: {score_text(altered_tweet)[0]:0.4f}')
#print(f'Altered2 Tweet Score: {score_text(altered2_tweet)[0]:0.4f}')
print(f'Untrained Tweet Score: {score_text(untrained_tweet)[0]:0.4f}')

Original Tweet: Plan to travel over the mountains to [MENTION] for the #AppleCup2022? Be planning ahead, leave for Pullman earlier on Saturday if possible and be patient. Snow is expected. Make sure to follow traction requirements and drive for conditions.
Altered Tweet: I'm going to be a little late. I'm not going. The weather is going bad. It's going good. We're going down the mountain.I am a man. A man who is a woman. And a girl. A man is the man I am. He is my
Untrained Tweet: I'm going to be driving for a few hours on the road, so I'm not going anywhere. I'll be in the car for about an hour. If you're not sure what to do, just drive. It's a good idea to drive in a straight line, but I don't think it
Original Tweet Score: -0.4329
Altered Tweet Score: 0.2006
Untrained Tweet Score: -0.8736


## Training and Verification Loop. Calculate Loss based on Similarity and Virality

In [22]:
def calculate_log_prob(model, tokenizer, inputs, next_token):
    """Log-probability the model assigns to ``next_token`` right after ``inputs``.

    Args:
        model: Language model whose output exposes ``logits``.
        tokenizer: Not used by this function.
        inputs: Token ids of the context; moved to ``device`` before the call.
        next_token: Id of the following token, one per batch row.

    Returns:
        Tensor holding one log-probability per batch row.
    """
    # Only the logits at the final position predict the token that comes next.
    final_step_logits = model(inputs.to(device)).logits[:, -1, :]
    vocab_log_probs = log_softmax(final_step_logits, dim=-1)
    target_index = next_token.to(device).unsqueeze(-1)
    return vocab_log_probs.gather(1, target_index).squeeze(-1)

In [19]:
# Make sure the generator is on the target device before creating its optimizer.
model.to(device)
# AdamW over all generator parameters with a learning rate of 1e-5.
optimizer = optim.AdamW(model.parameters(), lr=0.00001)

In [23]:
# Sentence-embedding model used by calculate_similarity to compare two texts.
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_similarity(text1, text2):
    """Cosine similarity between the sentence embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity as a Python float.
    """
    # Encode the texts one after the other with ``similarity_model``.
    first_embedding, second_embedding = (
        similarity_model.encode(text, convert_to_tensor=True, show_progress_bar=False)
        for text in (text1, text2)
    )
    return util.pytorch_cos_sim(first_embedding, second_embedding).item()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
# Policy-gradient style fine-tuning of the generator: sample a rewrite, score it
# with the discriminator, and scale the log-probability of the rewrite by the
# reward.
epochs = 50
# Fall back to CPU when no GPU is present instead of hard-coding "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.train()
num_tweets_to_process = 10  # tweets sampled per epoch

# Weights of the alternative (commented-out) virality/similarity reward below.
alpha = 1
beta = 0

max_length = 128  # NOTE(review): not used anywhere in this cell
for epoch in range(epochs):
    shuffled_tweets = df_filtered['cleaned'].sample(frac=1).iloc[:num_tweets_to_process]
    total_reward = 0.0
    for tweet in tqdm(shuffled_tweets, desc=f"Epoch {epoch + 1} Progress"):
        origin_score = torch.sigmoid(torch.tensor(score_text(tweet)[0]))
        altered_text = alter_tweet(model, tokenizer, tweet, prompt_text)
        content_score = torch.sigmoid(torch.tensor(score_text(altered_text)[0]))
        # Only needed by the alternative reward formula.
        similarity_score = calculate_similarity(tweet, altered_text)

        #Either Base the reward on Similarity-Virality Combination or Score Improvement
        #reward = alpha*content_score + beta*((similarity_score - 0.5)*2)
        # The reward is a plain number: no gradient flows through the scores.
        reward = (content_score - origin_score).item()
        total_reward += reward

        input_ids = tokenizer(altered_text, return_tensors='pt')['input_ids'].to(device)
        # Fewer than two tokens means there is no next-token prediction to train on.
        if input_ids.size(1) < 2:
            continue

        optimizer.zero_grad()

        # Log-probability of the altered text (every token after the first),
        # computed in ONE forward pass: the logits at position i-1 predict token i.
        # This replaces one forward pass per prefix, which grew quadratically with
        # the sequence length.
        # NOTE(review): the sequence is scored without the prompt it was generated
        # from - confirm whether the log-probability should be conditioned on
        # prompt_text + tweet.
        logits = model(input_ids).logits[:, :-1, :]
        token_log_probs = torch.log_softmax(logits, dim=-1).gather(
            -1, input_ids[:, 1:].unsqueeze(-1)
        ).squeeze(-1)
        total_log_prob = token_log_probs.sum()

        loss = -total_log_prob * reward
        # A zero loss (e.g. zero reward) carries no learning signal; skip the update.
        if loss.item() != 0:
            loss.backward()
            optimizer.step()
        optimizer.zero_grad()
        torch.cuda.empty_cache()
    torch.cuda.empty_cache()
    print(f"Epoch {epoch + 1}, Total Reward: {total_reward}")

Epoch 1 Progress: 100%|██████████| 10/10 [00:23<00:00,  2.37s/it]


Epoch 1, Total Reward: -0.09869101643562317


Epoch 2 Progress: 100%|██████████| 10/10 [00:16<00:00,  1.60s/it]


Epoch 2, Total Reward: 0.6804487705230713


Epoch 3 Progress: 100%|██████████| 10/10 [00:18<00:00,  1.81s/it]


Epoch 3, Total Reward: -0.20026174187660217


Epoch 4 Progress: 100%|██████████| 10/10 [00:19<00:00,  1.92s/it]


Epoch 4, Total Reward: -0.20271167159080505


Epoch 5 Progress: 100%|██████████| 10/10 [00:15<00:00,  1.58s/it]


Epoch 5, Total Reward: 0.26110512018203735


Epoch 6 Progress: 100%|██████████| 10/10 [00:19<00:00,  1.98s/it]


Epoch 6, Total Reward: 0.14610505104064941


Epoch 7 Progress: 100%|██████████| 10/10 [00:14<00:00,  1.42s/it]


Epoch 7, Total Reward: 0.5619194507598877


Epoch 8 Progress: 100%|██████████| 10/10 [00:19<00:00,  1.97s/it]


Epoch 8, Total Reward: 0.639773964881897


Epoch 9 Progress: 100%|██████████| 10/10 [00:18<00:00,  1.81s/it]


Epoch 9, Total Reward: 0.33377277851104736


Epoch 10 Progress: 100%|██████████| 10/10 [00:17<00:00,  1.78s/it]


Epoch 10, Total Reward: 1.0590357780456543


Epoch 11 Progress: 100%|██████████| 10/10 [00:19<00:00,  1.90s/it]


Epoch 11, Total Reward: 1.0587314367294312


Epoch 12 Progress: 100%|██████████| 10/10 [00:22<00:00,  2.24s/it]


Epoch 12, Total Reward: 0.2766219973564148


Epoch 13 Progress: 100%|██████████| 10/10 [00:21<00:00,  2.14s/it]


Epoch 13, Total Reward: 0.0862874686717987


Epoch 14 Progress: 100%|██████████| 10/10 [00:22<00:00,  2.29s/it]


Epoch 14, Total Reward: 0.3303617238998413


Epoch 15 Progress: 100%|██████████| 10/10 [00:16<00:00,  1.68s/it]


Epoch 15, Total Reward: 1.0817337036132812


Epoch 16 Progress: 100%|██████████| 10/10 [00:18<00:00,  1.86s/it]


Epoch 16, Total Reward: 0.051283687353134155


Epoch 17 Progress: 100%|██████████| 10/10 [00:18<00:00,  1.83s/it]


Epoch 17, Total Reward: 0.2521398067474365


Epoch 18 Progress: 100%|██████████| 10/10 [00:19<00:00,  1.98s/it]


Epoch 18, Total Reward: 0.32885897159576416


Epoch 19 Progress: 100%|██████████| 10/10 [00:16<00:00,  1.67s/it]


Epoch 19, Total Reward: 0.5815116763114929


Epoch 20 Progress: 100%|██████████| 10/10 [00:17<00:00,  1.76s/it]


Epoch 20, Total Reward: 0.3492371439933777


Epoch 21 Progress: 100%|██████████| 10/10 [00:19<00:00,  1.95s/it]


Epoch 21, Total Reward: 0.3700333535671234


Epoch 22 Progress: 100%|██████████| 10/10 [00:18<00:00,  1.83s/it]


Epoch 22, Total Reward: 0.03997910022735596


Epoch 23 Progress: 100%|██████████| 10/10 [00:15<00:00,  1.54s/it]


Epoch 23, Total Reward: 0.6178473830223083


Epoch 24 Progress: 100%|██████████| 10/10 [00:18<00:00,  1.83s/it]


Epoch 24, Total Reward: 0.7420645952224731


Epoch 25 Progress: 100%|██████████| 10/10 [00:19<00:00,  1.90s/it]


Epoch 25, Total Reward: 0.49961385130882263


Epoch 26 Progress:  20%|██        | 2/10 [00:03<00:14,  1.79s/it]


KeyboardInterrupt: 

### Save or Load Previous Models as Necessary

In [16]:
# Replace the generator with a previously saved checkpoint.
# SECURITY: torch.load unpickles a whole Python object - only load trusted files.
model = torch.load('/kaggle/input/twitter-data-virality/generator_model_2.pth')

In [34]:
# Load the same saved checkpoint under a second name (`model2`), e.g. for
# side-by-side comparison with the generator currently held in `model`.
# SECURITY: torch.load unpickles a whole Python object - only load trusted files.
model2 = torch.load('/kaggle/input/twitter-data-virality/generator_model_2.pth')

In [48]:
# Persist the whole generator object (not just its state_dict) to the Kaggle
# working directory.
torch.save(model, '/kaggle/working/generator_model_2.pth')