In [1]:
import pandas as pd
import numpy as np
import re



In [2]:
# Split the combined corpus into one plain-text training file per political
# leaning, keyed on the `type` column ('left', 'right', 'center').
df = pd.read_csv("../combined_data.csv")
df = df.dropna()  # discard rows that have a missing value in any column

# Context managers guarantee the three files are closed even if a write fails.
# The encoding is explicit so the files are written as UTF-8 regardless of the
# platform default.
with open('../data/left.txt', 'w', encoding='utf-8') as left_data, \
     open('../data/right.txt', 'w', encoding='utf-8') as right_data, \
     open('../data/neutral.txt', 'w', encoding='utf-8') as neu_data:
    # Rows whose `type` is none of these three labels are skipped.
    file_for_type = {
        'center': neu_data,
        'left': left_data,
        'right': right_data,
    }
    # NOTE(review): articles are written back-to-back with no separator
    # between them -- confirm this is intended for the training corpus.
    for article, label in zip(df["text"], df["type"]):
        target = file_for_type.get(label)
        if target is not None:
            target.write(article)

In [4]:

!pip install transformers

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.10.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.8 kB)
Co

In [5]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a `TextDataset` over the text file at `file_path`.

    Args:
        file_path: Path of the plain-text training corpus.
        tokenizer: Tokenizer handed to `TextDataset`.
        block_size: Forwarded to `TextDataset` as `block_size` (default 128).

    Returns:
        The constructed `TextDataset`.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Create a `DataCollatorForLanguageModeling` for `tokenizer`.

    Args:
        tokenizer: Tokenizer the collator uses when assembling batches.
        mlm: Forwarded to the collator's `mlm` flag; defaults to False.

    Returns:
        The configured `DataCollatorForLanguageModeling`.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 model on a text file and save it to `output_dir`.

    Args:
        train_file_path: Path of the plain-text training corpus.
        model_name: Name or path passed to `from_pretrained` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer and the model.
        overwrite_output_dir: Forwarded to `TrainingArguments`.
        per_device_train_batch_size: Forwarded to `TrainingArguments`.
        num_train_epochs: Forwarded to `TrainingArguments`.
        save_steps: Forwarded to `TrainingArguments` as the checkpoint interval.

    Returns:
        None. The trainer's log history is printed after training.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not given the tokenizer, so it is saved explicitly
    # to keep `output_dir` loadable on its own.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned model to output_dir before training starts.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never forwarded, so the caller's
        # value was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()
    print(trainer.state.log_history)


In [12]:
# Training configuration shared by all three fine-tuning runs; adjust as needed.
# train_file_paths[i] is fine-tuned into output_dir[i] (left, right, neutral).
# The paths match the files written by the data-split cell (../data/*.txt).
train_file_paths = ["../data/left.txt",
                    "../data/right.txt",
                    "../data/neutral.txt"]

# Base checkpoint every run starts from.
model_name = 'gpt2'

output_dir = ['left-weights',
              'right-weights',
              'neutral-weights']
# Be careful: True allows an existing output directory to be overwritten
# (start from scratch); set to False to protect earlier results.
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

In [13]:
pip install accelerate -U



In [14]:
# It takes about 30 minutes to train in colab.
# Fine-tune one model per corpus, pairing each training file with its output
# directory instead of relying on a hard-coded count of three.
if len(train_file_paths) != len(output_dir):
    raise ValueError("train_file_paths and output_dir must have the same length")

for corpus_path, weights_dir in zip(train_file_paths, output_dir):
    train(
        train_file_path=corpus_path,
        model_name=model_name,
        output_dir=weights_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )



Step,Training Loss


[{'train_runtime': 48.7872, 'train_samples_per_second': 23.572, 'train_steps_per_second': 2.972, 'total_flos': 75121459200000.0, 'train_loss': 3.6110107421875, 'epoch': 5.0, 'step': 145}]




Step,Training Loss


[{'train_runtime': 43.6151, 'train_samples_per_second': 24.991, 'train_steps_per_second': 3.21, 'total_flos': 71202078720000.0, 'train_loss': 3.6652369907924105, 'epoch': 5.0, 'step': 140}]




Step,Training Loss


[{'train_runtime': 14.9576, 'train_samples_per_second': 24.068, 'train_steps_per_second': 3.009, 'total_flos': 23516282880000.0, 'train_loss': 3.7403937445746527, 'epoch': 5.0, 'step': 45}]


In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Return the `GPT2LMHeadModel` loaded from `model_path`."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the `GPT2Tokenizer` loaded from `tokenizer_path`."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_type):
    """Sample a continuation of `sequence` from a saved model and print it.

    Args:
        sequence: Prompt text to continue.
        max_length: Passed to `model.generate` as `max_length`.
        model_type: Path of the saved weights; used to load both the model
            and its tokenizer.

    Returns:
        None. The decoded text is printed.
    """
    checkpoint = model_type
    lm = load_model(checkpoint)
    tok = load_tokenizer(checkpoint)

    prompt_ids = tok.encode(f'{sequence}', return_tensors='pt')

    # Sampling settings: top-k / nucleus sampling, padding with the EOS token id.
    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=lm.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    generated = lm.generate(prompt_ids, **sampling_options)

    # Only the first returned sequence is decoded and shown.
    decoded = tok.decode(generated[0], skip_special_tokens=True)
    print(decoded)

In [None]:
# Ask for the prompt and the generation length, then sample from the model
# saved in 'right-weights'. Each input box is labelled so it is clear which
# value is being requested.
sequence = input("Prompt text: ")
max_len = int(input("Maximum length (max_length for generation): "))
generate_text(sequence, max_len, 'right-weights')
