## Setup Environment

In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [1]:
cd drive/MyDrive/Colab\ Notebooks/4chanParser


/content/drive/MyDrive/Colab Notebooks/4chanParser


In [None]:
pip install -r requirements.txt

## Import and Export

### Save Model

In [None]:
# Write the current model to ./model/.
# `happy_gen` is not created in this cell: one of the cells that build a
# HappyGeneration object (Load Model / Train / Run) has to be run first.
happy_gen.save("./model/")

### Load Model

In [None]:
# Build the GPT-Neo 125M generator, loading from the local model/ directory
# (the same directory the save calls in this notebook write to).
from happytransformer import GENSettings, HappyGeneration

happy_gen = HappyGeneration(
    "GPT-NEO",
    "EleutherAI/gpt-neo-125M",
    load_path="model/",
)

04/25/2022 17:43:19 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [None]:
# Alternative to the 125M cell: build the generator with the GPT-Neo 1.3B
# model name, again loading from the local model/ directory.
# NOTE(review): this reads the same model/ directory as the 125M cell -
# confirm that the weights saved there match the model named here.
from happytransformer import GENSettings, HappyGeneration

happy_gen = HappyGeneration(
    "GPT-NEO",
    "EleutherAI/gpt-neo-1.3B",
    load_path="model/",
)

## Parse 

In [None]:
import requests
from text_preprocessing import preprocess
import time

# Scrape the posts made since the previous run from each board and write them,
# one preprocessed post per line, to ./data/{board}_{lastUpdate}.txt.
# The time of the previous run is stored in LAST_UPDATE_PATH so that a post is
# not collected twice.
LAST_UPDATE_PATH = "data/time/lastUpdate.txt"
API_ROOT = "https://a.4cdn.org"
REQUEST_INTERVAL = 1  # seconds between requests; the 4chan API rules allow at most one request per second
REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned instead of hanging the cell


def _fetch_json(url):
    """Wait out the rate limit, GET `url` and return the decoded JSON body.

    Raises:
        requests.JSONDecodeError: the response body is not valid JSON.
        requests.RequestException: the request itself failed (e.g. timed out).
    """
    time.sleep(REQUEST_INTERVAL)
    return requests.get(url, timeout=REQUEST_TIMEOUT).json()


# Read the timestamp through a context manager so the file handle is closed.
with open(LAST_UPDATE_PATH, "r") as f:
    lastUpdate = int(f.read())
# Taken before scraping starts, so posts made while the scrape is running are
# picked up by the next run.
now = int(time.time())

boards = ["a", "b", "pol", "r9k", "s4s"]

for board in boards:
    # A failure here is deliberately not caught: the run stops before the
    # timestamp is advanced, so nothing is lost for the next attempt.
    pages = _fetch_json(f"{API_ROOT}/{board}/threads.json")
    with open(f'./data/{board}_{lastUpdate}.txt', 'w', encoding='utf-8') as f:
        for page in pages:
            for thread in page['threads']:
                threadNr = thread['no']
                if int(thread['last_modified']) < lastUpdate:
                    # skip if there is nothing new for us to scrape
                    continue

                try:
                    threadJson = _fetch_json(
                        f"{API_ROOT}/{board}/thread/{threadNr}.json")
                except requests.JSONDecodeError:
                    # the thread did not return a JSON body; skip it
                    continue

                for post in threadJson['posts']:
                    if int(post['time']) < lastUpdate:
                        # only get posts that are newer than the last update
                        continue

                    # Posts without a 'com' field are skipped. Looking the key
                    # up with .get() keeps errors raised inside preprocess()
                    # from being silently swallowed.
                    comment = post.get('com')
                    if comment is None:
                        continue

                    postText = preprocess(comment)
                    if postText:
                        f.write(f"{postText}\n")

# Only reached when every board was scraped: remember when this run started.
with open(LAST_UPDATE_PATH, "w") as f:
    f.write(str(now))

## Train

In [3]:
# Build the GPT-Neo 125M generator that the training cell fine-tunes, loading
# from the local model/ directory.
from happytransformer import GENSettings, HappyGeneration

happy_gen = HappyGeneration(
    "GPT-NEO",
    "EleutherAI/gpt-neo-125M",
    load_path="model/",
)

04/29/2022 16:25:04 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [None]:
import os
import torch
from happytransformer import HappyGeneration, GENTrainArgs

# Fine-tune `happy_gen` on every .txt file in ./data, one file at a time.
# After each file the model is saved to ./model/ and the file is moved to
# ./data/parsed so that it is not trained on again.
# `happy_gen` is not created here; a cell that builds the HappyGeneration
# object has to be run first.

args = GENTrainArgs(num_train_epochs=1)

directory = './data'
movedir = './data/parsed'

# os.rename fails when the destination folder is missing; create it up front
# instead of finding that out after a full training pass.
os.makedirs(movedir, exist_ok=True)

# sorted() gives a stable processing order (os.listdir order is arbitrary).
for file in sorted(os.listdir(directory)):
    if not file.endswith('.txt'):
        continue

    fullpath = os.path.join(directory, file)
    newpath = os.path.join(movedir, file)

    # An empty file has nothing to learn from: archive it without training.
    if os.path.getsize(fullpath) == 0:
        os.rename(fullpath, newpath)
        continue

    # The training arguments must be passed explicitly; without args=...
    # the epoch setting above would have no effect on train().
    happy_gen.train(fullpath, args=args)

    # Save after every file so finished work survives an interrupted run,
    # and only then move the file out of the input folder.
    happy_gen.save("./model/")
    os.rename(fullpath, newpath)

## Run

In [None]:
# Build the generator used for text generation, with the GPT-Neo 1.3B model
# name and loading from the local model/ directory.
# NOTE(review): the training section of this notebook uses the 125M model name
# with the same model/ directory - confirm that the saved weights match the
# model named here.
from happytransformer import GENSettings, HappyGeneration

happy_gen = HappyGeneration(
    "GPT-NEO",
    "EleutherAI/gpt-neo-1.3B",
    load_path="model/",
)

04/26/2022 21:07:31 - INFO - happytransformer.happy_transformer -   Using model: cpu


In [None]:
# Generation settings; passed as `args` to happy_gen.generate_text().
# GENSettings is imported by the model-loading cell of this section.
settings = GENSettings(
    max_length=64,
    no_repeat_ngram_size=2,
    do_sample=True,
    top_k=100,
    temperature=0.7,
    early_stopping=True,
)

In [None]:
# Text handed to the model as the prompt.
# NOTE(review): the prompt is empty here - confirm that generate_text accepts
# an empty string, or set a non-empty seed phrase before running.
phrase = ""
# Uses `happy_gen` and `settings` defined in the two cells of this section.
result = happy_gen.generate_text(phrase,  args=settings)
# Print the prompt followed by the text returned by the model.
print(phrase + result.text)