# Final Project - Data Processing

Run this cell to mount Google Drive and check whether CUDA is available.

In [None]:
import os, sys
from google.colab import drive

import torch

# help with mounting:
# https://edstem.org/us/courses/69019/discussion/6372778?comment=14964937 (Ed #1086)
# also other assignment notebooks

# Project folder on Google Drive. Defined once so the working directory and
# the import path cannot drift apart.
project_dir = '/content/drive/MyDrive/OMSCS/finalproject/'

drive.mount('/content/drive')

# Relative paths used later in the notebook ("data/...") resolve against this folder.
os.chdir(project_dir)
print(os.getcwd())

# Let Python import modules stored in the project folder. The guard keeps the
# cell idempotent: re-running it does not append duplicate entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("You are using device: %s" % device)

In [None]:
!pip install rouge-score

## Data Cleaning

Use the following code to import all necessary modules, set global data, and define a few utility methods.

The data-processing helpers imported from `data_processor.py` are the only code that lives in a separate file.

In [None]:
# IMPORTS
# only custom import is data_processor.py
from data_processor import DataEncoder, Vocabulary, create_word_lookup_csv, create_clean_csv, get_encoded_bill_data
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import yaml

# Paths to the original (uncleaned) data.
data_csv = "data/orig_data.csv"
train_csv = "data/train_data.csv"

# Output paths for the cleaned and encoded data produced by the helper
# functions defined below (clean_all_data, create_encoded_data).
clean_data_csv = "data/orig_data_clean.csv"
clean_train_csv = "data/train_data_clean.csv"
encoded_train_csv = "data/encoded_train.csv"

# Base file name for vocabulary lookups (may be modified). create_word_lookup()
# turns it into "data/<n>_<word_lookup_csv>", where <n> is the vocabulary size or "all".
word_lookup_csv = "word_lookup.csv"

# Special tokens handed to create_word_lookup_csv when a lookup is built.
special_tokens = ['<pad>', '<unk>', '<start>', '<stop>']
# NOTE(review): presumably the vocabulary index of '<pad>' - confirm against Vocabulary.
pad_idx = 0
# Passed to get_encoded_bill_data as the train/validation split setting
# (presumably the fraction of rows used for training - confirm in data_processor.py).
train_percent = 0.8

# cleaning methods
def clean_all_data():
    """Rebuild both cleaned CSV files from their original counterparts.

    Writes ``clean_data_csv`` from ``data_csv`` and ``clean_train_csv`` from
    ``train_csv`` by delegating to ``create_clean_csv``.
    """
    create_clean_csv(data_csv, clean_data_csv)
    # The training file must be cleaned from train_csv; the previous version
    # re-cleaned data_csv, so clean_train_csv was a copy of the full data set.
    create_clean_csv(train_csv, clean_train_csv)


def create_word_lookup(n_vocab=None, include_nums=True):
    """Build a word-lookup CSV from the cleaned data and return its path.

    Args:
        n_vocab: forwarded to ``create_word_lookup_csv`` as the vocabulary
            limit; ``None`` produces the "all" lookup.
        include_nums: currently unused; kept so existing callers keep working.

    Returns:
        The path of the generated lookup, ``data/<n>_<word_lookup_csv>``, where
        ``<n>`` is ``n_vocab`` or ``"all"`` when ``n_vocab`` is ``None``.
    """
    # Identity check: only a real None selects the "all" file name.
    size_label = "all" if n_vocab is None else n_vocab
    lookup_csv = f"data/{size_label}_{word_lookup_csv}"
    create_word_lookup_csv(clean_data_csv, lookup_csv, special_tokens, n_vocab)
    return lookup_csv


def create_encoded_data(vocab):
    """Encode the cleaned training CSV with ``vocab`` and write the result.

    Reads ``clean_train_csv`` and writes ``encoded_train_csv`` through a
    ``DataEncoder``.
    """
    # NOTE(review): the meaning of the positional arguments 2 and 3 is defined
    # by DataEncoder in data_processor.py - confirm there.
    encoder = DataEncoder(vocab, 2, 3)
    encoder.encode_data_to_csv(clean_train_csv, encoded_train_csv)


def get_config(config_file):
    """Read a YAML configuration file and return the parsed contents.

    Args:
        config_file: path of the YAML file to open.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.
    """
    # source: assignment 4 code
    with open(config_file, "r") as stream:
        return yaml.safe_load(stream)


Only run this cell if you need to rebuild any clean csv files.

In [None]:
#clean_all_data()




---



## Configuration

Set all config variables. You could use a config file, but for debugging purposes, it's easier to set them here:

In [None]:
# SET CONFIGS (easier than using config file)
# max_len: passed as both length arguments to get_encoded_bill_data further down.
max_len = 256
# max_vocab: number of top words kept in the reduced vocabulary
# (the argument of create_word_lookup(max_vocab)).
max_vocab = 1024

## Vocabulary and Data

Run this code to get all necessary training data. Ensure that the configuration values above are set appropriately.

In [None]:
# Full-vocabulary lookup built from ALL words - should only need to be built once.
# always make all csv with ALL data, including numbers
# Uncomment the next line to rebuild the CSV; otherwise the existing file is loaded.
#all_lookup_csv = create_word_lookup()
all_lookup_csv = "data/all_word_lookup.csv"
all_vocab = Vocabulary(all_lookup_csv)
# Sanity output: the path that was loaded and the vocabulary size.
print(all_lookup_csv)
print(len(all_vocab))

In [None]:
# Encode the cleaned training data with the full vocabulary - should only need to run once.
# always encode ALL data using ALL vocab
# Uncomment the next line to rebuild encoded_train_csv.
#create_encoded_data(all_vocab)
print(encoded_train_csv)

In [None]:
# create a word lookup using ONLY top max_vocab words, may or may not include numbers
# Uncomment the next line to rebuild the CSV; otherwise the existing file is loaded.
#n_lookup_csv = create_word_lookup(max_vocab)
# The path mirrors the naming used by create_word_lookup ("data/<n>_<word_lookup_csv>"),
# so it follows max_vocab instead of a hardcoded 1024 that could silently disagree with it.
n_lookup_csv = f"data/{max_vocab}_{word_lookup_csv}"
n_vocab = Vocabulary(n_lookup_csv)
len_vocab = len(n_vocab)
print(n_lookup_csv)
print(len_vocab)

In [None]:
# get data
# data will be encoded with <unk> if it is out of the range of n_vocab
# max_len is used for both length arguments.
# NOTE(review): the positional literals 0, 1, 3 are not explained in this notebook;
# confirm their meaning against get_encoded_bill_data in data_processor.py
# (0 possibly corresponds to pad_idx).
train_data, valid_data = get_encoded_bill_data(encoded_train_csv, train_percent,
                                               max_len, max_len, 0, 1, 3, len_vocab)
print("train_data:", len(train_data))
print("valid_data:", len(valid_data))



## Training / Translation Definitions

Below are the definitions for training loops, evaluations, and translations.

In [None]:
#####################################################
# Training Loops                                    #
#####################################################
import torch
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer, scoring

# source for training:
# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
# Assignment 3: utils.py code
def train_loop(model, train_dataloader, valid_dataloader, optimizer, loss_fn, epochs, device, vocab, plot_name):
    """Train and validate ``model`` for ``epochs`` epochs, then plot the losses.

    Each epoch runs ``train_single_epoch`` on ``train_dataloader`` followed by
    ``evaluate_enc`` on ``valid_dataloader`` and records both average losses.
    After the last epoch the two histories are passed to
    ``plot_and_print_data`` with the file name ``plot_<plot_name>.png`` and
    plot type ``"loss"``.

    Returns:
        None.
    """
    figure_name = f"plot_{plot_name}.png"
    train_history, valid_history = [], []

    for epoch in range(epochs):
        print(f"----- EPOCH {epoch} -----")

        # training pass
        epoch_train_total, epoch_train_avg = train_single_epoch(
            model, train_dataloader, optimizer, loss_fn, device, vocab)
        train_history.append(epoch_train_avg)
        print(f"Train Loss: {epoch_train_total}, Average Loss: {epoch_train_avg}")

        # validation pass
        epoch_valid_total, epoch_valid_avg = evaluate_enc(
            model, valid_dataloader, loss_fn, device, vocab)
        valid_history.append(epoch_valid_avg)
        print(f"Validation Loss: {epoch_valid_total}, Average Loss: {epoch_valid_avg}")

    print("Completed Train Loop")
    plot_and_print_data(epochs, train_history, valid_history, figure_name, "loss")


def plot_and_print_data(epochs, train_data, valid_data, fname, plot_type):
    """Print two per-epoch series and save them as a line plot.

    Args:
        epochs: number of epochs; the x axis is ``0 .. epochs - 1``.
        train_data: per-epoch values for the training curve.
        valid_data: per-epoch values for the validation curve.
        fname: base file name; the figure is saved as ``<plot_type>_<fname>``.
        plot_type: ``"loss"`` or ``"rouge"`` selects the y-axis label; any
            other value is labelled ``"Error"``.
    """
    # print data
    print(f"Train {plot_type}:", train_data)
    print(f"Validation {plot_type}:", valid_data)

    # choose the y-axis label for this plot type
    if plot_type == "rouge":
        y_label = "Rouge Scores"
    elif plot_type == "loss":
        y_label = "Average Loss"
    else:
        y_label = "Error"

    plot_file = f"{plot_type}_{fname}"
    title_str = f"{y_label} Over {epochs} Epochs"

    # plotting sources:
    # https://pythonguides.com/python-plot-multiple-lines/
    # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html
    epoch_axis = list(range(epochs))
    plt.plot(epoch_axis, train_data, label="Training")
    plt.plot(epoch_axis, valid_data, label="Validation")
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.legend()
    plt.title(title_str)
    plt.tight_layout()
    plt.savefig(plot_file)
    # clear the current figure so the next plot starts empty
    plt.clf()

    print(f"Plot Saved: {plot_file}")


# source for training:
# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
# Assignment 3: utils.py code
def train_single_epoch(model, dataloader, optimizer, loss_fn, device, vocab):
    """Run one training pass over ``dataloader`` and return the loss totals.

    Args:
        model: called as ``model(text, tgt_summary)``; put into train mode.
        dataloader: iterable of batches where ``batch[0]`` is the text and
            ``batch[1]`` the target summary; must support ``len()``.
        optimizer: zeroed before and stepped after every batch.
        loss_fn: called with the flattened output and the flattened target.
        device: device the batch tensors are moved to.
        vocab: only used to translate the first batch for the sanity printout.

    Returns:
        ``(total_loss, avg_loss)``: the sum of the per-batch loss values and
        that sum divided by the number of batches. ``avg_loss`` is NaN when
        the dataloader yields no batches.
    """
    total_loss = 0.
    model.train()
    for i, data in enumerate(dataloader):
        # text shape is (batch_size, max_len_text)
        # summary shape is (batch_size, max_len_summary)
        text = data[0].to(device)
        tgt_summary = data[1].long().to(device)

        if (i+1) % 100 == 0:
            print(f"batch {i+1}")

        # zero grad
        optimizer.zero_grad()

        # get outputs
        # NOTE(review): the same tgt_summary is fed to the model and used as the
        # loss target below; confirm the model shifts/masks the target
        # internally, otherwise it can see the token it is asked to predict.
        out_summary = model(text, tgt_summary)

        if i == 0:
            # sanity checks
            print(f"Train Tgt {i}:",  translate_single_sample(vocab, tgt_summary[0]))
            print(f"Train Out {i}:", translate_single_pred_sample(vocab, out_summary[0]))
            print("out_summary shape", out_summary.shape)
            print("out_summary[0]", out_summary[0])

        # flatten so each row of the output lines up with one target entry
        out_summary = out_summary.reshape(-1, out_summary.shape[-1])
        tgt_summary = tgt_summary.reshape(-1)

        # compute loss
        loss = loss_fn(out_summary, tgt_summary)
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

    # An empty dataloader has no meaningful average: report NaN instead of
    # raising ZeroDivisionError.
    num_batches = len(dataloader)
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    return total_loss, avg_loss


# source for evaluating:
# Assignment 3: utils.py code
def evaluate_enc(model, dataloader, loss_fn, device, vocab):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: called as ``model(text, tgt_summary)``; put into eval mode.
        dataloader: iterable of batches where ``batch[0]`` is the text and
            ``batch[1]`` the target summary; must support ``len()``.
        loss_fn: called with the flattened output and the flattened target.
        device: device the batch tensors are moved to.
        vocab: only used to translate the first batch for the sanity printout.

    Returns:
        ``(total_loss, avg_loss)``: the sum of the per-batch loss values and
        that sum divided by the number of batches. ``avg_loss`` is NaN when
        the dataloader yields no batches.
    """
    model.eval()
    total_loss = 0.
    # no gradients are needed for evaluation
    with torch.no_grad():
        for i, data in enumerate(dataloader):
            # text shape is (batch_size, max_len_text)
            # summary shape is (batch_size, max_len_summary)
            text = data[0].to(device)
            tgt_summary = data[1].long().to(device)

            if (i+1) % 100 == 0:
                print(f"batch {i+1}")

            # get outputs
            # NOTE(review): the target summary is also the model input here;
            # confirm this matches how the model is meant to be evaluated.
            out_summary = model(text, tgt_summary)

            if i == 0:
                # sanity checks
                print(f"Valid Tgt {i}:",  translate_single_sample(vocab, tgt_summary[0]))
                print(f"Valid Out {i}:", translate_single_pred_sample(vocab, out_summary[0]))
                print("out_summary shape", out_summary.shape)
                print("out_summary[0]", out_summary[0])

            # flatten so each row of the output lines up with one target entry
            out_summary = out_summary.reshape(-1, out_summary.shape[-1])
            tgt_summary = tgt_summary.reshape(-1)

            # compute loss
            loss = loss_fn(out_summary, tgt_summary)
            total_loss += loss.item()

    # An empty dataloader (e.g. no validation rows) has no meaningful average:
    # report NaN instead of raising ZeroDivisionError.
    num_batches = len(dataloader)
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    return total_loss, avg_loss


## Model Definitions


## Main Program

## Sources


In [None]:
# SOURCES
# Assignment 3: utils.py code
# Assignment 3: Machine_Translation.ipynb
# Assignment 3 Transformer.py
# partner Santiago's utils.py
# https://edstem.org/us/courses/69019/discussion/6372778?comment=14964937
# https://edstem.org/us/courses/69019
# https://edstem.org/us/courses/69019/discussion/6188507
# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
# https://pythonguides.com/python-plot-multiple-lines/
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html
# https://pytorch.org/tutorials/beginner/introyt/trainingyt.html
# https://www.geeksforgeeks.org/python-string-join-method/
# https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
# https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html
# https://github.com/pytorch/examples/blob/main/word_language_model/model.py
# https://stackoverflow.com/questions/41488279/neural-network-always-predicts-the-same-class