In [1]:
import pandas as pd
import sklearn
import string
import numpy as np
import matplotlib.pyplot as plt
import pickle
from convokit import Corpus, download


In [None]:
# Fetch the "conversations-gone-awry-corpus" dataset via ConvoKit's download()
# and build a Corpus from the path it returns.
corpus_dir = download("conversations-gone-awry-corpus")
corpus = Corpus(filename=corpus_dir)


In [None]:
# Print summary statistics for the loaded corpus.
corpus.print_summary_stats()

In [None]:
from pandas import json_normalize
import json

In [None]:
# NOTE(review): this expects utterances.jsonl in the working directory; nothing
# in the visible cells copies it there from the downloaded corpus -- confirm.
jsonl_file = 'utterances.jsonl'

# JSON Lines: one JSON object per line. Read as UTF-8 explicitly (the platform
# default encoding is not guaranteed to be UTF-8) and skip blank lines, which
# json.loads would reject.
with open(jsonl_file, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# json_normalize already returns a DataFrame with nested keys flattened.
flat_data = json_normalize(data)

# Keep only the message text. .copy() gives df its own data so later column
# assignments do not act on a slice of flat_data.
df = flat_data[['text']].copy()

# Persist the text-only frame.
csv_file = 'provoking_file.csv'
df.to_csv(csv_file, index=False)

In [None]:
# Preview the first rows of the text-only frame.
df.head()

In [None]:
def remove_pun_and_numbers(text):
    """Lower-case ``text`` and strip ASCII punctuation and digits from it.

    The argument is coerced with ``str()`` first, so non-string values are
    accepted. Whitespace is left untouched.
    """
    lowered = str(text).lower()
    # The third argument of maketrans lists the characters to delete.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    return lowered.translate(strip_table)

In [None]:
def load_model(path='model.pkl'):
    """Load a pickled model object from disk.

    Args:
        path: Location of the pickle file. Defaults to ``'model.pkl'`` in the
            working directory, so existing zero-argument calls keep working.

    Returns:
        Whatever object was pickled into ``path``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code embedded in the file, so
    # only point this at pickles produced by a trusted source.
    with open(path, 'rb') as file:
        return pickle.load(file)
# Load the pickled model once into a global; only_negatives() reads this name.
model = load_model()

In [None]:
def only_negatives(message):
    """Return ``message`` if the global ``model`` labels it 'Negative', else None.

    The message is wrapped in a one-element list before being handed to
    ``model.predict``, and the prediction is compared against ``['Negative']``.
    """
    prediction = model.predict([message])
    return message if prediction == ['Negative'] else None

In [None]:
# Run every message through only_negatives(): messages the model labels
# 'Negative' are kept, all others become None.
df['text'] = df['text'].apply(only_negatives)


In [None]:
# Total row count at this point; rows whose text only_negatives() set to None
# are still included because nothing has been dropped yet.
print(len(df))
# Leave the preview as the cell's last expression so the notebook renders it
# as a table instead of printing the plain-text repr.
df.head()

In [None]:
# Discard rows whose text is missing, i.e. the ones only_negatives() set to None.
df = df.dropna(subset=['text'])
df.head()

In [None]:
# Replace the gappy index left by dropna with a fresh 0..n-1 range.
df = df.reset_index(drop=True)
# Number of messages that remain.
len(df.text)

In [None]:
# Split the filtered messages into two equal halves: the first half becomes the
# inputs, the second half the outputs they are paired with by position.
split_index = len(df) // 2
input_df = df.iloc[:split_index].copy()
# Stop at 2 * split_index: with an odd row count an open-ended slice would give
# output_df one more row than input_df, leaving an output with no input.
output_df = df.iloc[split_index:2 * split_index].copy()

df.info()

In [None]:
# Rename the single 'text' column in each half and give both a fresh 0..n-1
# index so that concat aligns them row by row.
input_df = input_df.rename(columns={'text': 'input'}).reset_index(drop=True)
output_df = output_df.rename(columns={'text': 'output'}).reset_index(drop=True)

# Place the two halves side by side (axis=1 adds columns, not rows).
# join='inner' keeps only row positions present in both halves, so an unpaired
# trailing row is not carried along with a missing value.
df = pd.concat([input_df, output_df], axis=1, join='inner')

# Normalise both columns. apply() returns a new Series, so the result has to be
# assigned back; calling it without assignment leaves df unchanged.
df['input'] = df['input'].apply(remove_pun_and_numbers)
df['output'] = df['output'].apply(remove_pun_and_numbers)

df.head()


In [None]:
# Check column dtypes and non-null counts of the paired input/output frame.
df.info()

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load the pre-trained GPT-2 weights and the matching tokenizer.
# NOTE(review): this rebinds the global `model`, which earlier in the notebook
# holds the pickled model that only_negatives() calls. Re-running the filtering
# cells after this one requires reloading that pickled model first.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Build one training line per pair: "<input><eos><output>".
eos_token = tokenizer.eos_token
df["text"] = df["input"] + eos_token + df["output"]

# Write the training corpus as plain text, one pair per line.
# Series.to_csv would apply CSV quoting here: any line containing a comma, a
# double quote or a newline gets wrapped in double quotes with embedded quotes
# doubled, and that CSV syntax would end up in the language-model data.
# Rows with a missing side evaluate to NaN above and are skipped.
with open("processed_dataset.txt", "w", encoding="utf-8") as dataset_file:
    for line in df["text"].dropna():
        dataset_file.write(line + "\n")

# Tokenise the text file into fixed blocks of 128 tokens.
# TextDataset caches its tokenised blocks on disk next to file_path;
# overwrite_cache=True forces a rebuild so that a cache from an earlier run is
# not silently reused after processed_dataset.txt has changed.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="processed_dataset.txt",
    block_size=128,
    overwrite_cache=True,
)
# mlm=False selects causal (next-token) language modelling rather than masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="chatbot_output",
    overwrite_output_dir=True,
    num_train_epochs=1,
    # changed device train batch size from 4 -> 1, less memory demanding
    per_device_train_batch_size=1,
    save_steps=10_000,
    gradient_accumulation_steps=1,
    save_total_limit=2,
    logging_steps=500,
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
