In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip3 install -U git+https://github.com/huggingface/accelerate.git

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, AutoConfig
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# Read the Enron CSV into a DataFrame and preview its first five rows.
df = pd.read_csv('/kaggle/input/enron/enron7.csv')
df.head()

In [None]:
# Hold out 200 randomly chosen rows (fixed seed) for the generation / BLEU cells below
# and remove them from the frame used for training.
# NOTE: this cell is not idempotent -- re-running it removes another 200 rows from `df`.
sampled_df = df.sample(n=200, random_state=42)
df = df.drop(index=sampled_df.index).reset_index(drop=True)
sampled_df = sampled_df.reset_index(drop=True)

In [None]:
# Load the pretrained GPT-2 tokenizer (fetched by name, 'gpt2').
base_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Special-token strings used to frame every training example and every prompt:
# `bos` opens the text, `body` separates the received e-mail from the response,
# and `eos` closes the text.
bos = '<|endoftext|>'
eos = '<|EOS|>'
body = '<|body|>'
# Only referenced by the commented-out dictionary entry below; currently unused.
additional_special_tokens = [body]

# `body` is registered as the tokenizer's sep_token; '<pad>' becomes the pad_token.
# NOTE(review): the order of the entries presumably determines the ids given to the
# newly added tokens -- do not reorder without retraining; confirm against the
# tokenizer implementation.
special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': '<pad>',
                       'sep_token': body} 
                      #  'additional_special_tokens':additional_special_tokens}

# Register the special tokens with the tokenizer; the return value is kept in
# `num_added_toks` but is not used afterwards.
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# Model configuration: start from the 'gpt2' config and pass in the special-token ids
# taken from the updated tokenizer.
config = AutoConfig.from_pretrained('gpt2', 
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    sep_token_id=base_tokenizer.sep_token_id,
                                    output_hidden_states=False)

# Load the pretrained GPT-2 language-model weights with the custom configuration.
base_model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)

# Resize the model's token embeddings to the tokenizer's new vocabulary size.
base_model.resize_token_embeddings(len(base_tokenizer))

In [None]:
# Cast both text columns to str so they can be joined into a single training string.
df = df.astype({'Received': str, 'Response': str})

In [None]:
max_sequence_length = 512  # Maximum length of one training string, in characters


def prepare_text(x):
    """Build the training string ``bos Received body Response eos`` for one row.

    The previous implementation sliced the *joined* string to
    ``max_sequence_length`` characters, which for long e-mails cut off the
    closing ``eos`` marker, the response, or even the ``body`` separator.
    Here only the two free-text fields are shortened, so the three markers are
    always present. Rows that already fit are returned exactly as before.

    Parameters
    ----------
    x : mapping with string values under 'Received' and 'Response'
        (a DataFrame row when used with ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        The framed text, at most ``max_sequence_length`` characters long
        (provided the limit is large enough to hold the markers themselves).
    """
    received, response = x['Received'], x['Response']

    # Characters consumed by the three markers plus the four joining spaces.
    overhead = len(bos) + len(body) + len(eos) + 4
    budget = max(max_sequence_length - overhead, 0)

    if len(received) + len(response) > budget:
        half = budget // 2
        if len(received) <= half:
            # Short prompt: the response may use whatever the prompt leaves over.
            response = response[:budget - len(received)]
        elif len(response) <= half:
            # Short response: the prompt may use whatever the response leaves over.
            received = received[:budget - len(response)]
        else:
            # Both are long: split the budget evenly between them.
            received, response = received[:half], response[:budget - half]

    return ' '.join([bos, received, body, response, eos])


df['text'] = df.apply(prepare_text, axis=1)

In [None]:
# Split the remaining rows 90% / 10% into training and validation frames (fixed seed).
df_train_news, df_val_news = train_test_split(df, train_size = 0.9, random_state = 77)

In [None]:
from datasets import Dataset

# Wrap only the 'text' column of each split in a `datasets.Dataset`.
train_dataset = Dataset.from_pandas(df_train_news[['text']])
val_dataset = Dataset.from_pandas(df_val_news[['text']])

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``base_tokenizer``.

    Parameters
    ----------
    examples : mapping whose 'text' entry holds the batch of strings to encode.

    Returns
    -------
    Whatever ``base_tokenizer`` returns for the batch. Sequences are padded to
    the longest one in the batch and, as a safety net, truncated to the
    tokenizer's maximum length so that an unexpectedly long text cannot exceed
    the model's position limit.
    """
    return base_tokenizer(examples['text'], padding=True, truncation=True)

In [None]:
# Tokenize both splits with identical settings: batched calls, single process.
map_kwargs = dict(batched=True, num_proc=1)

tokenized_train_dataset = train_dataset.map(tokenize_function, **map_kwargs)
tokenized_val_dataset = val_dataset.map(tokenize_function, **map_kwargs)

In [None]:
model_path = './email_v2'

# Settings for the fine-tuning run; model output and logs both go to `model_path`.
training_args = TrainingArguments(
    output_dir=model_path,
    logging_dir=model_path,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=200,
    weight_decay=0.01,
    prediction_loss_only=True,
    save_steps=10000,
)

# Collator built with masked-language-modelling switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=base_tokenizer, mlm=False)

# Tie together the model, the settings, the collator and the two tokenized splits.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
)


In [None]:
# Run the fine-tuning loop configured above (20 epochs per `training_args`).
trainer.train()

In [None]:
# Persist the fine-tuned model and the tokenizer (with its added special tokens).
# NOTE(review): save_model() is called without a path, so it presumably writes to the
# trainer's output_dir (`model_path`) -- confirm; the tokenizer is saved there explicitly.
trainer.save_model()
base_tokenizer.save_pretrained(model_path)

In [None]:
def pretty_print(text, max_len_line=100):
    """Print ``text`` word-wrapped to roughly ``max_len_line`` characters per line.

    The text is split on single spaces. A word that is exactly ``'\\n'`` forces
    a line break. A word is moved to the next line when appending it would make
    the current line longer than ``max_len_line``; a single word longer than
    the limit is printed on a line of its own rather than being split.

    Compared with the previous version, lines no longer start with a spurious
    space, no blank line is printed before an over-long first word, and the
    unused ``len_line`` local is gone.

    Parameters
    ----------
    text : str
        The text to print.
    max_len_line : int, optional
        Target maximum line width in characters (default 100).

    Returns
    -------
    None. The wrapped text is written to standard output.
    """
    line = ''
    for w in text.split(' '):
        if w == '\n':
            # Explicit line break requested by the text itself.
            print(line)
            line = ''
            continue
        candidate = line + ' ' + w if line else w
        if line and len(candidate) > max_len_line:
            # The word does not fit: flush the current line and start a new one.
            print(line)
            line = w
        else:
            line = candidate
    print(line)

In [None]:
def generate_n_text_samples(model, tokenizer, input_text, device, n_samples = 5):
    """Sample ``n_samples`` continuations of ``input_text`` from ``model``.

    Parameters
    ----------
    model : object providing ``.to(device)`` and ``.generate(...)``
        (used here with a GPT-2 language model).
    tokenizer : object providing ``.encode(...)`` and ``.decode(...)``.
    input_text : str
        The prompt to continue.
    device : device on which the prompt tensor and the model are placed.
    n_samples : int, optional
        Number of sequences to request from ``generate`` (default 5).

    Returns
    -------
    list of str
        One decoded text (special tokens stripped) per generated sequence.
        Previously the ``return`` sat inside the decoding loop, so only the
        first sample was ever returned (and ``None`` if nothing was generated).
    """
    text_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    model = model.to(device)

    # Sampling-based decoding; max_length is the cap passed to `generate`.
    generated_text_samples = model.generate(
        text_ids,
        max_length=512,
        num_return_sequences=n_samples,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        top_p=0.92,
        temperature=.85,
        do_sample=True,
        top_k=125,
        early_stopping=True,
    )

    return [
        tokenizer.decode(t, skip_special_tokens=True)
        for t in generated_text_samples
    ]


In [None]:
# Prefer the first CUDA GPU when one is available, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [None]:
# trained model loading
# Load from the directory the fine-tuned model and tokenizer are saved to by this
# notebook ('./email_v2'). The previous value './email_v1' is never written here, so
# a fresh Restart & Run All failed at this cell.
model_path = './email_v2'

my_model = GPT2LMHeadModel.from_pretrained(model_path)
my_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Re-read the marker strings from the saved tokenizer so prompts are framed with
# exactly the tokens the loaded model was trained with.
bos = my_tokenizer.bos_token
eos = my_tokenizer.eos_token
sep = my_tokenizer.sep_token

# Maps each prompt to the generated response.
emails = {}
# prompts=[
#     "Hello! I hope this email finds you well. I am reaching out to seek permission for a university-sponsored travel opportunity. The trip is related to [purpose] and I believe it would greatly contribute to my academic and personal growth. I kindly request your approval for this travel. Best regards.",
#     "Good morning, sir/ma'am! I am writing to inform you about a situation where certain resources in the library have been found spoilt or damaged. I kindly request your intervention in addressing this issue and ensuring that the necessary repairs or replacements are made. Thank you for your attention to this matter. Regards.",
#     "Hello! I hope you're having a productive day. I am reaching out to highlight a lack of resources in [specific area/department]. This shortage is hindering our ability to effectively carry out academic activities. I kindly request your assistance in addressing this issue and providing the necessary resources. Best regards.",
#     "Good morning, sir/ma'am! I am writing to seek permission for [specific activity/event] that is essential for my academic progress. I kindly request your approval for this activity as it would greatly contribute to my learning experience. Thank you for your understanding and support. Regards.",
#     "Hello! I hope this email finds you well. I am reaching out to bring to your attention the need for additional resources in the [specific area/department]. The current shortage is adversely affecting our ability to perform our duties. I kindly request your prompt action in resolving this matter. Best regards.",
#     "Good morning, sir/ma'am! I am writing to request permission to use a university facility for [specific purpose/activity]. I believe this activity is important for the student body and would greatly benefit the community. I kindly request your approval for this request. Thank you. Regards.",
#     "Hello! I hope you're having a wonderful day. I am reaching out to report the lack of proper equipment in [specific area/department]. This shortage is impeding our ability to conduct practical sessions effectively. I kindly request your attention to this matter and prompt action in providing the necessary equipment. Best regards.",
#     "Hello! Can you guide me on the textbooks I should obtain for the course 'Artificial Intelligence'? Best regards.",
#     "Good morning, sir/ma'am! I'm curious to know about the textbooks required for the course 'Introduction to Sociology.' Can you assist me? Cheers!",
#     "Dear Sir/Ma'am, could you please provide me with the information on the textbooks required for the course 'Introduction to Sociology'? Kind regards.",
#     "Hello there! Can you inform me about the mandatory textbooks for the course 'Introduction to Sociology'? Best wishes.",
#     "Good morning, sir/ma'am! I need information on the textbooks I must have for the course 'Introduction to Sociology.' Can you help me with that? Regards.",
#     "My dear, how far everything? Don't play too much.",
#     "Hello, I am a freelance writer interested in contributing to your magazine. Can you please provide me with the submission guidelines and any specific topics of interest? Thank you. Best regards.",
# ]
# Hand-written prompts used for a quick qualitative check of the fine-tuned model.
prompts = [
    "Could you please do me a favor ? I would like to read your current title policy to see what it says about easements . You should have received a copy during your closing . ",
    "How are you doing? I have been trying to reach you. What's up?",
]

# Frame each prompt as "<bos> prompt <sep>", sample one continuation and store it
# with the prompt text removed.
for p in prompts:
    prompt = bos + ' ' + p + ' ' + sep
    samples = generate_n_text_samples(my_model, my_tokenizer, prompt, device, n_samples=1)
    emails[p] = samples[0].replace(p, '')

# Show every prompt in bold (ANSI escape codes) followed by its wrapped response.
for prompt, response in emails.items():
    print(f'\033[1m{prompt}\033[0m')
    pretty_print(response)
    print()

In [None]:

emails = {}
# `sampled_df` was split off before the training frame's text columns were cast to
# str, so cast here: a non-string value (e.g. a missing cell) would otherwise make
# the ' '.join below raise a TypeError.
prompts = sampled_df['Received'].astype(str)

# Frame each held-out e-mail as "<bos> prompt <sep>", sample one response and store it
# with the prompt text removed.
for p in prompts:
    prompt = ' '.join([bos, p, sep])
    content = generate_n_text_samples(my_model, my_tokenizer, prompt, device, n_samples=1)[0]
    emails[p] = content.replace(p, '')

# Create a new column 'ai_response' in 'sampled_df' with the generated responses
# (looked up with the same string-cast prompts that were used as dictionary keys).
sampled_df['ai_response'] = prompts.map(emails)

# Calculate BLEU score or perform further evaluation using sampled_df['ai_response']

# Print the prompts and generated responses
for prompt, response in emails.items():
    print('\033[1m' + prompt + '\033[0m')
    pretty_print(response)
    print()

In [None]:
import statistics
from nltk.translate.bleu_score import sentence_bleu

# Sentence-level BLEU of each generated response against the real response.
# sentence_bleu expects a *list of tokenized references* and a *tokenized hypothesis*;
# passing raw strings (as before) scores individual characters against
# single-character "references" and yields meaningless numbers.
scores = []

for reference, candidate in zip(sampled_df['Response'], sampled_df['ai_response']):
    # str() guards against non-string cells (e.g. missing values) in the held-out rows.
    reference_tokens = str(reference).split()
    candidate_tokens = str(candidate).split()
    scores.append(sentence_bleu([reference_tokens], candidate_tokens))

# Mean BLEU over all held-out rows.
print(statistics.mean(scores))

In [None]:
# Shell out to `zip` to archive /kaggle/working/ recursively into file.zip
# (created in the current working directory).
# NOTE(review): the zip_dir() helper defined below also writes 'file.zip' by default --
# the two steps look redundant; confirm which one is intended.
!zip -r file.zip /kaggle/working/

In [None]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory = os.curdir, file_name = 'file.zip'):
    """
    Zip all the files in a directory (recursively).

    Note: the process' working directory is changed to `directory`, and the
    archive is created relative to it.

    Parameters
    _____
    directory: str
        directory that needs to be zipped, default is the current working directory

    file_name: str
        the name of the zipped file (including .zip), default is 'file.zip'

    Returns
    _____
    A FileLink (hyperlink) that can be used to download the zip file
    """
    os.chdir(directory)

    # After the chdir a *relative* `directory` no longer points at itself, so walk
    # the (new) current directory instead; an absolute path is walked as given.
    walk_root = directory if os.path.isabs(directory) else os.curdir

    # The context manager guarantees the archive is finalized and closed even if
    # adding a file fails.
    with zipfile.ZipFile(file_name, mode='w') as zip_ref:
        archive_path = os.path.realpath(file_name)
        for folder, _, files in os.walk(walk_root):
            for file in files:
                path = os.path.join(folder, file)
                # Skip only the archive being written, not every file whose name
                # happens to contain `file_name`.
                if os.path.realpath(path) == archive_path:
                    continue
                zip_ref.write(path)

    return FileLink(file_name)

In [None]:
# Zip the current working directory into 'file.zip' (the defaults) and display the
# returned download link as this cell's output.
zip_dir()