## **facebook/bart-large-cnn** ##

- Summarisation task

In [3]:
import pandas as pd

# Load the conversation dataset. The path is relative, so it resolves against
# the notebook's current working directory.
df = pd.read_csv('../../data/data.csv')

# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpful...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper


In [4]:
# Find the conversation with the most whitespace-separated words.
max_length = 0
max_length_id = None

# Group once and walk the conversation ids actually present in `df`, instead of
# re-filtering the whole frame for every id in a hard-coded range(1, 101).
# groupby yields keys in ascending order and keeps row order inside each group,
# so the messages are joined in the same order as before.
for conversation_id, messages in df.groupby("conversation_id")["message"]:
    res = " ".join(messages)
    s = res.split()
    length = len(s)
    # Strict ">" keeps the first (lowest) id when several conversations tie.
    if length > max_length:
        max_length = length
        max_length_id = conversation_id


print(f"Max Length: {max_length}")
print(f"Max Length ID: {max_length_id}")

Max Length: 937
Max Length ID: 73


## **Concatenating individual messages to form the entire conversation** ##

In [26]:
from pprint import pprint

# Conversation to inspect.
conversation_id = 72

# Pick this conversation's messages with a single .loc lookup and glue them
# together into one space-separated string.
res = " ".join(df.loc[df["conversation_id"] == conversation_id, "message"])
# Whitespace-separated words, used only for the count printed below.
s = res.split()

pprint(res)
print(type(res))
print(len(s))

('Government is fascinating. It seems so different the world over yet there '
 'are common threads in every type.  I agree. Some governments try to help, '
 'others, not so much. So true. I wonder what america would look like if we '
 'had a PM instead of a president. A PM seems to be held more accountable for '
 "their actions. True. Much easier to remove. Australia's PM position isn't in "
 'the constitution and exists only through custom. That seems strange to me. '
 "Not in the Constitution? Australia's constitution is weird though. It "
 "doesn't protect the people as much like an american one does. I did not know "
 'that. Number 10 Downing Street has a cat with a title! I wonder how many '
 'other government headquarters have animals like that. With all the historic '
 'buildings in the UK I could see why they need one. Older structures like '
 'that tend to attract rodents. Is one cat enough though? Might need a support '
 'staff. Right? lol. I wonder if you can meet the cat if

In [3]:
import json
import os
import warnings
from pathlib import Path
from typing import Dict, Optional

import pandas as pd
from transformers import pipeline

# CSV holding the conversations to summarise.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the session; it is left as-is because the function defaults below use it.
input = Path('../../data/data.csv')
# JSON file that receives the summaries.
output = Path('../../summary/bart-summary.json')

# Runs the high-level pipeline to load the tokenizer and model. It is built once
# here, at cell execution, so every summarisation call shares the same object.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_conversation(conversation_id: int, filepath: Optional[Path]=input) -> Dict:
    """
        ARGS:
            conversation_id: int
                Value of the `conversation_id` column to select.
            filepath: Path
                CSV file with `conversation_id` and `message` columns.
                `None` falls back to the module-level default path.
        RETURN:
            hashmap: Dict
                {"conversation_id": ..., "message": ..., "summary": ...}
        RAISES:
            FileNotFoundError: if `filepath` does not exist.

        Summarize a conversation given a conversation_id and store it in a hashmap.

        All messages of the conversation are joined with single spaces and
        passed to the module-level `summarizer`. If the summarizer raises, the
        summary is set to "Exceed max token limit." and the underlying error is
        reported as a RuntimeWarning so other failures are not hidden.
        If the conversation has no text, the summary stays None and the model
        is not called.
    """
    # Honour the Optional annotation: None means "use the default CSV".
    filepath = Path(input if filepath is None else filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"File {filepath} does not exist")

    df = pd.read_csv(filepath)
    messages = df.loc[df["conversation_id"] == conversation_id, "message"]
    # str.join raises TypeError on NaN cells, so drop missing messages first.
    paragraph = " ".join(messages.dropna().astype(str))

    hashmap: Dict = {
        "conversation_id": conversation_id,
        "message": paragraph,
        "summary": None
    }

    # Nothing to summarise (unknown id or only empty messages): do not ask the
    # model to summarise an empty string.
    if not paragraph.strip():
        return hashmap

    try:
        hashmap["summary"] = summarizer(paragraph, max_length=50, min_length=30, do_sample=False, num_beams=4)[0]['summary_text']
    except Exception as e:
        # Best-effort: keep the placeholder summary callers already expect, but
        # surface the real error instead of silently discarding it.
        warnings.warn(
            f"Summarization failed for conversation {conversation_id}: {e!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        hashmap["summary"] = "Exceed max token limit."

    return hashmap


def save_to_json(data: Dict, resultpath: Optional[Path]=output) -> None:
    """
        ARGS:
            data: Dict
                JSON-serialisable record to store.
            resultpath: Path
                JSON file containing a list of records.
                `None` falls back to the module-level default path.
        RETURN:
            None

        Append `data` to the JSON list stored at `resultpath`.

        If the file does not exist yet, its parent directories are created and
        the list starts empty. Records are always appended, never replaced, so
        saving the same record twice stores it twice.
    """
    # Honour the Optional annotation: None means "use the default output file".
    resultpath = Path(output if resultpath is None else resultpath)

    if resultpath.exists():
        with open(file=resultpath, mode="r", encoding="utf-8") as f:
            res = json.load(f)
    else:
        # Path.parent is "." for a bare filename, so this also works when
        # resultpath has no directory part (os.makedirs("") would raise).
        resultpath.parent.mkdir(parents=True, exist_ok=True)
        res = []

    res.append(data)

    with open(file=resultpath, mode="w", encoding="utf-8") as f:
        json.dump(res, f, indent=4)


if __name__ == "__main__":
    # Summarise conversations 1..100 and append each result to the JSON file.
    # The loop variable is `conv_id` rather than `id`: in a notebook it becomes
    # a global and would shadow the built-in id() for every later cell.
    for conv_id in range(1, 101):
        data = summarize_conversation(conv_id)
        save_to_json(data)

    print("Done")

Done
