In [1]:
# Load Libraries and packages
import numpy as np
import pandas as pd

# Parsing Tools for Summarizers
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

# Extractive Text Summarizer Libraries
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer

# Abstractive Text Summarizers
## T5 Models
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration
## BART Model
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
## GPT-2 Model
from transformers import GPT2Tokenizer,GPT2LMHeadModel

In [2]:
# Columns to read from the results CSV
col_names = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Keywords',
    'Text',
]
# Load only the selected columns into a DataFrame
data = pd.read_csv(
    'data/top_dominant_results.csv',
    usecols=col_names,
)
# Preview the first rows
data.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Keywords,Text
0,0,9.0,"car, call, company, tell, work, would, say, da...",Giving this location a low rating only because...
1,1,3.0,"order, wait, ask, say, minute, table, take, te...",I'll start off by saying that this was my favo...
2,2,3.0,"order, wait, ask, say, minute, table, take, te...",The mille crepe cake my sister gifted me for m...
3,3,7.0,"pizza, restaurant, love, service, bar, order, ...",Go someplace else there are better hotels in t...
4,4,9.0,"car, call, company, tell, work, would, say, da...",Used them a number of years ago and they were ...


In [3]:
# Total number of rows in the loaded frame
print(f'Number of Rows in Dataframe: {len(data)}\n')
# Count of missing values in every column
print(data.isna().sum())

Number of Rows in Dataframe: 335433

Document_No         0
Dominant_Topic    143
Topic_Keywords    143
Text                0
dtype: int64


In [4]:
# Drop rows with a missing Dominant_Topic or Topic_Keywords and store the topic
# number as an integer. Written as one non-mutating chain (no inplace=True and no
# column assignment on a filtered frame), so the cell yields the same frame every
# time it is re-run on the loaded data.
data = (
    data
    .dropna(subset=['Dominant_Topic', 'Topic_Keywords'])
    .astype({'Dominant_Topic': int})
)
print(f'Number of Rows in Dataframe: {len(data)}\n')
data.head()

Number of Rows in Dataframe: 335290



Unnamed: 0,Document_No,Dominant_Topic,Topic_Keywords,Text
0,0,9,"car, call, company, tell, work, would, say, da...",Giving this location a low rating only because...
1,1,3,"order, wait, ask, say, minute, table, take, te...",I'll start off by saying that this was my favo...
2,2,3,"order, wait, ask, say, minute, table, take, te...",The mille crepe cake my sister gifted me for m...
3,3,7,"pizza, restaurant, love, service, bar, order, ...",Go someplace else there are better hotels in t...
4,4,9,"car, call, company, tell, work, would, say, da...",Used them a number of years ago and they were ...


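The data is currently stored in DataFrame format; however, the summarizers below take plain text as input, so the text is grouped by topic and written out as one text file per topic.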

In [5]:
# Combine every text that shares a dominant topic into one string per topic.
# NOTE(review): the texts are joined with '.' and no following space; confirm that
# the sentence tokenizer used later still separates adjacent texts.
grouped_text = (
    data
    .groupby(by=['Dominant_Topic'], as_index=False)
    .agg({'Text': '.'.join})
)

In [6]:
# Function to remove linebreaks from compiled text data
def remove_linebreaks(text):
    """Collapse a multi-line string onto a single line.

    The text is split on line boundaries and the lines are joined with '.'.
    Blank or whitespace-only lines (for example the empty line between two
    paragraphs) are skipped, so consecutive line breaks produce a single '.'
    instead of a run of dots.

    Parameters
    ----------
    text : str
        Text that may contain line breaks.

    Returns
    -------
    str
        The non-blank lines of ``text`` joined by '.'; an empty string when
        ``text`` has no non-blank lines.
    """
    # Filter on the stripped line but keep the line itself unchanged
    non_blank_lines = [line for line in text.splitlines() if line.strip()]
    return '.'.join(non_blank_lines)

In [7]:
# Replace each topic's grouped text with its line-break-free version.
# Series.map applies the function to the 'Text' values directly; a row-wise
# DataFrame.apply(axis=1) would build a Series for every row just to read one field.
grouped_text['Text'] = grouped_text['Text'].map(remove_linebreaks)

In [8]:
# Function to Save formatted data to Text format
def store_as_txt(groupby_column, target_column, file_location):
    """Write each text to its own UTF-8 file named after its key.

    For every ``(key, text)`` pair taken position-by-position from the two
    inputs, the text is written to ``f"{file_location}{key}.txt"`` and a
    completion message is printed.

    Parameters
    ----------
    groupby_column : iterable
        Keys used in the file names (one per text). Must not contain duplicates.
    target_column : iterable of str
        Texts to write, in the same order as ``groupby_column``.
    file_location : str
        Path prefix placed in front of each key. Any directory part of the
        resulting path is created if it does not exist yet.

    Raises
    ------
    ValueError
        If the inputs differ in length or ``groupby_column`` has duplicate keys.
    """
    import os

    keys = list(groupby_column)
    texts = list(target_column)

    # Validate before touching the file system so a bad input writes nothing
    if len(keys) != len(texts):
        raise ValueError(
            f"groupby_column has {len(keys)} items but target_column has {len(texts)}"
        )
    if len(set(keys)) != len(keys):
        raise ValueError(
            "groupby_column contains duplicate values; each key must map to exactly one text"
        )

    ## Loop through the (key, text) pairs once instead of filtering per key
    for key, text in zip(keys, texts):
        path = f"{file_location}{key}.txt"
        ## Make sure the output directory exists (open() does not create it)
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        ## Create unique text_doc for each key
        with open(path, "w", encoding="utf-8") as text_file:
            text_file.write(text)
        print(f"Text Document {key} Complete")

In [9]:
# Path prefix of the per-topic text files; the topic number and ".txt" are appended
file_loc = "data/Text_Gen_Files/Clean_Text_Topic_"
# Write one text file per topic
store_as_txt(
    groupby_column=grouped_text['Dominant_Topic'],
    target_column=grouped_text['Text'],
    file_location=file_loc,
)

Text Document 0 Complete
Text Document 1 Complete
Text Document 2 Complete
Text Document 3 Complete
Text Document 4 Complete
Text Document 5 Complete
Text Document 6 Complete
Text Document 7 Complete
Text Document 8 Complete
Text Document 9 Complete
Text Document 10 Complete
Text Document 11 Complete
Text Document 12 Complete
Text Document 13 Complete
Text Document 14 Complete
Text Document 18 Complete


In [10]:
# Load txt file as compiled string variable
def load_txt_file(file_loc,filename):
    """Return the full contents of ``<file_loc><filename>.txt`` read as UTF-8 text.

    ``file_loc`` is prepended to ``filename`` as-is, and ".txt" is appended.
    """
    path = f"{file_loc}{filename}.txt"
    with open(path, mode="r", encoding="utf-8") as handle:
        return handle.read()

In [13]:
# Directory and base name (without ".txt") of the saved text to load
file_dir = "data/Text_Gen_Files/"
filename = "Clean_Text_Topic_0"
# NOTE(review): the variable is named topic_1_txt but the file loaded is the
# topic-0 text; the name is kept because later cells reference it.
topic_1_txt = load_txt_file(file_loc=file_dir, filename=filename)

## Extractive Summarizers to be evaluated:
1) LexRank
2) LSA
3) Luhn 
4) KL

In [None]:
# Parse the loaded topic text into a sumy document using the English tokenizer
text_parser = PlaintextParser.from_string(
    topic_1_txt,
    Tokenizer('english'),
)

### 1. LexRank Summarizer

In [None]:
# Build the LexRank summarizer and request a 10-sentence summary of the document
lex_rank_summarizer = LexRankSummarizer()
lexrank_summary = lex_rank_summarizer(
    text_parser.document,
    sentences_count=10,
)

# Show the selected sentences, one per line
for summary_sentence in lexrank_summary:
    print(summary_sentence)

### 2. LSA Summarizer

In [None]:
# Build the LSA summarizer and request a 10-sentence summary of the document
lsa_summarizer = LsaSummarizer()
lsa_summary = lsa_summarizer(
    text_parser.document,
    sentences_count=10,
)

# Show the selected sentences, one per line
for summary_sentence in lsa_summary:
    print(summary_sentence)

### 3. Luhn Summarizer

In [None]:
# Build the Luhn summarizer and request a 10-sentence summary of the document
luhn_summarizer = LuhnSummarizer()
luhn_summary = luhn_summarizer(
    text_parser.document,
    sentences_count=10,
)

# Show the selected sentences, one per line
for summary_sentence in luhn_summary:
    print(summary_sentence)

### 4. KL Summarizer

In [None]:
# Build the KL summarizer and request a 10-sentence summary of the document
kl_summarizer = KLSummarizer()
kl_summary = kl_summarizer(
    text_parser.document,
    sentences_count=10,
)

# Show the selected sentences, one per line
for summary_sentence in kl_summary:
    print(summary_sentence)

## Abstractive Summarizers to be evaluated:

1) T5 Transformer
2) BART Model
3) GPT-2 Model