In [1]:
# Load Libraries and packages
import os

import numpy as np
import pandas as pd

# Extractive Text Summarizer Libraries
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.kl import KLSummarizer

# Abstractive Text Summarizers
## T5 Models
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration
## BART Model
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
## GPT-2 Model
from transformers import GPT2Tokenizer,GPT2LMHeadModel

In [2]:
# Columns to read from the dominant-topic results CSV
col_names = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Keywords',
    'Text',
]
# Load only those columns, then preview the first rows
data = pd.read_csv('data/top_dominant_results.csv', usecols=col_names)
data.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Keywords,Text
0,0,9.0,"car, call, company, tell, work, would, say, da...",Giving this location a low rating only because...
1,1,3.0,"order, wait, ask, say, minute, table, take, te...",I'll start off by saying that this was my favo...
2,2,3.0,"order, wait, ask, say, minute, table, take, te...",The mille crepe cake my sister gifted me for m...
3,3,7.0,"pizza, restaurant, love, service, bar, order, ...",Go someplace else there are better hotels in t...
4,4,9.0,"car, call, company, tell, work, would, say, da...",Used them a number of years ago and they were ...


In [3]:
# Report the number of rows, followed by the per-column count of missing values
print(f'Number of Rows in Dataframe: {data.shape[0]}\n')
print(data.isna().sum())

Number of Rows in Dataframe: 335433

Document_No         0
Dominant_Topic    143
Topic_Keywords    143
Text                0
dtype: int64


In [4]:
# Drop rows missing either the dominant topic or its keywords.
# Rebinding the result (instead of inplace=True) avoids mutating the loaded
# frame in place and keeps the step a plain, chainable expression.
data = data.dropna(subset=['Dominant_Topic', 'Topic_Keywords'])
print(f'Number of Rows in Dataframe: {len(data)}\n')

Number of Rows in Dataframe: 335290



In [5]:
# Cast the topic id column to int (it is read as float while nulls are present),
# then preview the result
data = data.astype({'Dominant_Topic': int})
data.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Keywords,Text
0,0,9,"car, call, company, tell, work, would, say, da...",Giving this location a low rating only because...
1,1,3,"order, wait, ask, say, minute, table, take, te...",I'll start off by saying that this was my favo...
2,2,3,"order, wait, ask, say, minute, table, take, te...",The mille crepe cake my sister gifted me for m...
3,3,7,"pizza, restaurant, love, service, bar, order, ...",Go someplace else there are better hotels in t...
4,4,9,"car, call, company, tell, work, would, say, da...",Used them a number of years ago and they were ...


In [6]:
# Build one row per Dominant_Topic whose Text is every document of that topic
# joined with '.'.
# NOTE(review): the separator has no trailing space, so adjacent documents are
# fused as "end.Start" -- confirm this is what downstream sentence splitting expects.
grouped_text = (
    data
    .groupby(by='Dominant_Topic', as_index=False)
    .agg({'Text': '.'.join})
)

In [10]:
# Display the aggregated frame (Dominant_Topic and its joined Text)
grouped_text

Unnamed: 0,Dominant_Topic,Text
0,0,Statistical Tests for Optimization Efficiency\...
1,1,I had an appointment for a consultation/second...
2,2,The pork meatballs are the best flavour- the o...
3,3,I'll start off by saying that this was my favo...
4,4,unbelieeeevable. \n\nout of all the many time...
5,5,I had an amazing blowout with Hannah. She did ...
6,6,It's located on 7th and Mill next to the Steak...
7,7,Go someplace else there are better hotels in t...
8,8,Best Yoga Studio in all of Phoenix. Spin clas...
9,9,Giving this location a low rating only because...


In [7]:
# Function to remove linebreaks from compiled text data
def remove_linebreaks(text: str) -> str:
    """Return ``text`` with its line boundaries replaced by single spaces.

    The string is split with ``str.splitlines`` and the pieces are re-joined
    with one space each. An empty line between two lines therefore leaves two
    consecutive spaces, and a trailing line break is dropped.

    Args:
        text: The string to flatten onto one line.

    Returns:
        The flattened string.
    """
    pieces = text.splitlines()
    return ' '.join(pieces)

In [11]:
# Reformat grouped dataframe with cleaned text data by applying cleaning to each row
## Series.map calls the cleaner once per Text value; this avoids the row-wise
## DataFrame.apply(axis=1) + lambda and also behaves correctly on an empty frame
grouped_text['Text'] = grouped_text['Text'].map(remove_linebreaks)

In [47]:
# Save formatted data to Text format
## Make sure the output folder exists so this cell also runs on a fresh checkout
output_dir = "data/Text_Gen_Files"
os.makedirs(output_dir, exist_ok=True)
## Walk topic numbers and their compiled text together in a single pass
## (grouped_text is expected to hold exactly one row per topic)
for i, item in zip(grouped_text['Dominant_Topic'], grouped_text['Text']):
    ## Create unique text_doc for each topic
    filename = f"{output_dir}/Clean_Text_Topic_{i}"
    with open(filename, "w", encoding="utf-8") as text_file:
        text_file.write(item)
    ## Report only once the file has been closed by the with-block
    print(f"Text Document {i} Complete")

Text Document 0 Complete
Text Document 1 Complete
Text Document 2 Complete
Text Document 3 Complete
Text Document 4 Complete
Text Document 5 Complete
Text Document 6 Complete
Text Document 7 Complete
Text Document 8 Complete
Text Document 9 Complete
Text Document 10 Complete
Text Document 11 Complete
Text Document 12 Complete
Text Document 13 Complete
Text Document 14 Complete
Text Document 18 Complete
