# Importing necessary libraries

In [10]:
import numpy as np  
import pandas as pd 
import re           
from keras.preprocessing.text import Tokenizer 
from nltk.corpus import stopwords   
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

In [11]:
!pip install transformers





# Reading in the data 

In [12]:
df = pd.read_csv('Books_rating.csv')
df.head(4)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,"This is only for Julie Strain fans. It's a collection of her photos -- about 80 pages worth with a nice section of paintings by Olivia.If you're looking for heavy literary content, this isn't the ..."
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after reading Philip Nel's book I changed my mind--that's a good testimonial to the power of Rel's writing and thinking. Rel plays Dr. Seuss the ultimate compli...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""the child is father to the man,"" then Dr. Seuss (Theodor Seuss Geisel) is the most influential author, poet, and artist of modern times. For me, a dadd..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;Dr. Seuss,&quot; was one of the most influential writers and artists of the 20th century.In 1959, Rudolf Flesch wrote, &quot;A hundred years from now, ..."


In [13]:
df.columns
df_1 = df.copy()
df_2 = df_1[["review/text","review/summary"]]

## Data info and description

In [14]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 2 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   review/text     object
 1   review/summary  object
dtypes: object(2)
memory usage: 45.8+ MB


In [15]:
df_2.describe()

Unnamed: 0,review/text,review/summary
count,2999992,2999962
unique,2062648,1592315
top,digital books are perfect and easy to use! They take up no space on the bookshelf and are always with you!!!,Great Book
freq,322,6848


# Renaming the two necessary columns

In [16]:
df_2.rename(columns={"review/text":"text", "review/summary":"summary"}, inplace=True)
df_2

Unnamed: 0,text,summary
0,"This is only for Julie Strain fans. It's a collection of her photos -- about 80 pages worth with a nice section of paintings by Olivia.If you're looking for heavy literary content, this isn't the ...",Nice collection of Julie Strain images
1,I don't care much for Dr. Seuss but after reading Philip Nel's book I changed my mind--that's a good testimonial to the power of Rel's writing and thinking. Rel plays Dr. Seuss the ultimate compli...,Really Enjoyed It
2,"If people become the books they read and if ""the child is father to the man,"" then Dr. Seuss (Theodor Seuss Geisel) is the most influential author, poet, and artist of modern times. For me, a dadd...",Essential for every personal and Public Library
3,"Theodore Seuss Geisel (1904-1991), aka &quot;Dr. Seuss,&quot; was one of the most influential writers and artists of the 20th century.In 1959, Rudolf Flesch wrote, &quot;A hundred years from now, ...",Phlip Nel gives silly Seuss a serious treatment
4,"Philip Nel - Dr. Seuss: American IconThis is basically an academic overview of Seuss poetry, art, cartoons, and the problems with the commercialization of the Seuss name and works after his death....",Good academic overview
...,...,...
2999995,"This is an extremely difficult book to digest, and it is not for casual readers. However, Collingwood's ideas on a meeting of minds between past and present is absolutely fascinating and gets to t...",Difficult
2999996,This is pretty interesting. Collingwood seems like on of the first historians to really utilize ideas from evolutionary theory and modern psychology in his overall method. He manages to create a v...,Quite good and ahead of its time occasionally
2999997,"This is a good book but very esoteric. ""What is History?"" by E.H. Carr is an easier selection for the causal reader or someone beginning to study historiography.",Easier reads of those not well versed in historiography
2999998,"My daughter, a freshman at Indiana University, e-mailed me a list of the books she needed. This was on it... I ordered it, paid for it, and had it shipped directly to her. It arrived sooner than e...","Yes, it is cheaper than the University Bookstore"


In [17]:
df_2.dropna(how='any', inplace=True)

# Limiting the rows of the data to 3000

### This eradicates longer runtime.

In [18]:
df_3 = df_2[:100000]

# Installing necessary libraries

In [19]:
!pip install datasets nltk rouge_score





# Rouge score calculation function

In [20]:
from datasets import load_metric
metric = load_metric("rouge")

def calc_rouge_scores(candidates, references):
    result = metric.compute(predictions=candidates, references=references, use_stemmer=True)
    result = {key: round(value.mid.fmeasure * 100, 1) for key, value in result.items()}
    return result

# creating a BASELINE Model and calculating the Rouge score

In [21]:
import re

ref_summaries = list(df_3['summary'])

for i in range (3):
    candidate_summaries = list(df_3['text'].apply(lambda x: ' '.join(re.split(r'(?<=[.:;])\s', x)[:i+1])))
    print(f"First {i+1} senctences: Scores {calc_rouge_scores(candidate_summaries, ref_summaries)}")

First 1 senctences: Scores {'rouge1': 10.8, 'rouge2': 3.1, 'rougeL': 9.9, 'rougeLsum': 9.9}
First 2 senctences: Scores {'rouge1': 9.0, 'rouge2': 2.4, 'rougeL': 8.0, 'rougeLsum': 8.0}
First 3 senctences: Scores {'rouge1': 7.7, 'rouge2': 2.0, 'rougeL': 6.7, 'rougeLsum': 6.7}


In [22]:
from transformers import pipeline 

summarizer = pipeline("summarization")
print(summarizer(df_3['text'][0]))
print(summarizer.model.config.__getattribute__('_name_or_path'))
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

No model was supplied, defaulted to t5-small and revision d769bba (https://huggingface.co/t5-small).
Using a pipeline without specifying a model name and revision in production is not recommended.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Your max_length is set to 200, but you input_length is only 121. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


[{'summary_text': "this is only for Julie Strain fans . there's only about 2 pages with text and everything else is photos . if you like Julie like I like Julie, you won't go wrong ."}]
t5-small


All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


# Text Preprocessing

###     - Removing extra white spaces
###     - Expand contractions
###     - Remove special case characters
###     - Lowercasing all letters

### Dictionary to be used for expanding contractions

In [30]:

contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [31]:
# for text
stop_words = set(stopwords.words('english')) 
def text_cleaner(text):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    tokens = [w for w in newString.split() if not w in stop_words]
    long_words=[]
    for i in tokens:
        if len(i)>=3:                  #removing short word
            long_words.append(i)   
    return (" ".join(long_words)).strip()

cleaned_text = []
for t in df_3['text']:
    cleaned_text.append(text_cleaner(t))

In [33]:
# for summary
def summary_cleaner(text):
    newString = re.sub('"','', text)
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString)
    newString = newString.lower()
    tokens=newString.split()
    newString=''
    for i in tokens:
        if len(i)>1:                                 
            newString=newString+i+' '  
    return newString

#Call the above function
cleaned_summary = []
for t in df_3['summary']:
    cleaned_summary.append(summary_cleaner(t))

df_3['cleaned_text']=cleaned_text
df_3['cleaned_summary']=cleaned_summary
df_3['cleaned_summary'].replace('', np.nan, inplace=True)
df_3.dropna(axis=0,inplace=True)

### Adding tokens to the start and end of the summary column

In [35]:
df_3['cleaned_summary'] = df_3['cleaned_summary'].apply(lambda x : '_START_ '+ x + ' _END_')

In [36]:
for i in range(5):
    print("Review:",df_3['cleaned_text'][i])
    print("Summary:",df_3['cleaned_summary'][i])
    print("\n")

Review: julie strain fans collection photos pages worth nice section paintings olivia looking heavy literary content place find pages text everything else photos bottom line want one book six foot one probably better choice however like julie like like julie wrong one either
Summary: _START_ sostoknice collection of julie strain imageseostok  _END_


Review: care much seuss reading philip nel book changed mind good testimonial power rel writing thinking rel plays seuss ultimate compliment treating serious poet well one century interesting visual artists reading book decided trip mandeville collections library university california san diego order could visit incredible seuss geisel holdings almost much take like william butler yeats seuss led career constantly shifted metamoprhized meet new historical political cirsumstances seems leftist conservative different junctures career politics art nel shows cartoonist fabled magazine like andy warhol served time slaving business service amusi

# Getting the percentile values of the word count of text.

In [24]:
df_3['word count text'] = df_3['text'].apply(lambda x : len(str(x).split()))
for i in range(0,100,10):
    var = df_3['word count text'].values
    var = np.sort(var, axis=None)
    print('{} percentile value is {}'.format(i, var[int(len(var)*(float(i)/100))]))
print('100 percentile value is ', var[-1])



print(' FROM 90 TO 100')


for i in range(90,100):
    var = df_3['word count text'].values
    var = np.sort(var, axis=None)
    print('{} percentile value is {}'.format(i, var[int(len(var)*(float(i)/100))]))
print('100 percentile value is ', var[-1])

0 percentile value is 1
10 percentile value is 27
20 percentile value is 41
30 percentile value is 57
40 percentile value is 74
50 percentile value is 95
60 percentile value is 121
70 percentile value is 157
80 percentile value is 213
90 percentile value is 323
100 percentile value is  4928
 FROM 90 TO 100
90 percentile value is 323
91 percentile value is 341
92 percentile value is 363
93 percentile value is 389
94 percentile value is 419
95 percentile value is 455
96 percentile value is 499
97 percentile value is 554
98 percentile value is 640
99 percentile value is 793
100 percentile value is  4928


### For summary

In [25]:
df_3['word count summary'] = df_3['summary'].apply(lambda x : len(str(x).split()))
for i in range(0,100,10):
    var = df_3['word count summary'].values
    var = np.sort(var, axis=None)
    print('{} percentile value is {}'.format(i, var[int(len(var)*(float(i)/100))]))
print('100 percentile value is ', var[-1])



print(' FROM 90 TO 100')


for i in range(90,100):
    var = df_3['word count summary'].values
    var = np.sort(var, axis=None)
    print('{} percentile value is {}'.format(i, var[int(len(var)*(float(i)/100))]))
print('100 percentile value is ', var[-1])

0 percentile value is 1
10 percentile value is 2
20 percentile value is 2
30 percentile value is 3
40 percentile value is 4
50 percentile value is 4
60 percentile value is 5
70 percentile value is 6
80 percentile value is 7
90 percentile value is 9
100 percentile value is  29
 FROM 90 TO 100
90 percentile value is 9
91 percentile value is 9
92 percentile value is 9
93 percentile value is 9
94 percentile value is 10
95 percentile value is 10
96 percentile value is 10
97 percentile value is 11
98 percentile value is 12
99 percentile value is 13
100 percentile value is  29


In [26]:
# Setting the maximum for both text and summary

max_count_text = 1500
max_count_summary = 15

In [27]:
# Splitting the dataset with train test split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_3['text'], df_3['summary'], test_size=0.3, random_state=23)

## Using the GloVe embedding 

#### Global Vectors for Word Representation

In [28]:
#Loading our Glove Model 
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding='utf8')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

#Calculating Percentage of words from train text present in Word2vec model
words_source_train = []
for i in x_train :
  words_source_train.extend(i.split(' '))
## Find the total number of words in the Train data of Essays.
print("all the words in the corpus", len(words_source_train))
## Find the unique words in this set of words
words_source_train = set(words_source_train)
print("the unique words in the corpus", len(words_source_train))
## Find the words present in both Glove Vectors as well as our corpus.
inter_words = set(embeddings_index.keys()).intersection(words_source_train)
print("The number of words that are present in both glove vectors and our corpus are {} which \
is nearly {}% ".format(len(inter_words), np.round((float(len(inter_words))/len(words_source_train))
*100)))
words_corpus_source_train = {}
words_glove = set(embeddings_index.keys())
for i in words_source_train:
  if i in words_glove:
    words_corpus_source_train[i] = embeddings_index[i]
print("word 2 vec length", len(words_corpus_source_train))

Loaded 400000 word vectors.
all the words in the corpus 10266079
the unique words in the corpus 409026
The number of words that are present in both glove vectors and our corpus are 52381 which is nearly 13.0% 
word 2 vec length 52381


## Tokenization

#### _Changing_ and _Passing_ the text values into a more understandable format for the model
#### Tokenizing both _text_ and _summary_ columns

In [29]:
word_index = x_tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

NameError: name 'x_tokenizer' is not defined