In [31]:
import pandas as pd
import numpy as np
import ast
import re
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

In [32]:
# Shared NLTK normalizers: `stemmer` is used by stemming() and `lemmatizer`
# by lemmatizing(), both defined further down.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [33]:
# Read the raw book-summaries file into a list of lines (line terminators kept).
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs where that file exists.
with open(r"C:\Users\saras\Downloads\booksummaries.txt", encoding="utf8") as file:
    data = list(file)

In [34]:
# Parse every tab-separated record into a dict, then build the DataFrame once.
# Field 1 of each record is not used by this notebook.
dataset = []
for line in data:
    # The lines still carry their terminator; strip it so it does not end up
    # glued to the last field (the summary).
    fields = line.rstrip('\n').split('\t')
    dataset.append({
        "id": fields[0],
        "name": fields[2],
        "author": fields[3],
        "publish year": fields[4],
        "genres": fields[5],
        "summary": fields[6],
    })

df = pd.DataFrame(dataset)

In [35]:
# Keep an independent copy of the title/summary columns: a new column is
# assigned to this frame later, and assigning into a bare slice of `df`
# triggers pandas' SettingWithCopyWarning.
df_output = df[["name", "summary"]].copy()

In [36]:
# Column dtypes and non-null counts of the freshly parsed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16559 entries, 0 to 16558
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            16559 non-null  object
 1   name          16559 non-null  object
 2   author        16559 non-null  object
 3   publish year  16559 non-null  object
 4   genres        16559 non-null  object
 5   summary       16559 non-null  object
dtypes: object(6)
memory usage: 776.3+ KB


In [37]:
# Preview the first five parsed records (five is the default row count).
df.head()

Unnamed: 0,id,name,author,publish year,genres,summary
0,620,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,986,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,1756,An Enquiry Concerning Human Understanding,David Hume,,,The argument of the Enquiry proceeds by a ser...
4,2080,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...


In [38]:
# Treat empty strings as missing values.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace("", np.nan)

In [39]:
# Each non-missing genres cell is a dict literal mapping ids (e.g. "/m/016lj8")
# to genre names; keep only the names, joined into one comma-separated string.
# Missing cells stay missing (`np.nan`; the `np.NaN` alias was removed in NumPy 2.0).
df['genres'] = df['genres'].apply(
    lambda raw: ", ".join(ast.literal_eval(raw).values()) if pd.notna(raw) else np.nan
)

In [40]:
# Keep only the first four characters of the publication date (the year part
# of values such as "1945-08-17").
df["publish year"] = df["publish year"].str.slice(stop=4)

In [41]:
# Number of missing values per column.
df.isna().sum()

id                 0
name               0
author          2382
publish year    5610
genres          3718
summary            0
dtype: int64

In [42]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [43]:
# Most frequent book title (first entry of the mode result if several tie).
df["name"].mode().iloc[0]

'Nemesis'

In [44]:
# Most frequent author (first entry of the mode result if several tie).
df["author"].mode().iloc[0]

'Agatha Christie'

In [45]:
# Most frequent publication year (first entry of the mode result if several tie).
df["publish year"].mode().iloc[0]

'2007'

In [46]:
# Impute missing publication years with the most frequent year.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is a chained in-place update, which does not modify `df`
# when pandas Copy-on-Write is active.
# NOTE(review): about a third of the rows (5610 of 16559) are imputed this way,
# which skews any per-year statistic computed afterwards.
df["publish year"] = df["publish year"].fillna(df["publish year"].mode()[0])

In [47]:
# Both identifier-like columns were parsed as strings; convert them to integers.
df = df.astype(dict.fromkeys(['id', 'publish year'], 'int'))

In [48]:
# Preview the frame after missing-value handling and dtype conversion.
df.head()

Unnamed: 0,id,name,author,publish year,genres,summary
0,620,Animal Farm,George Orwell,1945,"Roman à clef, Satire, Children's literature, S...","Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,Anthony Burgess,1962,"Science Fiction, Novella, Speculative fiction,...","Alex, a teenager living in near-future Englan..."
2,986,The Plague,Albert Camus,1947,"Existentialism, Fiction, Absurdist fiction, Novel",The text of The Plague is divided into five p...
3,1756,An Enquiry Concerning Human Understanding,David Hume,2007,,The argument of the Enquiry proceeds by a ser...
4,2080,A Fire Upon the Deep,Vernor Vinge,2007,"Hard science fiction, Science Fiction, Specula...",The novel posits that space around the Milky ...


### Cleaning functions:

In [49]:
def lower_case(text):
    """Lower-case text.

    Parameters
    ----------
    text : str, pandas.Series or pandas.DataFrame
        A single string, or a pandas container whose string cells are
        lower-cased element-wise. Non-string cells are passed through
        unchanged.

    Returns
    -------
    The same kind of object as ``text``. Pandas inputs are not modified;
    a new object is returned.
    """
    def _lower(value):
        return value.lower() if isinstance(value, str) else value

    if isinstance(text, str):
        return _lower(text)
    # Work on the argument that was passed in, not on the notebook-level `df`.
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever element-wise method this pandas version provides.
    elementwise = text.map if hasattr(type(text), "map") else text.applymap
    return elementwise(_lower)

In [50]:
def remove_numbers(text):
    """Delete every run of digits from text.

    Parameters
    ----------
    text : str, pandas.Series or pandas.DataFrame
        A single string, or a pandas container whose string cells are
        processed element-wise. Non-string cells are passed through
        unchanged.

    Returns
    -------
    The same kind of object as ``text``. Pandas inputs are not modified;
    a new object is returned.
    """
    def _strip_digits(value):
        return re.sub(r'\d+', '', value) if isinstance(value, str) else value

    if isinstance(text, str):
        return _strip_digits(text)
    # Work on the argument that was passed in, not on the notebook-level `df`.
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever element-wise method this pandas version provides.
    elementwise = text.map if hasattr(type(text), "map") else text.applymap
    return elementwise(_strip_digits)

In [51]:
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from text.

    Parameters
    ----------
    text : str, pandas.Series or pandas.DataFrame
        A single string, or a pandas container whose string cells are
        processed element-wise. Non-string cells are passed through
        unchanged.

    Returns
    -------
    The same kind of object as ``text``. Pandas inputs are not modified;
    a new object is returned.
    """
    translator = str.maketrans('', '', string.punctuation)

    def _strip_punctuation(value):
        return value.translate(translator) if isinstance(value, str) else value

    if isinstance(text, str):
        return _strip_punctuation(text)
    # Work on the argument that was passed in, not on the notebook-level `df`.
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever element-wise method this pandas version provides.
    elementwise = text.map if hasattr(type(text), "map") else text.applymap
    return elementwise(_strip_punctuation)

In [52]:
def remove_whitespace(text):
    """Collapse whitespace runs to single spaces and trim both ends.

    Parameters
    ----------
    text : str, pandas.Series or pandas.DataFrame
        A single string, or a pandas container whose string cells are
        processed element-wise. Non-string cells are passed through
        unchanged.

    Returns
    -------
    The same kind of object as ``text``. Pandas inputs are not modified;
    a new object is returned.
    """
    def _squeeze(value):
        return " ".join(value.split()) if isinstance(value, str) else value

    if isinstance(text, str):
        return _squeeze(text)
    # Work on the argument that was passed in, not on the notebook-level `df`.
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever element-wise method this pandas version provides.
    elementwise = text.map if hasattr(type(text), "map") else text.applymap
    return elementwise(_squeeze)

In [53]:
def remove_stopwords(text):
    """Return ``text`` without its English stop words.

    The string is split on whitespace, every token whose lower-cased form is
    in NLTK's English stop-word list is dropped, and the surviving tokens are
    joined back together with single spaces (original casing preserved).
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in text.split():
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

In [54]:
def tokenization(df):
    """Tokenize every string cell of a DataFrame into a list of word tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string cells are passed through NLTK's ``word_tokenize``.
        Non-string cells are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape; ``df`` itself is not modified.
    """
    def _tokenize(cell):
        return word_tokenize(cell) if isinstance(cell, str) else cell

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated; use whichever this pandas version provides.
    elementwise = df.map if hasattr(type(df), "map") else df.applymap
    return elementwise(_tokenize)

In [55]:
def stemming(text):
    """Return a list with the Porter stem of every token in ``text``.

    ``text`` is iterated item by item, so it is expected to be a sequence of
    word tokens; the notebook-level ``stemmer`` does the stemming.
    """
    stems = []
    for token in text:
        stems.append(stemmer.stem(token))
    return stems

In [56]:
def lemmatizing(text):
    """Return a list with the WordNet lemma of every token in ``text``.

    ``text`` is iterated item by item, so it is expected to be a sequence of
    word tokens; the notebook-level ``lemmatizer`` does the lemmatization.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [57]:
# Restrict the cleaning steps to the object-dtype (text) columns.
df_text = df.select_dtypes(include=['object'])
textual_columns = df_text.columns

### Cleaning df_text:

In [58]:
# Text-normalization pipeline: each step is passed the result of the previous
# one and its return value replaces `df_text`.
df_text = lower_case(df_text)
df_text = remove_numbers(df_text)  # TODO: is removing digits necessary?
df_text = remove_punctuation(df_text)
df_text = remove_whitespace(df_text)
# remove_stopwords() works on a single string, so it is applied cell by cell;
# non-string cells are left as they are.
df_text = df_text.applymap(lambda x: remove_stopwords(x) if isinstance(x, str) else x)
df_text = tokenization(df_text)
# Stem the token lists; any cell that is not a list at this point is replaced
# by an empty list.
df_text = df_text.applymap(lambda x: stemming(x) if isinstance(x, list) else [])

In [59]:
def summarizer(text):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Words are counted over the whole text, case-insensitively, ignoring
    English stop words and tokens without any letter or digit. Each sentence
    is scored by summing the text-wide counts of the distinct counted words it
    contains. Sentences scoring above ``coef * average`` are kept in their
    original order, where ``average`` is the floored mean score of the
    sentences that contain at least one counted word, and ``coef`` is lowered
    from 1.2 to 1.0 in steps of 0.01 until at least one sentence qualifies.

    Parameters
    ----------
    text : str
        The text to condense.

    Returns
    -------
    str
        The selected sentences joined by single spaces. An empty string is
        returned when ``text`` is not a string or contains no countable word.
        If no sentence beats the average even at ``coef == 1.0`` (all scored
        sentences tie), the first top-scoring sentence is returned.
    """
    if not isinstance(text, str):
        return ''

    # Build the stop-word set once instead of once per token.
    stop_words = set(stopwords.words('english'))

    def _counted_words(fragment):
        """Lower-cased word tokens of ``fragment`` that take part in scoring."""
        return [
            token.lower()
            for token in word_tokenize(fragment)
            if any(ch.isalnum() for ch in token) and token.lower() not in stop_words
        ]

    # Count word tokens; iterating over the string itself would yield single
    # characters rather than words.
    freq_table = {}
    for word in _counted_words(text):
        freq_table[word] = freq_table.get(word, 0) + 1

    sentences = sent_tokenize(text)

    # Score by sentence position (not by sentence text) so a repeated sentence
    # is not counted twice, and match whole tokens rather than substrings so
    # that e.g. "art" does not score a sentence containing "party".
    scores = [
        sum(freq_table.get(word, 0) for word in set(_counted_words(sentence)))
        for sentence in sentences
    ]

    scored = [score for score in scores if score > 0]
    if not scored:
        # Empty text, or nothing but stop words / punctuation: nothing to
        # average over, so avoid dividing by zero.
        return ''

    # Average value of a scored sentence from the original text.
    average = int(sum(scored) / len(scored))

    # Step the coefficient in integer hundredths so the sweep is exact and
    # always includes the final coef == 1.0 pass.
    for hundredths in range(120, 99, -1):
        threshold = average * hundredths / 100
        selected = [
            sentence
            for sentence, score in zip(sentences, scores)
            if score > threshold
        ]
        if selected:
            return ' '.join(selected)

    # Every scored sentence ties with the average: fall back to the first
    # top-scoring sentence rather than returning an empty summary.
    return sentences[scores.index(max(scores))]

### Summarizing the books' summaries:

In [None]:
# Condense every book summary with summarizer() and store the result in a new column.
df_output["condensed summaries"] = df_output["summary"].apply(summarizer)

## Conclusions from the EDA on Text Data:
### 1- The most frequently repeated book title is "Nemesis".
### 2- Agatha Christie is the author with the most books in this dataset.
### 3- 2007 is the most common publication year among books with a known year (computed before missing years were imputed).