# Table of Contents
* [How many volumes and chapters are there?](#chapter1)
    * [Extracting the volumes](#section_1_1)
    * [Extracting the chapters](#section_1_2)
* [Basic EDA](#chapter2)
    * [Occurrence of sentences](#section_2_1)
    * [Occurrence of words](#section_2_2)
        * [Sub Section 2.1.1](#sub_section_2_1_1)
        * [Sub Section 2.1.2](#sub_section_2_1_2)
* [Chapter 3](#chapter3)
    * [Section 3.1](#section_3_1)
        * [Sub Section 3.1.1](#sub_section_3_1_1)
        * [Sub Section 3.1.2](#sub_section_3_1_2)
    * [Section 3.2](#section_3_2)
        * [Sub Section 3.2.1](#sub_section_3_2_1)

# How many volumes and chapters are there?<a class="anchor" id="chapter1"></a>

In [1]:
# Necessary libraries
import re
from pathlib import Path 
import json
from processtext import clean, remove_sw, lemmatize, clean_text
PATH = str(Path.cwd().parent)

# Reading the dataset.
# The encoding is passed explicitly: later cells strip the ’ and — characters
# from this text, so it is expected to contain non-ASCII characters, and
# without an explicit encoding open() falls back to a platform-dependent default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(PATH + "/data/raw/Complete Works of Swami Vivekananda -  All Volumes - Swami Vivekananda.txt", "r", encoding="utf-8") as file:
    book = file.read()

[nltk_data] Downloading package stopwords to /home/ujjwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ujjwal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ujjwal/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Extracting the volumes inside the book<a class="anchor" id="section_1_1"></a>

In [2]:
# Collect every "Volume <number>" mention in the book, in order of appearance
# (repeated mentions are kept).
pattern = re.compile("Volume [0-9]+")
volumns = pattern.findall(book)
volumns

['Volume 1',
 'Volume 2',
 'Volume 3',
 'Volume 4',
 'Volume 5',
 'Volume 6',
 'Volume 7',
 'Volume 8',
 'Volume 9',
 'Volume 9',
 'Volume 9']

## Extracting the chapters<a class="anchor" id="section_1_2"></a>

In [3]:
def search_patterns(text:str,
                    pattern)->list:
    """Collect the distinct matches of a regex pattern found in a text.

    Args:
        text (str): Input text
        pattern (str | re.Pattern): Regular expression to search for

    Returns:
        list: Unique matches, each with newlines removed and trailing
            whitespace stripped, sorted in ascending order
    """
    # re.finditer scans the text once, left to right, yielding non-overlapping
    # matches. Compared with repeatedly searching a re-sliced copy of the
    # text, it does not copy the remainder after every match, and it always
    # advances, so a pattern that can match the empty string cannot loop forever.
    unique_matches = {
        match.group().replace('\n', '').rstrip()
        for match in re.finditer(pattern, text)
    }
    return sorted(unique_matches)

# Candidate chapter headings: a "<digit>.<digit>" numbering prefix followed by
# the rest of the heading text (pattern drafted with https://regexr.com/).
chapter_heading_pattern = r'\b\d\.\d[ \.\d\w\s][ \.\d\w\s][ \.\d\w\s][ \.\d\w\s][A-Za-z0-9_]*.?(\n|.*)'
list_of_chapters = search_patterns(book, pattern=chapter_heading_pattern)
# Drop one match that is a measurement rather than a heading.
list_of_chapters.remove('0.0032 millimeter.')
list_of_chapters

['1.1 ADDRESSES AT THE PARLIAM',
 '1.1 ADDRESSES AT THE PARLIAMENT',
 '1.1. 1. (Sanskrit in ITRANS format)',
 '1.1.1 RESPONSE TO WELCOME',
 '1.1.2 WHY WE DISAGREE',
 '1.1.3 PAPER ON HINDUISM',
 '1.1.4 RELIGION NOT THE CRYING NEED OF INDIA',
 '1.1.5 BUDDHISM, THE FULFILMENT OF HINDUISM',
 '1.1.5 PAPER ON HINDUISM',
 '1.1.6 ADDRESS AT THE FINAL SESSION',
 '1.11.',
 '1.2 KARMA-YOGA',
 '1.2.1 CHAPTER 1: KARMA IN ITS EFFECT ON CHARACTER',
 '1.2.2 CHAPTER 2: EACH IS GREAT IN HIS OWN PLACE',
 '1.2.4 CHAPTER 4: WHAT IS DUTY?',
 '1.2.5 CHAPTER 3: THE SECRET OF WORK',
 '1.2.5 CHAPTER 5: WE HELP OURSELVES, NOT THE WORLD',
 '1.2.6 CHAPTER 6: NON-ATTACHMENT IS COMPLETE SELF-',
 '1.2.7 CHAPTER 7: FREED',
 '1.2.7 CHAPTER 7: FREEDOM',
 '1.2.8 CHAPTER 8: THE IDEAL OF K',
 '1.2.8 CHAPTER 8: THE IDEAL OF KARMA-YOGA',
 '1.2.9',
 '1.3 RAJA-YOGA',
 '1.3.0 PREFACE',
 '1.3.1 CHAPTER 1: INTRODUCTORY',
 '1.3.2 CHAPTER 2: THE FIRST STEPS',
 '1.3.3 CHAPTER 3: PRANA',
 '1.3.4 CHAPTER 4: THE PSYCHIC PRANA',
 '1.3.5

# Basic EDA<a class="anchor" id="chapter2"></a>

## Occurrence of sentences <a class="anchor" id="section_2_1"></a>

In [4]:
# Count sentences in everything that follows the first occurrence of the
# title below, using ". " as a rough sentence delimiter.
strating_sentence = "OUR MASTER AND HIS MESSAGE"
pattern = re.compile(strating_sentence)
starting_pointer = pattern.search(book).end()
print(
    "Number of sentences: ",
    len(book[starting_pointer:].split(". ")),
)

Number of sentences:  79340


## Occurrence of words <a class="anchor" id="section_2_2"></a>

In [5]:
# Cleaning pipeline applied to the whole book. `clean`, `remove_sw` and
# `clean_text` are imported from the project-local `processtext` module; their
# exact behavior is not visible in this notebook, so the notes below are hedged.
# NOTE(review): a later cell defines its own `remove_sw`. On a fresh
# Restart & Run All this cell uses the `processtext` version, but once that
# later cell has run, re-running this cell uses the notebook-defined one.
cleaned_text = clean(book.replace("’",""),extra_spaces= True, lowercase= True, punct= True,sw= True) # Basic preprocessing (right single quotes are dropped first)
cleaned_text = remove_sw(cleaned_text) # Removing stopwords
cleaned_text = cleaned_text.replace("—", '') # Dropping em dashes
cleaned_text = clean_text(cleaned_text) # presumably a further clean-up pass — TODO confirm against processtext

def remove_numerals_and_one_char_words(input_text):
    """Strip standalone numbers and single-character tokens from a text.

    Args:
        input_text (str): Text to filter

    Returns:
        str: The remaining tokens joined by single spaces
    """
    # Blank out digit runs delimited by word boundaries on both sides.
    without_numbers = re.sub(r'\b\d+\b', '', input_text)
    # Splitting on whitespace and re-joining with single spaces also
    # collapses any repeated whitespace.
    kept_tokens = [token for token in without_numbers.split() if len(token) > 1]
    return ' '.join(kept_tokens)

# Drop numerals and one-character tokens first, then lemmatize what is left.
cleaned_text = lemmatize(remove_numerals_and_one_char_words(cleaned_text))

# Word frequencies of the raw (uncleaned) book text, most frequent first.
# The counts are kept in `word_counts` rather than in a variable named `dict`,
# which would rebind the builtin for every later cell of the notebook.
pattern = re.compile("[a-zA-Z]+")
findings = re.findall(pattern, book.lower())
word_counts = {}
for word in findings:
    word_counts[word] = word_counts.get(word, 0) + 1
# (count, word) tuples so that sorting orders by count first, then by word.
dict_list = [(value,key) for (key, value) in word_counts.items()]
sorted(dict_list, reverse=True)

In [17]:
def word_frequency_counter(text_:str, ascending = False)->list:
    """Count how often each word occurs in a text.

    Words are runs of ASCII letters; the text is lower-cased before matching,
    so counting is case-insensitive.

    Args:
        text_ (str): String data
        ascending (bool, optional): Count sorting order. Defaults to False (descending order).

    Returns:
        list: ``(count, word)`` tuples sorted by count, then by word, in the
            requested order
    """
    word_counts = {}
    for word in re.findall("[a-zA-Z]+", text_.lower()):
        word_counts[word] = word_counts.get(word, 0) + 1
    # Count goes first in each tuple so that sorting orders by frequency.
    count_word_pairs = [(count, word) for word, count in word_counts.items()]
    return sorted(count_word_pairs, reverse=not ascending)

In [52]:
# # List of words we dont want to remove from the list of stop words
# list_of_words = []  # Add later
from nltk.corpus import stopwords
# # Creating a custom list of stopwords
# stopwords_ = set(stopwords.words('english'))
# s = set(list_of_words)
# custom_stopwords_ = list(stopwords_ - s)

def remove_sw(text: str,
              custom_stopwords = [],
              ignored_stopwords = [],
              default_language = 'english')-> str:
    """This function removes the stopwords

    The text is split on whitespace and every token whose lower-cased form is
    a stopword is dropped.

    Args:
        text (str): String object
        custom_stopwords (list, optional): Extra words to treat as stopwords (lower-cased before use). Defaults to [].
        ignored_stopwords (list, optional): Stopwords to keep in the text (lower-cased before use). Defaults to [].
        default_language (str, optional): Language of the NLTK stopword list to use. Defaults to 'english'.

    Returns:
        str: Remaining tokens joined by single spaces
    """
    # The list defaults are only read, never mutated, so sharing them between
    # calls is harmless.
    # A set gives constant-time membership tests; checking every token against
    # a list scans the whole stopword list each time.
    stopwords_ = set(stopwords.words(default_language))
    stopwords_.update(custom_word.lower() for custom_word in custom_stopwords)
    # Ignored words are removed last, so they win over custom stopwords.
    stopwords_.difference_update(ignored_word.lower() for ignored_word in ignored_stopwords)
    return " ".join(word for word in text.split() if word.lower() not in stopwords_)
        

In [54]:
# Sample multi-line input (docstring-like text) used to try out remove_sw.
txt2 =  """This function removes the stopwords
    
    Args:
        text (str): String Object
        custom_stopwords (list, optional): List of custom stopwords. Defaults to [].
        default_language (str, optional): Default language of the text. Defaults to 'english'.

    Returns:
        str: Modified text with removed stopwords
    """

In [50]:
# Short sample string for stopword-removal experiments.
# NOTE(review): not referenced by any other cell visible here.
txt = "I a am Ujjeal asrjg is the that aijerf"

In [56]:
# Try remove_sw with its default arguments on the multi-line sample text.
remove_sw(txt2)

"function removes stopwords Args: text (str): String Object custom_stopwords (list, optional): List custom stopwords. Defaults []. default_language (str, optional): Default language text. Defaults 'english'. Returns: str: Modified text removed stopwords"

In [2]:
# Scratch check of how numpy represents an array of strings.
# NOTE(review): numpy is not used by any other cell visible here.
import numpy as np
np.array(['a', 'b'])

array(['a', 'b'], dtype='<U1')

In [21]:
# set(stopwords.words('bengli'))

OSError: No such file or directory: '/home/ujjwal/nltk_data/corpora/stopwords/bengli'

In [14]:
from nltk.tokenize import word_tokenize
def sw_remover2(text_:str):
    """Remove English stopwords from a text tokenized with NLTK's word_tokenize.

    Args:
        text_ (str): String data

    Returns:
        str: Tokens whose lower-cased form is not an English stopword, joined by single spaces
    """
    wrd_tokens = word_tokenize(text_)
    # Fetch the stopword list once and keep it as a set. Calling
    # stopwords.words() inside the comprehension would request the list again
    # for every token and then scan it linearly.
    english_stopwords = set(stopwords.words('english'))
    new_filtered_words = [
        word for word in wrd_tokens if word.lower() not in english_stopwords]

    # Join the filtered words to form a clean text
    return ' '.join(new_filtered_words)


In [15]:
# Run both stopword removers on the same cleaned text
# (presumably to compare their results — TODO confirm).
cleaned_text2 = remove_sw(cleaned_text)
cleaned_text3 = sw_remover2(cleaned_text)

In [19]:
# Word frequencies of the cleaned text, least frequent first (ascending=True).
# NOTE(review): this displays the entire list; consider slicing the result
# (e.g. [:50]) to keep the notebook output readable.
word_frequency_counter(cleaned_text, True)

[(1, 'aaaasa'),
 (1, 'aaadareur'),
 (1, 'aaae'),
 (1, 'aaaea'),
 (1, 'aaaia'),
 (1, 'aaaraga'),
 (1, 'aaarcaitry'),
 (1, 'aaareraflerq'),
 (1, 'aaatsiaen'),
 (1, 'aacr'),
 (1, 'aae'),
 (1, 'aaediia'),
 (1, 'aaeiies'),
 (1, 'aaem'),
 (1, 'aafa'),
 (1, 'aafaryas'),
 (1, 'aafasiva'),
 (1, 'aafddad'),
 (1, 'aaffal'),
 (1, 'aafreat'),
 (1, 'aaftedt'),
 (1, 'aaftt'),
 (1, 'aaiaa'),
 (1, 'aalaia'),
 (1, 'aaleraaay'),
 (1, 'aalqeure'),
 (1, 'aamt'),
 (1, 'aana'),
 (1, 'aaneranieferste'),
 (1, 'aapo'),
 (1, 'aare'),
 (1, 'aaretaehas'),
 (1, 'aarfer'),
 (1, 'aaries'),
 (1, 'aasatanay'),
 (1, 'aasdafadaar'),
 (1, 'aasea'),
 (1, 'aastea'),
 (1, 'aataaat'),
 (1, 'aatesmal'),
 (1, 'aavaes'),
 (1, 'aavafd'),
 (1, 'aavanfa'),
 (1, 'aayi'),
 (1, 'aaysy'),
 (1, 'abaartwae'),
 (1, 'abab'),
 (1, 'abala'),
 (1, 'abandonment'),
 (1, 'abatement'),
 (1, 'abbess'),
 (1, 'abbesses'),
 (1, 'abbey'),
 (1, 'abbots'),
 (1, 'abbott'),
 (1, 'abdicate'),
 (1, 'abdominal'),
 (1, 'abe'),
 (1, 'abegging'),
 (1, 'abhydsa'