In [55]:
import pandas as pd 
import numpy as np
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [56]:
!pip install pypdf



In [57]:
from pypdf import PdfReader
import os

In [58]:
def extract_text_from_pdfs(pdf_folder):
    """
    Extracts text from all PDFs in a given folder and returns a Pandas DataFrame.

    Args:
        pdf_folder (str): Path to the folder containing PDF files.

    Returns:
        pd.DataFrame: A DataFrame containing extracted text with columns: 
                      ["File Name", "Page Number", "Text"]
    """
    all_data = []

    # List all PDF files in the given folder
    pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)
        reader = PdfReader(pdf_path)

        # Extract text from each page
        for page_number, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:  # Avoid empty pages
                all_data.append([pdf_file, page_number + 1, text])

    # Convert to DataFrame
    df = pd.DataFrame(all_data, columns=["File Name", "Page Number", "Text"])
    return df

# Example Usage:
pdf_folder = "research_papers"  # Replace with the actual folder path
df = extract_text_from_pdfs(pdf_folder)

# Display the extracted text
df.to_csv("extracted_text.csv", index=False)
print(df)


                                            File Name  Page Number  \
0                                    2305.05422v1.pdf            1   
1                                    2305.05422v1.pdf            2   
2                                    2305.05422v1.pdf            3   
3                                    2305.05422v1.pdf            4   
4                                    2305.05422v1.pdf            5   
5                                    2305.05422v1.pdf            6   
6                                    2305.05422v1.pdf            7   
7                                    2305.05422v1.pdf            8   
8                                    2305.05422v1.pdf            9   
9                                    2305.05422v1.pdf           10   
10  A_Conceptual_Model_for_Implementing_Explainabl...            1   
11  A_Conceptual_Model_for_Implementing_Explainabl...            2   
12  A_Conceptual_Model_for_Implementing_Explainabl...            3   
13  A_Conceptual_Mod

In [59]:
df.head(20)

Unnamed: 0,File Name,Page Number,Text
0,2305.05422v1.pdf,1,L. Erculiani et al. /\nEgocentric Hierarchical...
1,2305.05422v1.pdf,2,L. Erculiani et al. /\nMusical instrumentsG: d...
2,2305.05422v1.pdf,3,L. Erculiani et al. /\nv1\n v2\n v3\nE\nFigure...
3,2305.05422v1.pdf,4,L. Erculiani et al. /\nAlgorithm 1The main loo...
4,2305.05422v1.pdf,5,L. Erculiani et al. /\nAlgorithm 2The procedur...
5,2305.05422v1.pdf,6,L. Erculiani et al. /\ncurrentGenus candidateG...
6,2305.05422v1.pdf,7,L. Erculiani et al. /\nFigure 5. Comparison be...
7,2305.05422v1.pdf,8,L. Erculiani et al. /\n5. Related work\nOur wo...
8,2305.05422v1.pdf,9,L. Erculiani et al. /\nReferences\n[1] Kuang Z...
9,2305.05422v1.pdf,10,"L. Erculiani et al. /\n[28] Fu Y , Dong H, Ma ..."


In [60]:
! pip install spacy



In [61]:
# Data preprocessing
df.describe()

Unnamed: 0,Page Number
count,46.0
mean,5.826087
std,3.535875
min,1.0
25%,3.0
50%,5.5
75%,8.0
max,14.0


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   File Name    46 non-null     object
 1   Page Number  46 non-null     int64 
 2   Text         46 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.2+ KB


In [63]:
# check for null values
df.isnull().sum()

File Name      0
Page Number    0
Text           0
dtype: int64

In [64]:
# proceed with nlp processing 
import spacy 
import nltk # natural language package similar to spacy 
import string
import re # regular expression
from nltk.corpus import stopwords
from tqdm import tqdm
import pandas as pd
import numpy as np 

In [65]:
# Lowering the text in the data df
df['Text_lowered'] = df['Text'].str.lower()


In [66]:
df.head()

Unnamed: 0,File Name,Page Number,Text,Text_lowered
0,2305.05422v1.pdf,1,L. Erculiani et al. /\nEgocentric Hierarchical...,l. erculiani et al. /\negocentric hierarchical...
1,2305.05422v1.pdf,2,L. Erculiani et al. /\nMusical instrumentsG: d...,l. erculiani et al. /\nmusical instrumentsg: d...
2,2305.05422v1.pdf,3,L. Erculiani et al. /\nv1\n v2\n v3\nE\nFigure...,l. erculiani et al. /\nv1\n v2\n v3\ne\nfigure...
3,2305.05422v1.pdf,4,L. Erculiani et al. /\nAlgorithm 1The main loo...,l. erculiani et al. /\nalgorithm 1the main loo...
4,2305.05422v1.pdf,5,L. Erculiani et al. /\nAlgorithm 2The procedur...,l. erculiani et al. /\nalgorithm 2the procedur...


In [67]:
#removal of unwanted charecters and punctuations
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [68]:
df['Text_cleaned'] = df['Text_lowered'].apply(lambda x: clean_text(x))

In [69]:
df.head()

Unnamed: 0,File Name,Page Number,Text,Text_lowered,Text_cleaned
0,2305.05422v1.pdf,1,L. Erculiani et al. /\nEgocentric Hierarchical...,l. erculiani et al. /\negocentric hierarchical...,l erculiani et al \negocentric hierarchical vi...
1,2305.05422v1.pdf,2,L. Erculiani et al. /\nMusical instrumentsG: d...,l. erculiani et al. /\nmusical instrumentsg: d...,l erculiani et al \nmusical instrumentsg devic...
2,2305.05422v1.pdf,3,L. Erculiani et al. /\nv1\n v2\n v3\nE\nFigure...,l. erculiani et al. /\nv1\n v2\n v3\ne\nfigure...,l erculiani et al \nv1\n v2\n v3\ne\nfigure 2 ...
3,2305.05422v1.pdf,4,L. Erculiani et al. /\nAlgorithm 1The main loo...,l. erculiani et al. /\nalgorithm 1the main loo...,l erculiani et al \nalgorithm 1the main loop o...
4,2305.05422v1.pdf,5,L. Erculiani et al. /\nAlgorithm 2The procedur...,l. erculiani et al. /\nalgorithm 2the procedur...,l erculiani et al \nalgorithm 2the procedure p...


In [70]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------- ----------------------- 5.2/12.8 MB 53.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 44.5 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [71]:
# Lemmatization
nlp = spacy.load("en_core_web_sm", disable={"ner"})

In [72]:
# Lemmatization of df
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [73]:
df['Text_lemmatized'] = df['Text_cleaned'].apply(lambda x: lemmatize_text(x))

In [74]:
df.head()

Unnamed: 0,File Name,Page Number,Text,Text_lowered,Text_cleaned,Text_lemmatized
0,2305.05422v1.pdf,1,L. Erculiani et al. /\nEgocentric Hierarchical...,l. erculiani et al. /\negocentric hierarchical...,l erculiani et al \negocentric hierarchical vi...,l erculiani et al \n egocentric hierarchical v...
1,2305.05422v1.pdf,2,L. Erculiani et al. /\nMusical instrumentsG: d...,l. erculiani et al. /\nmusical instrumentsg: d...,l erculiani et al \nmusical instrumentsg devic...,l erculiani et al \n musical instrumentsg devi...
2,2305.05422v1.pdf,3,L. Erculiani et al. /\nv1\n v2\n v3\nE\nFigure...,l. erculiani et al. /\nv1\n v2\n v3\ne\nfigure...,l erculiani et al \nv1\n v2\n v3\ne\nfigure 2 ...,l erculiani et al \n v1 \n v2 \n v3 \n e \n ...
3,2305.05422v1.pdf,4,L. Erculiani et al. /\nAlgorithm 1The main loo...,l. erculiani et al. /\nalgorithm 1the main loo...,l erculiani et al \nalgorithm 1the main loop o...,l erculiani et al \n algorithm 1the main loop ...
4,2305.05422v1.pdf,5,L. Erculiani et al. /\nAlgorithm 2The procedur...,l. erculiani et al. /\nalgorithm 2the procedur...,l erculiani et al \nalgorithm 2the procedure p...,l erculiani et al \n algorithm 2the procedure ...


In [85]:
import nltk
nltk.download("punkt")  # Sentence and word tokenization
nltk.download("stopwords")  # If you use stopwords
nltk.download("wordnet")  # If using lemmatization
nltk.download("averaged_perceptron_tagger")  # If using POS tagging


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ramya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ramya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ramya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [84]:
import nltk
print(nltk.__version__)

3.9.1


In [88]:
import nltk
nltk.data.path.append(r"C:\Users\ramya\OneDrive\Desktop\Ramya\VU - Masters\period_3\Kg_group_assignment\nltk_data")

In [89]:
import nltk
print(nltk.data.path)


['C:\\Users\\ramya/nltk_data', 'c:\\Users\\ramya\\anaconda3\\nltk_data', 'c:\\Users\\ramya\\anaconda3\\share\\nltk_data', 'c:\\Users\\ramya\\anaconda3\\lib\\nltk_data', 'C:\\Users\\ramya\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data', 'nltk_data', 'C:\\Users\\ramya\\OneDrive\\Desktop\\Ramya\\VU - Masters\\period_3\\Kg_group_assignment\\nltk_data']


In [90]:
#tokenization of lemmatized text
from nltk.tokenize import word_tokenize
def tokenize_text(text):
    return word_tokenize(text)

In [91]:
df['Text_tokens'] = df['Text_lemmatized'].apply(lambda x: tokenize_text(x))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\ramya/nltk_data'
    - 'c:\\Users\\ramya\\anaconda3\\nltk_data'
    - 'c:\\Users\\ramya\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\ramya\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\ramya\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'nltk_data'
    - 'C:\\Users\\ramya\\OneDrive\\Desktop\\Ramya\\VU - Masters\\period_3\\Kg_group_assignment\\nltk_data'
**********************************************************************


In [None]:
#removing stopwords from tokens
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens