# ETL on PDF

## Imports

In [None]:
import os
import PyPDF2
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

## Data Loading 

In [None]:
def load_all_pdfs_pypdf2(folder_path):
    """Extract every non-empty text line from the PDF files in a folder.

    Only entries directly inside ``folder_path`` whose name ends in
    ``.pdf`` (case-insensitive) are read; sub-folders are not searched.
    Each page's extracted text is split on newlines, every line is
    stripped, and blank lines are skipped.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``file`` (file name),
        ``page`` (1-based page number) and ``line`` (stripped text), one
        row per extracted line. The columns are present even when nothing
        could be extracted, so downstream ``df['line']`` access is safe.
    """
    columns = ['file', 'page', 'line']
    all_data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any later "keep first" de-duplication) stable.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                for page_num, page in enumerate(reader.pages, start=1):
                    text = page.extract_text()
                    if not text:
                        # Nothing extractable on this page.
                        continue
                    for line in text.split('\n'):
                        clean_line = line.strip()
                        if clean_line:
                            all_data.append({
                                'file': filename,
                                'page': page_num,
                                'line': clean_line,
                            })
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole batch. Lines collected from it before the failure
            # are kept.
            print(f"Error reading {filename}: {e}")

    return pd.DataFrame(all_data, columns=columns)


## Clean PDF Data

In [None]:
def clean_pdf_df(df):
    """Return a cleaned copy of a DataFrame of extracted PDF lines.

    The cleaning steps, in order:

    1. drop rows whose ``line`` is null;
    2. strip leading/trailing whitespace from ``line``;
    3. drop rows whose ``line`` is empty after stripping;
    4. remove repeated words inside each line, keeping the first
       occurrence of every word (whitespace runs collapse to one space);
    5. drop rows whose ``line`` duplicates an earlier row;
    6. reset the index to ``0..n-1``.

    The input DataFrame is never modified.

    Args:
        df: DataFrame with a string column named ``line``. Other columns
            are carried through unchanged.

    Returns:
        A new DataFrame with the cleaned rows. An empty input yields an
        empty result.
    """
    if df.empty:
        # Nothing to clean; also covers a frame that has no columns at all.
        return df.reset_index(drop=True)

    # Work on a copy so the caller's frame is not mutated.
    cleaned = df.dropna(subset=['line']).copy()
    cleaned['line'] = cleaned['line'].str.strip()

    # .copy() so the assignment below targets a real frame, not a slice.
    cleaned = cleaned[cleaned['line'] != ''].copy()

    # dict.fromkeys keeps first-seen order while discarding repeats.
    cleaned['line'] = cleaned['line'].map(
        lambda text: ' '.join(dict.fromkeys(text.split()))
    )

    cleaned = cleaned.drop_duplicates(subset=['line'])
    return cleaned.reset_index(drop=True)

## Explore or Transform

In [None]:
def word_frequency(df):
    """Count how often each whitespace-separated word occurs in ``df['line']``.

    Args:
        df: DataFrame with a string column named ``line``.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences across all lines.
    """
    # Joining with a space first means no word is glued across line ends.
    combined_text = ' '.join(df['line'])
    tokens = combined_text.split()
    return Counter(tokens)

## Example Usage

In [None]:
folder_path = "data/pdfs"  # Folder scanned for PDF files; change to your own path
# Extract one row per text line, then strip, de-duplicate and re-index the rows.
pdf_df = load_all_pdfs_pypdf2(folder_path)
pdf_df = clean_pdf_df(pdf_df)
# Count the words of the cleaned lines (repeated words and lines were already removed).
word_count = word_frequency(pdf_df)
print(word_count.most_common(5))  # The five most frequent words with their counts
print(pdf_df.head())
print(pdf_df.tail())

## Visualize PDF

In [None]:
def plot_wordcloud(text):
    """Render ``text`` as a word cloud and display it with matplotlib.

    Args:
        text: The string whose words are drawn in the cloud.
    """
    cloud = WordCloud(width=1200, height=800, background_color='white')
    cloud.generate(text)

    plt.imshow(cloud, interpolation='bilinear')
    plt.title("Word Cloud from PDFs")
    plt.axis('off')  # Axes carry no meaning for an image.
    plt.show()

# Join every cleaned line into one space-separated string and draw its word cloud.
all_text = ' '.join(pdf_df['line'])
plot_wordcloud(all_text)