In [None]:
!pip install pdfplumber

In [None]:
import pdfplumber
import pandas as pd

def extract_data_from_pdf(pdf_file_path):
    """Extract labelled field values from every page of a PDF.

    Each page's text is scanned line by line. A line is claimed by the first
    known label it contains (labels are checked in a fixed priority order),
    and the text following the last occurrence of that label is stored under
    the corresponding field name. When several lines on one page match the
    same field, the last matching line wins.

    Args:
        pdf_file_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one dict per page, in page order, mapping field names
        (e.g. ``"Journal Date"``, ``"Denominations"``) to the stripped text
        found after the label. A page with no matching line (or no
        extractable text) contributes an empty dict.
    """
    # (label searched for in the line, key the value is stored under).
    # Order matters: the first label found in a line claims that line.
    field_labels = (
        ("Sr No.", "Sr No."),
        # A line carrying only the URN label is still stored under "Sr No.",
        # but is split on the label that actually matched so the label text
        # itself does not end up in the stored value.
        ("Reference No (URN)", "Sr No."),
        ("Journal Date", "Journal Date"),
        ("Date of Purchase", "Date of Purchase"),
        ("Date of Expiry", "Date of Expiry"),
        ("Name of the Purchaser", "Name of the Purchaser"),
        ("Prefix", "Prefix"),
        ("Bond Number", "Bond Number"),
        ("Denominations", "Denominations"),
        ("Issue Branch Code", "Issue Branch Code"),
        ("Issue Teller", "Issue Teller"),
        ("Status", "Status"),
    )

    data = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            # Guard against pages without extractable text: a None result
            # would otherwise crash on .split().
            text = page.extract_text() or ""
            page_data = {}
            for line in text.split('\n'):
                for label, key in field_labels:
                    if label in line:
                        # Text after the last occurrence of the label.
                        page_data[key] = line.rpartition(label)[2].strip()
                        break
            # One entry per page, even when nothing matched.
            data.append(page_data)
    return data

# Provide the path to your PDF file
pdf_file_path = "data/Electoral Bonds Purchase.pdf"
extracted_data = extract_data_from_pdf(pdf_file_path)

# Convert extracted data to DataFrame (one row per page dict)
df = pd.DataFrame(extracted_data)

# Make sure the two columns the analysis relies on exist, so the steps below
# degrade to "no data" instead of raising KeyError when nothing was extracted.
for required_column in ('Name of the Purchaser', 'Denominations'):
    if required_column not in df.columns:
        df[required_column] = float('nan')

# Convert 'Denominations' column to numeric. Thousands separators are removed
# first: a value such as "10,00,000" would otherwise be coerced to NaN and the
# row silently dropped. Non-string values (e.g. NaN) pass through unchanged.
denominations_text = df['Denominations'].map(
    lambda value: value.replace(',', '') if isinstance(value, str) else value
)
df['Denominations'] = pd.to_numeric(denominations_text, errors='coerce')

# Drop rows with NaN values in 'Name of the Purchaser' or 'Denominations'
df.dropna(subset=['Name of the Purchaser', 'Denominations'], inplace=True)

# Group by 'Name of the Purchaser' and sum 'Denominations' for each company
company_totals = df.groupby('Name of the Purchaser')['Denominations'].sum()

# Visualize the data
import matplotlib.pyplot as plt

if company_totals.empty:
    # Plotting an empty Series raises; report the situation instead.
    print("No purchaser/denomination data could be extracted; nothing to plot.")
else:
    plt.figure(figsize=(10, 6))
    company_totals.plot(kind='bar')
    plt.title('Total Denominations Purchased by Company')
    plt.xlabel('Company')
    plt.ylabel('Total Denominations')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()