In [1]:
!pip install requests beautifulsoup4 pandas openpyxl PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import os


In [3]:
def extract_credit_cards_from_pdf(pdf_path):
    """Read every page of a PDF and parse credit card details from its text.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        Whatever ``parse_credit_card_text`` returns for the combined text of
        all pages.
    """
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        # ``or ''`` guards against a page for which no text could be extracted.
        page_texts = [page.extract_text() or '' for page in reader.pages]

    # Pages are joined with a newline: the parser works line by line, so the
    # last line of one page must not be fused with the first line of the next.
    return parse_credit_card_text('\n'.join(page_texts))

def parse_credit_card_text(text):
    """Parse ``Label: value`` lines of extracted text into card dictionaries.

    A line whose label contains ``Card Name`` starts a new card; recognised
    lines that follow are attached to that card until the next ``Card Name``.
    Unrecognised lines are ignored.

    Keywords are matched against the label (the text before the first colon)
    so that a value mentioning another field, e.g.
    ``Other Features: EMI conversion``, is not filed under the wrong key.
    The value is everything after the first colon, so values that themselves
    contain colons are kept whole. Lines without a colon are matched and
    stored as the whole line.

    Args:
        text: Raw text, one field per line.

    Returns:
        A list of dicts, one per card. Possible keys: ``Card Name``,
        ``Joining Fee``, ``Annual Fee``, ``Rewards / Cashback``,
        ``Lounge Access``, ``Fuel Surcharge Waiver``, ``EMI Options`` and
        ``Other Features``. Multiple ``Other Features`` lines are combined
        into a single ``'; '``-separated string, matching the format produced
        by ``extract_credit_cards_from_website``.
    """
    # (keyword looked for in the label, output key); order is the priority.
    field_keywords = (
        ('Joining Fee', 'Joining Fee'),
        ('Annual Fee', 'Annual Fee'),
        ('Rewards', 'Rewards / Cashback'),
        ('Lounge Access', 'Lounge Access'),
        ('Fuel Surcharge', 'Fuel Surcharge Waiver'),
        ('EMI', 'EMI Options'),
    )

    cards = []
    card_info = {}

    for line in text.split('\n'):
        label, colon, value = line.partition(':')
        if not colon:
            # No colon on the line: the whole line serves as the value.
            value = line
        value = value.strip()

        if 'Card Name' in label:
            # A new card begins; store the one collected so far, if any.
            if card_info:
                cards.append(card_info)
            card_info = {'Card Name': value}
            continue

        for keyword, key in field_keywords:
            if keyword in label:
                card_info[key] = value
                break
        else:
            if 'Other Features' in label:
                card_info.setdefault('Other Features', []).append(value)

    if card_info:
        cards.append(card_info)

    # Flatten the collected feature lists so every value is a plain string.
    for card in cards:
        if 'Other Features' in card:
            card['Other Features'] = '; '.join(card['Other Features'])

    return cards


In [4]:
def extract_credit_cards_from_website(url, timeout=30):
    """Download a web page and extract credit card details from it.

    Every ``<div class="card-details">`` is treated as one card: its first
    ``<h3>`` supplies the card name and each ``<li>`` is classified by the
    keywords it contains.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A list of dicts, one per card section that yielded any data. Possible
        keys: ``Card Name``, ``Joining Fee``, ``Annual Fee``,
        ``Rewards / Cashback``, ``Lounge Access``, ``Fuel Surcharge Waiver``,
        ``EMI Options`` and ``Other Features`` (a ``'; '``-separated string).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    cards = []
    for section in soup.find_all('div', class_='card-details'):
        card_info = {}

        # Extract Card Name
        name_tag = section.find('h3')
        if name_tag is not None:
            card_info['Card Name'] = name_tag.get_text(strip=True)

        # Extract Features
        other_features = []
        for feature in section.find_all('li'):
            text = feature.get_text(strip=True)
            if 'Joining Fee' in text:
                # Split on the first colon only so a value that itself
                # contains a colon is not truncated.
                card_info['Joining Fee'] = text.split(':', 1)[-1].strip()
            elif 'Annual Fee' in text:
                card_info['Annual Fee'] = text.split(':', 1)[-1].strip()
            elif 'Cashback' in text or 'Reward' in text:
                card_info['Rewards / Cashback'] = text
            elif 'Lounge' in text:
                card_info['Lounge Access'] = text
            elif 'Fuel' in text:
                card_info['Fuel Surcharge Waiver'] = text
            elif 'EMI' in text:
                card_info['EMI Options'] = text
            else:
                other_features.append(text)

        # Combine other features into a single string
        if other_features:
            card_info['Other Features'] = '; '.join(other_features)

        # Skip sections that produced nothing, so no blank row is exported.
        if card_info:
            cards.append(card_info)

    return cards


In [5]:
def main():
    """Interactively extract credit card data and export it to Excel and CSV.

    Prompts for a source type (``pdf`` or ``url``) and then for the file path
    or web address. The extracted cards are written to ``credit_cards.xlsx``
    and ``credit_cards.csv`` in the current working directory. Nothing is
    written when the input is invalid or no cards are found.
    """
    source_type = input("Enter 'pdf' or 'url': ").strip().lower()

    if source_type == 'pdf':
        pdf_path = input("Enter PDF file path: ").strip()
        if not os.path.exists(pdf_path):
            print("Invalid file path!")
            return
        credit_card_data = extract_credit_cards_from_pdf(pdf_path)

    elif source_type == 'url':
        url = input("Enter website URL: ").strip()
        credit_card_data = extract_credit_cards_from_website(url)

    else:
        print("Invalid input. Please enter 'pdf' or 'url'.")
        return

    if not credit_card_data:
        print("No cards found or the structure has changed.")
        # Stop here: there is no DataFrame to export. Previously the CSV
        # export still ran on this path and failed on the undefined `df`.
        return

    df = pd.DataFrame(credit_card_data)
    df.to_excel("credit_cards.xlsx", index=False)
    print("Exported to credit_cards.xlsx ✅")

    # Optional: Save as CSV too
    df.to_csv("credit_cards.csv", index=False)
    print("Also exported to credit_cards.csv ✅")


In [8]:
# Run the interactive workflow: prompts for a source, then writes the export files.
main()


Enter 'pdf' or 'url': url
Enter website URL: https://www.hdfcbank.com/personal/pay/cards
Exported to credit_cards.xlsx ✅
Also exported to credit_cards.csv ✅


In [9]:
from google.colab import files

# Download the exported files. main() writes nothing when no cards were found,
# so skip any file that does not exist instead of failing on it.
for export_path in ("credit_cards.xlsx", "credit_cards.csv"):
    if os.path.exists(export_path):
        files.download(export_path)
    else:
        print(f"Skipping {export_path}: file not found.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>