#### Description summarizer

In [2]:
import spacy

# Load the spaCy English pipeline "en_core_web_sm" once at module level;
# summarize_description reuses this shared `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

# Tokens that must never appear in a summary. Lookups compare the exact token
# text, so matching is case-sensitive (hence both "Bank" and "BANK").
keywords_to_exclude = {
    # Payment rails / channels
    "UPI",
    "NEFT",
    "NET",
    "P2M",
    # Debit / credit markers
    "DR",
    "CR",
    # Generic banking words, in the casings seen so far
    "Bank",
    "BANK",
    "Payment",
    "PAYMENT",
    "Yes",
    "MUM",
}

def summarize_description(description):
    """Summarize a transaction narration as its proper nouns.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    token tagged ``PROPN`` is kept, except tokens whose text is listed in
    ``keywords_to_exclude`` and tokens containing a digit (reference numbers,
    account ids and similar). The kept tokens are joined with single spaces
    in document order.

    Args:
        description: Narration text. Non-string values (``None``, or the
            float NaN that pandas typically yields for an empty CSV cell)
            and blank strings are treated as having nothing to summarize.

    Returns:
        str: The space-joined proper nouns, or ``""`` when there are none.
        No fallback summary is produced in that case.
    """
    # Guard before calling spaCy: nlp() raises on non-string input, which
    # would abort a whole DataFrame.apply() over a column with missing cells.
    if not isinstance(description, str) or not description.strip():
        return ""

    doc = nlp(description)

    proper_nouns = [
        token.text
        for token in doc
        if token.pos_ == "PROPN"
        and token.text not in keywords_to_exclude
        # Tokens with any digit are ids/references, not names.
        and not any(char.isdigit() for char in token.text)
    ]

    # Joining an empty list yields "", matching the documented no-result value.
    return " ".join(proper_nouns)

# Sample narrations for trying out summarize_description, grouped by the
# prefix each string carries.

# UPI transfers
description = "UPI/P2M/360994847130/ARUN THAK/Yes Bank/Payment"
description2 = "UPI/CR/459033982196/MITHUNG/HDFC/**FEB22@YBL/PAYMENT//YBL5625707364E34B09B556CCCB9F87FE6A/11/08/202423:20:53"
description6 = "UPI-AFSANA-PAYTMQRPZ1UDW8HUU@PAYTM-YESB0, PTMUPI-421625738999-PAYMENT FROM PHONE"
description7 = "UPI-CARATLANE A TANISHQ-DIGIGOLDCARATLA, NE@YBL-YESB0YBLUPI-421698116834-GOLD WIL, L BE PURCH"
description8 = "UPI-SATISH MAHENDRA PAN-SATISHPANCHAL65, 1@OKSBI-SBIN0018360-421727965746-BADMINTON"
description9 = "UPI-LEO SPORTS-Q370307376@YBL-YESB0YBLUP, I-458382538714-PAYMENT FROM PHONE"

# NEFT transfers
description1 = "MB NEFT DR P219240341079906 RAGHAVENDRA K M SBIN0011291 40688305525 MUTUAL FUND INVESTMENT"
description4 = "NEFT DR-SBIN0014517-BINAY KUMAR SHAW-NET, BANK, MUM-N222160177190622"

# Other formats (NACH, TPT)
description3 = "NACH INDIANCLEARINGCORP YN9XZNQKUZPH CNRB7030702244001099"
description5 = "50100529373291-TPT-JULY CAR EMI-GOPIKRISHNA DAS"

# Summarize one of the samples and show the result.
summary = summarize_description(description1)
print(summary)

RAGHAVENDRA K M MUTUAL FUND INVESTMENT


#### Add a new column (note) by summarizing the description column

In [None]:
import pandas as pd
from IPython.display import display

# Read the source CSV; it must contain a 'Narration' column.
csv_path = "/content/drive/MyDrive/SampleBankPdfs/outputs/hdfc_camelot_stream.csv"
df = pd.read_csv(csv_path)

# Add a 'note' column holding the summary of each row's narration.
df['note'] = df['Narration'].apply(func=summarize_description)

# Render the augmented table in the notebook.
display(df)

# Save the result next to the input, leaving out the DataFrame index.
df.to_csv(
    '/content/drive/MyDrive/SampleBankPdfs/outputs/hdfc_summarized.csv',
    index=False,
)