In [1]:
## Let's get the sample data into a particular format

In [4]:
import pandas as pd

In [34]:
import pandas as pd

# Define the sample data: one row per (user, raw bank-statement narration, category tag).
# The narrations keep the separators shown in the recorded outputs of this notebook:
# row 3 has a '/' between "1MG Techno" and the reference code, and row 7 starts with
# the space-separated prefix 'ME DC SI' (the form the cleaning step knows how to strip).
data = {
    'User': ['User1', 'User1', 'User2', 'User2', 'User1', 'User2', 'User3', 'User4', 'User5', 'User5', 'User5'],
    'Transaction': [
        'MPS/TRUFFLES /202303261700/034587/Bangalore',
        'MPS/TACO BELL /202304012247/108300/BANGALORE',
        'POS XXXXXXXXXXXX0001 APOLLO PHARMACY',
        'BIL/ONL/000471093694/1MG Techno/X7ZRUSVLURFQZO',
        'POS XXXXXXXXXXXX1111 DECATHLON SPORTS',
        'POS XXXXXXXXXXXX1111 IKEA INDIA PVT L',
        'POS XXXXXXXXXXXX1111 WWW AMAZON IN',
        'ME DC SI XXXXXXXXXXXX1111 SPOTIFY SI',
        'POS/NETFLIX/1140920002/100623/17:25',
        'POS XXXXXXXXXXXX1110 MAKEMYTRIP INDIA',
        'BIL/ONL/000691178015/IRCTCServ/XZZBX91LTCY1AZ'
    ],
    'Tag': ['Food', 'Food', 'Medical', 'Medical', 'Shopping', 'Shopping', 'Shopping', 'Subscription', 'Subscription', 'Travel', 'Travel']
}

# Create the DataFrame
df = pd.DataFrame(data)

# Display the first five rows
df.head()

Unnamed: 0,User,Transaction,Tag
0,User1,MPS/TRUFFLES /202303261700/034587/Bangalore,Food
1,User1,MPS/TACO BELL /202304012247/108300/BANGALORE,Food
2,User2,POS XXXXXXXXXXXX0001 APOLLO PHARMACY,Medical
3,User2,BIL/ONL/000471093694/1MG Techno/X7ZRUSVLURFQZO,Medical
4,User1,POS XXXXXXXXXXXX1111 DECATHLON SPORTS,Shopping


There is too little data here for us to work with, so let us create synthetic data modelled on these samples so that we have enough to train on.

In [48]:
import random
def generate_synthetic_data(num_users, places_by_tag, num_transactions_per_user, seed=None):
    """Build a DataFrame of synthetic, tagged transaction narrations.

    For every user ``User1`` .. ``User<num_users>`` this draws
    ``num_transactions_per_user`` transactions. Each transaction picks a random
    tag, then a place listed under that tag (preferring places this user has not
    been given yet), and wraps the place in one of two narration layouts, chosen
    with equal probability:

    * ``"<PREFIX> <10-15 X's> <PLACE> /<12 digits>/<6 digits>/BANGALORE"``
    * ``"<PREFIX>/<PLACE> /<12 digits>/<6 digits>/BANGALORE"``

    The 12-digit field is a random integer in ``202300000000..202399999999``; it
    only mimics the look of a timestamp and is usually not a valid date.

    Args:
        num_users: Number of users to generate; zero or less yields an empty frame.
        places_by_tag: Mapping of tag name to a sequence of place names.
        num_transactions_per_user: Number of rows generated per user.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the result is reproducible and the global ``random`` state
            is left untouched. When ``None`` (default) the module-level
            ``random`` generator is used, exactly as before.

    Returns:
        pandas.DataFrame with columns ``User``, ``Transaction`` and ``Tag``,
        ordered by user and then by generation order.

    Raises:
        IndexError: If a transaction has to be drawn while ``places_by_tag`` is
            empty or the chosen tag has no places.
    """
    # The module itself and a Random instance expose the same choice/random/randint API.
    rng = random if seed is None else random.Random(seed)

    # List of possible transaction prefixes
    transaction_prefixes = ['MPS', 'POS', 'BIL', 'ME DC SI']

    # The set of tags never changes inside the loops, so build the list once.
    tags = list(places_by_tag)

    synthetic_users = []
    synthetic_transactions = []
    synthetic_tags = []

    for i in range(1, num_users + 1):
        user = f'User{i}'
        # Places already handed to this user (across all tags).
        places_used = set()

        for _ in range(num_transactions_per_user):
            tag = rng.choice(tags)

            # Prefer a place this user has not seen; once the tag is exhausted,
            # fall back to the full list so places may repeat.
            candidates = places_by_tag[tag]
            unused_places = [p for p in candidates if p not in places_used]
            place = rng.choice(unused_places or candidates)
            places_used.add(place)

            transaction_prefix = rng.choice(transaction_prefixes)

            # 50% of narrations carry a masked card number (a run of 10-15 'X's).
            if rng.random() < 0.5:
                xs = 'X' * rng.randint(10, 15)
                head = f"{transaction_prefix} {xs} {place}"
            else:
                head = f"{transaction_prefix}/{place}"

            # Timestamp-like field first, then the 6-digit reference number.
            transaction = f"{head} /{rng.randint(202300000000, 202399999999)}/{rng.randint(100000, 999999)}/BANGALORE"

            synthetic_users.append(user)
            synthetic_transactions.append(transaction)
            synthetic_tags.append(tag)

    # Create a DataFrame from the synthetic data
    synthetic_data = pd.DataFrame({
        'User': synthetic_users,
        'Transaction': synthetic_transactions,
        'Tag': synthetic_tags
    })

    return synthetic_data


In [73]:
# Synthetic-data configuration: number of users, merchants available per tag,
# and how many transactions to draw for each user.
num_users = 20
places_by_tag = {
    'Food': ['TRUFFLES', 'TACO BELL', 'MCDONALDS', 'BURGER KING', 'HALDIRAMS', 'PARADISE BIRYANI', 'SARAVANA BHAVAN', 'KARIM’S', 'BARBEQUE NATION', 'DOMINOS PIZZA', 'MANOJ SWEETS', 'KANTI SWEETS', 'ESHA SWEETS', 'NAGARJUNA BIRYANI', 'MANIS DUM BIRYANI', 'MEGHNA FOODS'],
    'Medical': ['APOLLO PHARMACY', '1MG', 'MEDPLUS', 'NETMEDS', 'FORTIS', 'MAX HEALTHCARE', 'MEDANTA', 'AIIMS', 'COLUMBIA ASIA', 'NARAYANA HEALTH', 'ARUN MEDICAL', 'RAVI MEDICINE'],
    'Shopping': ['DECATHLON', 'IKEA', 'AMAZON', 'FLIPKART', 'BIG BAZAAR', 'RELIANCE DIGITAL', 'TANISHQ', 'FABINDIA', 'PANTALOONS', 'SHOPPERS STOP'],
    'Subscription': ['SPOTIFY', 'NETFLIX', 'PRIME VIDEO', 'HOTSTAR', 'ZEE5', 'SONY LIV', 'DISNEY+ HOTSTAR', 'ALT BALAJI', 'GAANA', 'VODAFONE RED', 'BUMBLE', 'TINDER'],
    'Travel': ['MAKEMYTRIP', 'IRCTC', 'YATRA', 'GOIBIBO', 'AIR INDIA', 'INDIGO', 'SPICEJET', 'OYO ROOMS', 'TRIVAGO', 'BOOKING.COM']
}

num_transactions_per_user = 50

# Seed the global generator so that Restart & Run All reproduces the same synthetic rows.
random.seed(42)
synthetic_data = generate_synthetic_data(num_users, places_by_tag, num_transactions_per_user)
synthetic_data.head()

Unnamed: 0,User,Transaction,Tag
0,User1,ME DC SI XXXXXXXXXX SHOPPERS STOP /20231215882...,Shopping
1,User1,POS/APOLLO PHARMACY /202339994391/268481/BANGA...,Medical
2,User1,POS/PRIME VIDEO /202331317775/123219/BANGALORE,Subscription
3,User1,MPS/INDIGO /202378233191/401159/BANGALORE,Travel
4,User1,ME DC SI/MEDANTA /202375483779/131136/BANGALORE,Medical


In [74]:
# Shuffle the rows with a fixed seed (reproducible order) and renumber the index from 0.
synthetic_data = synthetic_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [80]:
# Persist the shuffled synthetic transactions (raw, uncleaned text) without the index column.
synthetic_data.to_csv('synthetic_txn_data.csv',index=False)

For further use, we will clean the Transaction column to remove unnecessary strings.

In [76]:
import unidecode
import string
import re

# This function is used to clean the text data.
import unidecode
import string
import re

# This function is used to clean the text data.
def cleaning(s):
    """Normalise one raw transaction narration before tokenisation.

    Steps, in order:
      1. lowercase the text;
      2. strip one known transaction prefix (``mps``, ``pos``, ``bil``,
         ``me dc si``) from the start, but only when it is a whole token;
      3. replace every run of three or more ``x`` with a single space;
      4. transliterate accented characters to ASCII ("café" -> "cafe");
      5. replace every run of digits with a single ``%``.

    Args:
        s: Raw narration string, in any letter case.

    Returns:
        The cleaned, lowercase string. Separators such as ``/`` are kept.
    """
    # List of known transaction prefixes
    transaction_prefixes = ['mps', 'pos', 'bil', 'me dc si']

    # Lowercase first: the prefixes above and the 'x' pattern below are
    # lowercase, so matching them against mixed-case input would silently fail.
    s = s.lower()

    # Remove the transaction prefix if it is found at the start of the string
    for prefix in transaction_prefixes:
        if s.startswith(prefix):
            remainder = s[len(prefix):]
            # Only strip a whole token: the prefix must be followed by a
            # separator (or end of text), otherwise a merchant name such as
            # "billdesk" would be truncated to "ldesk".
            if not remainder or not remainder[0].isalnum():
                s = remainder.lstrip()  # drop the prefix and any leading whitespace
            break

    # Replace any sequence of three or more 'x's (masked card digits) with a space
    s = re.sub(r"x{3,}", " ", s)

    # Remove any accented characters. For example, "café" becomes "cafe".
    s = unidecode.unidecode(s)
    # Replace any sequence of digits with a single "%", since the specific
    # numbers are not useful for the task.
    s = re.sub(r"[0-9]+", "%", s)
    return s

    
def preprocessing(df):
    """Reduce the ``Transaction`` column of ``df`` to space-separated alphabetic tokens.

    Every value is lowercased, run through ``cleaning``, has each ASCII
    punctuation character swapped for a space, and is then rebuilt from only
    those space-delimited tokens that consist entirely of letters.

    Note: the ``Transaction`` column is overwritten on the frame that is passed
    in, and that same frame is returned. Pass a copy to keep the raw text.
    """
    # One-to-one table: each punctuation character maps to a single space.
    punct_to_space = str.maketrans({mark: ' ' for mark in string.punctuation})

    def _alphabetic_tokens(raw):
        # lower -> cleaning -> punctuation to spaces
        spaced = cleaning(raw.lower()).translate(punct_to_space)
        # Splitting on a literal space leaves empty strings for repeated
        # spaces; those are not alphabetic and get filtered out with the rest.
        kept = filter(str.isalpha, spaced.split(' '))
        return ' '.join(kept)

    df['Transaction'] = df['Transaction'].map(_alphabetic_tokens)
    return df

# Clean a copy so that `synthetic_data` keeps its raw narrations
# (preprocessing overwrites the Transaction column of the frame it receives).
reformed_data = preprocessing(synthetic_data.copy())

In [82]:
# Alias only: `ref_data` refers to the same DataFrame object as `reformed_data` (no copy is made).
ref_data = reformed_data

In [85]:
# Display the preprocessed synthetic transactions.
ref_data

Unnamed: 0,User,Transaction,Tag
0,User10,mg bangalore,Medical
1,User3,amazon bangalore,Shopping
2,User6,ravi medicine bangalore,Medical
3,User6,columbia asia bangalore,Medical
4,User9,meghna foods bangalore,Food
...,...,...,...
995,User4,medanta bangalore,Medical
996,User6,flipkart bangalore,Shopping
997,User13,flipkart bangalore,Shopping
998,User3,tanishq bangalore,Shopping


In [84]:
# Persist the preprocessed (cleaned and tokenised) transactions without the index column.
ref_data.to_csv('preprocessed_data.csv',index=False)

In [78]:
# Clean a copy: preprocessing overwrites the Transaction column of the frame it
# is given, so passing `df` directly would destroy the raw sample narrations.
preprocessing(df.copy())

Unnamed: 0,User,Transaction,Tag
0,User1,truffles bangalore,Food
1,User1,taco bell bangalore,Food
2,User2,apollo pharmacy,Medical
3,User2,onl mg techno x zrusvlurfqzo,Medical
4,User1,decathlon sports,Shopping
5,User2,ikea india pvt l,Shopping
6,User3,www amazon in,Shopping
7,User4,spotify si,Subscription
8,User5,netflix,Subscription
9,User5,makemytrip india,Travel


In [44]:
# from transformers import pipeline

# # Initialize a text generation pipeline with the gpt-3 model
# generator = pipeline('text-generation', model='gpt-3')

# def extract_merchant(transaction_text):
#     # Craft a prompt that instructs the model to extract the merchant name from the transaction text
#     prompt = f"I have a transaction text: '{transaction_text}'. What is the name of the merchant in this transaction?"

#     # Use the generator to answer the prompt
#     response = generator(prompt, max_length=100, do_sample=False)

#     # Extract the merchant name from the model's response
#     merchant_name = response[0]['generated_text'].split(':')[-1].strip()

#     return merchant_name

# transaction_text = "POS XXXXXXXXXXXX0001 APOLLO PHARMACY"
# print(extract_merchant(transaction_text))