# Part 1:Rule Based NLP and Regex:

In [16]:
import re
import pandas as pd
from tabulate import tabulate # to make data into table

## Using Regex and Rule Bases NLP we will generate a bill from a text given by the user.
## Use case:

In [2]:
text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram and one Hamburger with 4,5 dollar"

## We define stopwords in our use case to eliminate unnecessary words disturbing our item extraction

In [3]:
stop_words = ["bought", "for", "fresh", "i", "each", "kilos", "of", "a", "an", "with", "and", "dollar", "kilogram"]

## We remove the stop words from our text

In [4]:
# Supprimer les mots vides
cleaned_text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
cleaned_text

'three Samsung smartphones 150 $ each, four banana 1,2 one Hamburger 4,5'

In [5]:
item_pattern = r'(?:[A-Za-z]+\s)+[A-Za-z]+'
items = re.findall(item_pattern, cleaned_text)
items

['three Samsung smartphones', 'four banana', 'one Hamburger']

In [6]:
quantity_pattern = r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b'
quantities = re.findall(quantity_pattern, cleaned_text)
quantities

['three', 'four', 'one']

## Removing the "quantities" words from the "items" as they are also words 

In [7]:
def extract_items_without_quantities(items, quantities):
    # Parcourir les éléments et les quantités
    for i, item in enumerate(items):
        for quantity in quantities:
            # Supprimer la quantité de l'élément
            item = item.replace(quantity, '').strip()
        # Mettre à jour l'élément dans la liste des éléments
        items[i] = item
    return items 

In [8]:
items = [re.sub(quantity_pattern, '', item).strip() for item in items]
items

['Samsung smartphones', 'banana', 'Hamburger']

In [9]:
# Testons la fonction
test_items = ["three Samsung smartphones", "four kilos of banana", "one Hamburger"]
test_quantities = ["three", "four", "one"]

print("Test Items before:", test_items)
print("Test Quantities:", test_quantities)

test_items = extract_items_without_quantities(test_items, test_quantities)
print("Test Items after:", test_items)

Test Items before: ['three Samsung smartphones', 'four kilos of banana', 'one Hamburger']
Test Quantities: ['three', 'four', 'one']
Test Items after: ['Samsung smartphs', 'kilos of banana', 'Hamburger']


## Extract the numbers int or float from the text 

In [10]:
price_pattern = r'\d+(?:\,\d+)?'
prices = re.findall(price_pattern, text)
prices

['150', '1,2', '4,5']

## Converting the quantities of the items from string to int by mapping them 

In [11]:
quantity_mapping = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
                        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}
quantities = [quantity_mapping[q.lower()] for q in quantities]
quantities

[3, 4, 1]

## Converting the unit prices extracted from string to float and calculating the total price 

In [12]:
prices = [float(p.replace(',', '.')) for p in prices]

total_prices = [quantity * price for quantity, price in zip(quantities, prices)]
total_prices

[450.0, 4.8, 4.5]

## Creating a dataframe with our extracted data 

In [13]:
# Créer un DataFrame
df = pd.DataFrame({
    'Product': items,
    'Quantity': quantities,
    'Unit Price': prices,
    'Total Price': total_prices
})
df

Unnamed: 0,Product,Quantity,Unit Price,Total Price
0,Samsung smartphones,3,150.0,450.0
1,banana,4,1.2,4.8
2,Hamburger,1,4.5,4.5


In [14]:
def generate_bill(text):
    # Définir les mots vides
    stop_words = ["bought", "for", "fresh", "i", "each", "kilos", "of", "a", "an", "with", "and", "dollar", "kilogram"]

    # Supprimer les mots vides
    cleaned_text = ' '.join(word for word in text.split() if word.lower() not in stop_words)

    # Définir les motifs regex pour correspondre aux éléments, quantités et prix
    item_pattern = r'(?:[A-Za-z]+\s)+[A-Za-z]+'
    price_pattern = r'\d+(?:\,\d+)?'
    quantity_pattern = r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b'

    # Extraire les éléments, les quantités et les prix à l'aide des expressions régulières
    items = re.findall(item_pattern, cleaned_text)
    prices = re.findall(price_pattern, text)
    quantities = re.findall(quantity_pattern, text)

    # Supprimer les quantités des éléments
    items = [re.sub(quantity_pattern, '', item).strip() for item in items]

    # Convertir les quantités en integers
    quantity_mapping = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
                        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}
    quantities = [quantity_mapping[q.lower()] for q in quantities]

    # Convertir les prix en nombres floats
    prices = [float(p.replace(',', '.')) for p in prices]

    # Calculer total prices
    total_prices = [quantity * price for quantity, price in zip(quantities, prices)]

    # Créer un DataFrame
    df = pd.DataFrame({
        'Product': items,
        'Quantity': quantities,
        'Unit Price': prices,
        'Total Price': total_prices
    })

    # Imprimer Bill  sous forme de tableau
    print("Generated Bill:")
    print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))

text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram and one Hamburger with 4,5 dollar"
generate_bill(text)

Generated Bill:
+---------------------+------------+--------------+---------------+
| Product             |   Quantity |   Unit Price |   Total Price |
| Samsung smartphones |          3 |        150   |         450   |
+---------------------+------------+--------------+---------------+
| banana              |          4 |          1.2 |           4.8 |
+---------------------+------------+--------------+---------------+
| Hamburger           |          1 |          4.5 |           4.5 |
+---------------------+------------+--------------+---------------+


In [15]:
df

Unnamed: 0,Product,Quantity,Unit Price,Total Price
0,Samsung smartphones,3,150.0,450.0
1,banana,4,1.2,4.8
2,Hamburger,1,4.5,4.5


# Potential improvements could be generalizing it to any user input