In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
filepath1 = "dataset/providers_data.csv"
filepath2 = "dataset/receivers_data.csv"
filepath3 = "dataset/claims_data.csv"
filepath4 = "dataset/food_listings_data.csv"

In [65]:
provider_df = pd.read_csv(filepath1)

provider_null = provider_df.isnull().sum()
provider_dupli = provider_df.duplicated().sum()

print(f'\nProvider Null: {provider_null}')
print(f'\nProvider Duplicate: {provider_dupli}')


Provider Null: Provider_ID    0
Name           0
Type           0
Address        0
City           0
Contact        0
dtype: int64

Provider Duplicate: 0


In [7]:
provider_df

Unnamed: 0,Provider_ID,Name,Type,Address,City,Contact
0,1,Gonzales-Cochran,Supermarket,"74347 Christopher Extensions\nAndreamouth, OK ...",New Jessica,+1-600-220-0480
1,2,"Nielsen, Johnson and Fuller",Grocery Store,"91228 Hanson Stream\nWelchtown, OR 27136",East Sheena,+1-925-283-8901x6297
2,3,Miller-Black,Supermarket,"561 Martinez Point Suite 507\nGuzmanchester, W...",Lake Jesusview,001-517-295-2206
3,4,"Clark, Prince and Williams",Grocery Store,"467 Bell Trail Suite 409\nPort Jesus, IA 61188",Mendezmouth,556.944.8935x401
4,5,Coleman-Farley,Grocery Store,"078 Matthew Creek Apt. 319\nSaraborough, MA 53978",Valentineside,193.714.6577
...,...,...,...,...,...,...
995,996,"Vasquez, Ruiz and Flowers",Restaurant,"84308 Justin Stravenue\nNew Amberside, NE 53447",Williamview,+1-319-378-7627x0682
996,997,Garza-Williams,Catering Service,"08864 Figueroa Radial Suite 948\nJennaberg, AZ...",East Rossside,001-924-441-3963x746
997,998,Novak Group,Grocery Store,"934 Zachary Run\nMelissamouth, WY 02729",Joshuastad,(903)642-1969x3300
998,999,Moody Ltd,Grocery Store,"17580 Ernest Hills\nLake Michaelmouth, OR 56416",Stevenchester,637.300.3664x4880


In [66]:
if 'Address' in provider_df.columns:
    provider_df['Address'] = provider_df['Address'].astype(str).replace(r'[\n,]', ' ', regex=True)

# --- Contact normalization ---
def normalize_contact(contact):
    if pd.isna(contact):
        return ""
    contact = str(contact).strip()
    contact = re.sub(r'(?i)[\s\-.,]*(ext|x)[\s\-.,]*\d+', '', contact)
    main = re.sub(r'[^\d+]', '', contact)
    if not main:
        return ""
    if main.startswith('00'):
        main = '+' + main[2:]
    if not main.startswith('+'):
        if len(main) == 10:
            main = '+1' + main
        else:
            main = '+' + main
    return main

if 'Contact' in provider_df.columns:
    provider_df['Contact'] = provider_df['Contact'].apply(normalize_contact)

# --- Name normalization ---
def split_and_normalize_names(name):
    if pd.isna(name):
        return [""]
    name = str(name).replace('"', '').replace("'", '')
    parts = re.split(r'(?i)(?:,|\s+and\s+|\s+&\s+)', name)
    parts = [re.sub(r'[-,]', ' ', n).strip() for n in parts]
    return [re.sub(r'\s+', ' ', n).strip() for n in parts if n.strip()]

if 'Name' in provider_df.columns:
    provider_df['Name_Split'] = provider_df['Name'].apply(split_and_normalize_names)
    provider_df = provider_df.explode('Name_Split').copy()
    provider_df['Name'] = provider_df['Name_Split']
    provider_df.drop(columns=['Name_Split'], inplace=True)

# --- Lowercase columns ---
provider_df.columns = provider_df.columns.str.lower()

# --- Save CSV ---
provider_df.to_csv('provider_data_cleaned.csv', index=False, encoding='utf-8')

# --- Validation ---
prov_dup = pd.read_csv('provider_data_cleaned.csv', encoding='utf-8')
print(f'\nProvider Duplicate: {prov_dup.duplicated().sum()}')
print(f'\nProvider Null:\n{prov_dup.isnull().sum()}')


Provider Duplicate: 1

Provider Null:
provider_id    0
name           0
type           0
address        0
city           0
contact        0
dtype: int64


In [67]:
receiver_df = pd.read_csv(filepath2)

receiver_null = receiver_df.isnull().sum()
receiver_dupli = receiver_df.duplicated().sum()

print(f'\nProvider Null: {receiver_null}')
print(f'\nProvider Duplicate: {receiver_dupli}')


Provider Null: Receiver_ID    0
Name           0
Type           0
City           0
Contact        0
dtype: int64

Provider Duplicate: 0


In [68]:
if 'City' in receiver_df.columns:
    receiver_df['City'] = receiver_df['City'].astype(str).replace(r'[\n,]', ' ', regex=True)

# 2. Normalize Contact
def normalize_contact(contact):
    if pd.isna(contact):
        return ""
    contact = str(contact).strip()
    contact = re.sub(r'(?i)[\s\-.,]*(ext|x)[\s\-.,]*\d+', '', contact)
    main = re.sub(r'[^\d+]', '', contact)
    if not main:
        return ""
    if main.startswith('00'):
        main = '+' + main[2:]
    if not main.startswith('+'):
        if len(main) == 10:
            main = '+1' + main
        else:
            main = '+' + main
    return main

if 'Contact' in receiver_df.columns:
    receiver_df['Contact'] = receiver_df['Contact'].apply(normalize_contact)

# 3. Normalize Name and split
def split_and_normalize_names(name):
    if pd.isna(name):
        return [""]
    name = str(name).replace('"', '').replace("'", '')
    parts = re.split(r'(?i)(?:,|\s+and\s+|\s+&\s+)', name)
    parts = [re.sub(r'[-,]', ' ', p).strip() for p in parts]
    return [re.sub(r'\s+', ' ', p).strip() for p in parts if p.strip()]

if 'Name' in receiver_df.columns:
    receiver_df['Name_Split'] = receiver_df['Name'].apply(split_and_normalize_names)
    receiver_df = receiver_df.explode('Name_Split').copy()
    receiver_df['Name'] = receiver_df['Name_Split']
    receiver_df.drop(columns=['Name_Split'], inplace=True)

# 4. Lowercase column names
receiver_df.columns = receiver_df.columns.str.lower()

# 5. Save cleaned data
receiver_df.to_csv('receiver_data_cleaned.csv', index=False, encoding='utf-8')

# 6. Validation
recev_dup = pd.read_csv('receiver_data_cleaned.csv', encoding='utf-8')
print(f'\nReceiver Duplicate: {recev_dup.duplicated().sum()}')
print(f'\nReceiver Null:\n{recev_dup.isnull().sum()}')


Receiver Duplicate: 0

Receiver Null:
receiver_id    0
name           0
type           0
city           0
contact        0
dtype: int64


In [26]:
food_df = pd.read_csv(filepath4)

food_null = food_df.isnull().sum()
food_dupli = food_df.duplicated().sum()

print(f'\nFood Null: {food_null}')
print(f'\nFood Duplicate: {food_dupli}')


Food Null: Food_ID          0
Food_Name        0
Quantity         0
Expiry_Date      0
Provider_ID      0
Provider_Type    0
Location         0
Food_Type        0
Meal_Type        0
dtype: int64

Food Duplicate: 0


In [27]:
# Make mapping keys lowercase for consistency
food_category_mapping = {
    'bread': 'Vegetarian',       # if dairy-free, else Vegetarian
    'vegetables': 'Vegan',
    'pasta': 'Vegetarian',       # if egg-free, else Vegetarian
    'soup': 'Vegetarian',        # if no dairy/broth, else Vegetarian
    'salad': 'Vegan',
    'rice': 'Vegan',
    'fruits': 'Vegan',
    'dairy': 'Vegetarian',
    'fish': 'Non Vegetarian',
    'chicken': 'Non Vegetarian'
}


# Clean 'Food_Type' – replace hyphens with spaces
if 'Food_Type' in food_df.columns:
    food_df['Food_Type'] = food_df['Food_Type'].astype(str).replace(r'[-]', ' ', regex=True)
    
# Clean 'Location' – remove newlines and commas
if 'Location' in food_df.columns:
    food_df['Location'] = food_df['Location'].astype(str).replace(r'[\n,]', ' ', regex=True)

# Clean 'Food_Name' – remove newlines and commas
if 'Food_Name' in food_df.columns:
    food_df['Food_Name'] = food_df['Food_Name'].astype(str).replace(r'[\n,]', ' ', regex=True)

# Clean and format 'Expiry_Date'
if 'Expiry_Date' in food_df.columns:
    food_df['Expiry_Date'] = pd.to_datetime(
        food_df['Expiry_Date'],
        dayfirst=True,
        errors='coerce'  # invalid dates become NaT
    ).dt.strftime('%d-%m-%Y')

# --- Auto-fill Food_Type using partial match from Food_Name ---
def map_food_type(name):
    if pd.isna(name):
        return None
    name = str(name).lower()
    for key, value in food_category_mapping.items():
        if key in name:  # partial match
            return value
    return None

if 'Food_Type' in food_df.columns and 'Food_Name' in food_df.columns:
    food_df['Food_Type'] = food_df['Food_Name'].apply(map_food_type)

# Lowercase all column names for DB compatibility
food_df.columns = food_df.columns.str.lower()

# Save cleaned data with UTF-8 encoding
food_df.to_csv('food_data_cleaned.csv', index=False, encoding='utf-8')

# Reload for validation
food_dup = pd.read_csv('food_data_cleaned.csv', encoding='utf-8')
print(f'\nFood Duplicate rows: {food_dup.duplicated().sum()}')
print(f'\nFood Null counts:\n{food_dup.isnull().sum()}')


Food Duplicate rows: 0

Food Null counts:
food_id          0
food_name        0
quantity         0
expiry_date      0
provider_id      0
provider_type    0
location         0
food_type        0
meal_type        0
dtype: int64


  food_df['Expiry_Date'] = pd.to_datetime(


In [82]:
claim_df = pd.read_csv(filepath3)

claim_null = claim_df.isnull().sum()
claim_dupli = claim_df.duplicated().sum()

print(f'\nClaim Null: {claim_null}')
print(f'\nClaim Duplicate: {claim_dupli}')


Claim Null: Claim_ID       0
Food_ID        0
Receiver_ID    0
Status         0
Timestamp      0
dtype: int64

Claim Duplicate: 0


In [89]:
# Clean and format 'Timestamp'
if 'Timestamp' in claim_df.columns:
    claim_df['Timestamp'] = pd.to_datetime(
        claim_df['Timestamp'],
        dayfirst=True,
        errors='coerce'  # invalid dates become NaT
    ).dt.strftime('%d-%m-%Y %H:%M')


claim_df.columns = claim_df.columns.str.lower()
claim_df.to_csv('claim_data_cleaned.csv', index=False)


claim_dup = pd.read_csv('claim_data_cleaned.csv')
print(f'\nClaim Duplicate: {claim_dup.duplicated().sum()}')
print(f'\nClaim Null: {claim_dup.isnull().sum()}')



Claim Duplicate: 0

Claim Null: claim_id       0
food_id        0
receiver_id    0
status         0
timestamp      0
dtype: int64
