In [None]:
# import pandas as pd
# from google.colab import drive

# drive.mount('/content/drive')
# base = "/content/drive/MyDrive/FIT5196Assignment2/" # for colab

In [1]:
# begin here if running locally

# Path prefix prepended to every CSV filename passed to pd.read_csv in this
# notebook; the empty string makes the filenames resolve relative to the
# current working directory (local run).
base = ""

In [2]:

import pandas as pd
# Load warehouses.csv from the location given by `base`.
warehouses_csv = base + 'warehouses.csv'
warehouses = pd.read_csv(warehouses_csv)

In [None]:


# Read the dirty dataset from `base` and print it for inspection.
dirty_csv = base + 'Group_035_dirty_data.csv'
dirty_data = pd.read_csv(dirty_csv)
print(dirty_data)

In [3]:

# Read the dataset with missing values and print the NaN count of each column.
missing_csv = base + 'Group_035_missing_data.csv'
missing_data = pd.read_csv(missing_csv)
missing_per_column = missing_data.isna().sum()
print(missing_per_column)



order_id                          0
customer_id                       0
date                              0
nearest_warehouse                55
shopping_cart                     0
order_price                      15
delivery_charges                 40
customer_lat                      0
customer_long                     0
coupon_discount                   0
order_total                      15
season                            0
is_expedited_delivery             0
distance_to_nearest_warehouse    31
latest_customer_review            0
is_happy_customer                40
dtype: int64


In [4]:
import ast
import numpy as np

# Turn a shopping_cart cell into the Python literal it spells out
def parse_cart(cart_str):
    """Evaluate a shopping_cart cell with ``ast.literal_eval``.

    Returns the evaluated literal, or an empty list when the cell is
    missing (NaN/None) or cannot be evaluated.
    """
    if not pd.isna(cart_str):
        try:
            parsed = ast.literal_eval(cart_str)
        except Exception:
            # Best effort: an unparseable cart is treated as empty.
            parsed = []
        return parsed
    return []

# Attach the parsed cart to every order as a new column
missing_data['parsed_cart'] = missing_data['shopping_cart'].apply(parse_cart)

# Gather the distinct item names and give each one a column position
distinct_items = set()
for cart in missing_data['parsed_cart']:
    distinct_items.update(name for name, _ in cart)
all_items = sorted(distinct_items)
item_index = dict(zip(all_items, range(len(all_items))))

# Assemble A x = b: one equation per order that has a known order_price and a
# non-empty cart; A holds the quantity of each item, b the order_price
rows, b = [], []
for cart, price in zip(missing_data['parsed_cart'], missing_data['order_price']):
    if pd.isna(price) or not cart:
        continue
    quantities = np.zeros(len(all_items))
    for name, qty in cart:
        quantities[item_index[name]] += qty
    rows.append(quantities)
    b.append(price)

A = np.vstack(rows)
b = np.array(b)

# The least-squares solution holds one unit price per item, in all_items order
x = np.linalg.lstsq(A, b, rcond=None)[0]

# Catalog prices are kept to 2 decimals
x = x.round(2)

# Map each item name to its unit price
catalog = dict(zip(all_items, x))


In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# --- Haversine helper ---
def haversine(lat1, lon1, lat2, lon2, radius=6378):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius : float, default 6378
        Radius of the sphere. The result is expressed in the same unit
        (the default is the Earth radius in KM used by this notebook).

    Returns
    -------
    float or array-like
        ``radius`` times the central angle between the points; inputs are
        combined element-wise by numpy.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

# --- Prepare Sentiment analyzer ---
# SentimentIntensityAnalyzer loads the 'vader_lexicon' NLTK resource and raises
# LookupError when it has not been downloaded yet (e.g. a fresh environment).
# Fetch it on demand so the notebook survives Restart & Run All.
import nltk

try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon', quiet=True)
    sia = SentimentIntensityAnalyzer()

# --- Imputation logic ---
def impute_row(row, warehouses, catalog):
    """Return a copy of one order row with its missing fields imputed.

    A field is only filled when the values it is derived from are present;
    otherwise it is left as NaN so incomplete rows remain visible.

    Parameters
    ----------
    row : pandas.Series
        One order. Reads and may fill: nearest_warehouse, order_price,
        delivery_charges, order_total, distance_to_nearest_warehouse,
        is_happy_customer. Also reads customer_lat, customer_long,
        coupon_discount, latest_customer_review and parsed_cart.
    warehouses : pandas.DataFrame
        Must provide the columns 'names', 'lat' and 'lon'.
    catalog : dict
        Maps item name to unit price; used to price parsed_cart when both
        order_price and order_total are missing.

    Returns
    -------
    pandas.Series
        The imputed copy; the row passed in is not modified.
    """
    # DataFrame.apply does not support functions that mutate the object they
    # are given, so all writes go to a private copy.
    row = row.copy()

    has_location = pd.notna(row['customer_lat']) and pd.notna(row['customer_long'])

    # --- Impute nearest_warehouse ---
    if pd.isna(row['nearest_warehouse']) and has_location:
        distances = warehouses.apply(
            lambda wh: haversine(row['customer_lat'], row['customer_long'], wh['lat'], wh['lon']), axis=1
        )
        nearest_idx = distances.idxmin()
        row['nearest_warehouse'] = warehouses.loc[nearest_idx, 'names']
        # The distance is written together with the warehouse it belongs to,
        # replacing any value already stored for this row.
        row['distance_to_nearest_warehouse'] = distances.min()

    # --- Impute order_price ---
    if pd.isna(row['order_price']):
        if pd.notna(row['order_total']) and pd.notna(row['delivery_charges']) and pd.notna(row['coupon_discount']):
            denom = (100 - row['coupon_discount']) / 100
            # A 100% coupon makes the price unrecoverable from the total.
            if denom != 0:
                row['order_price'] = (row['order_total'] - row['delivery_charges']) / denom
        elif pd.isna(row['order_total']) and row['parsed_cart']:  # fallback to catalog
            cart = row['parsed_cart']
            # Price the cart only when every item is in the catalog: treating
            # an unknown item as price 0 would fabricate a too-low order_price.
            if all(item in catalog for item, _ in cart):
                row['order_price'] = sum(catalog[item] * qty for item, qty in cart)

    # --- Impute delivery_charges ---
    if pd.isna(row['delivery_charges']):
        if pd.notna(row['order_total']) and pd.notna(row['order_price']) and pd.notna(row['coupon_discount']):
            denom = (100 - row['coupon_discount']) / 100
            row['delivery_charges'] = row['order_total'] - row['order_price'] * denom

    # --- Impute order_total ---
    if pd.isna(row['order_total']):
        if pd.notna(row['order_price']) and pd.notna(row['coupon_discount']) and pd.notna(row['delivery_charges']):
            denom = (100 - row['coupon_discount']) / 100
            row['order_total'] = row['order_price'] * denom + row['delivery_charges']

    # --- Impute distance_to_nearest_warehouse ---
    if pd.isna(row['distance_to_nearest_warehouse']):
        if pd.notna(row['nearest_warehouse']) and has_location:
            matches = warehouses.loc[warehouses['names'] == row['nearest_warehouse']]
            # A name that matches no warehouse leaves the distance missing
            # instead of raising IndexError on .iloc[0].
            if not matches.empty:
                wh = matches.iloc[0]
                row['distance_to_nearest_warehouse'] = haversine(
                    row['customer_lat'], row['customer_long'], wh['lat'], wh['lon']
                )

    # --- Impute is_happy_customer from sentiment---
    if pd.isna(row['is_happy_customer']) and pd.notna(row['latest_customer_review']):
        sentiment = sia.polarity_scores(str(row['latest_customer_review']))
        # A compound score of 0.05 or higher counts as a happy customer.
        row['is_happy_customer'] = sentiment['compound'] >= 0.05

    return row

# Run the row-wise imputation over every order; warehouses and catalog are
# forwarded to impute_row as its extra positional arguments
missing_data_imputed = missing_data.apply(impute_row, axis=1, args=(warehouses, catalog))


In [None]:

# Per-column count of values that are still missing after imputation
remaining_nulls = missing_data_imputed.isna().sum()
print(remaining_nulls)

# Rows that still contain at least one missing value
incomplete_mask = missing_data_imputed.isnull().any(axis=1)
rows_with_nulls = missing_data_imputed[incomplete_mask]
print(rows_with_nulls)

# Unit prices recovered from the shopping carts
print(catalog)

order_id                         0
customer_id                      0
date                             0
nearest_warehouse                0
shopping_cart                    0
order_price                      0
delivery_charges                 0
customer_lat                     0
customer_long                    0
coupon_discount                  0
order_total                      0
season                           0
is_expedited_delivery            0
distance_to_nearest_warehouse    0
latest_customer_review           0
is_happy_customer                0
parsed_cart                      0
dtype: int64
{'Alcon 10': np.float64(8950.0), 'Candle Inferno': np.float64(430.0), 'Lucent 330S': np.float64(1230.0), 'Olivia x460': np.float64(1225.0), 'Thunder line': np.float64(2180.0), 'Toshika 750': np.float64(4320.0), 'Universe Note': np.float64(3450.0), 'iAssist Line': np.float64(2225.0), 'iStream': np.float64(150.0), 'pearTV': np.float64(6310.0)}


I have two pandas datasets, warehouses and missing_data, with the following columns:
1. warehouses has the following columns:
names,lat,lon

2. missing_data has the following columns:
order_id,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer

I want to perform data imputation. The columns which have missing entries, and the imputing strategy for each column, are:
nearest_warehouse: compute the Haversine distance (with radius of earth = 6378 KM) between (customer_lat, customer_long) and each (lat, lon) pair in warehouses, and impute with the "names" value of the warehouse that has the lowest Haversine distance.
order_price: (order_total - delivery_charges)/((100-coupon_discount)/100)
delivery_charges: order_total - order_price*((100-coupon_discount)/100)
order_total: order_price*((100-coupon_discount)/100) + delivery_charges
distance_to_nearest_warehouse: Haversine Distance (with radius of earth = 6378 KM) of customer_lat, customer_long vs the lat, lon of nearest_warehouse as found in warehouses
is_happy_customer: check latest_customer_review with SentimentIntensityAnalyzer from nltk.sentiment.vader to obtain the polarity score. A sentiment is considered positive if it has a 'compound' polarity score of 0.05 or higher and is considered negative otherwise.

If any row has too many missing columns for any of the above imputations to be carried out, do not impute it, so that I can later see what is incomplete.


shopping_cart, a column in missing_data, is a string representation of a list of tuples of (Item_ordered, Quantity), for example: "[('Item 330S', 1), ('sampleItem', 2)]".

There are 10 categories of items. Using the column shopping_cart, first identify the 10 categories of items, then use numpy.linalg to determine the unit price of each item.

Store these unit prices in a dictionary named catalog.

In the case that both order_price and order_total are missing, order_price should be imputed from shopping_cart: multiply the quantity of each item ordered in shopping_cart by its unit price from the catalog calculated earlier, and sum the results. order_total can then be imputed as normal.




In [None]:

# Read the outlier dataset from `base` and print it for inspection.
outlier_csv = base + 'Group_035_outlier_data.csv'
outlier_data = pd.read_csv(outlier_csv)
print(outlier_data)
