# FIT5196 Assessment 2


## Task 1. Data Cleansing

### 1.1 Import Data

In [1]:
# from google.colab import drive

# drive.mount('/content/drive')
# base = "/content/drive/MyDrive/FIT5196Assignment2/" # for colab

In [2]:
# Begin here if running locally

# for local drive
base = ""

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import ast
from math import radians, cos, sin, asin, sqrt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

# --- Prepare Sentiment analyzer ---
sia = SentimentIntensityAnalyzer()

# Warehouse data
warehouse_data = pd.read_csv(base + 'warehouses.csv')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Dirty data
dirty_data = pd.read_csv(base + 'Group_035_dirty_data.csv')

In [5]:
# Missing data
missing_data = pd.read_csv(base + 'Group_035_missing_data.csv')

In [6]:
# Outlier data
outlier_data = pd.read_csv(base + 'Group_035_outlier_data.csv')

In [7]:
# --- Haversine helper ---
def haversine_dist(lat1, lon1, lat2, lon2):
    """
    Calculate the Haversine distance between two points on the Earth's surface.
    """
    R = 6378  # Earth radius in KM
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c


### 1.2 Dirty Data

#### 1.2.1 Dirty Data EDA

In [8]:
# Check all the data type
print(dirty_data.dtypes)

order_id                          object
customer_id                       object
date                              object
nearest_warehouse                 object
shopping_cart                     object
order_price                        int64
delivery_charges                 float64
customer_lat                     float64
customer_long                    float64
coupon_discount                    int64
order_total                      float64
season                            object
is_expedited_delivery               bool
distance_to_nearest_warehouse    float64
latest_customer_review            object
is_happy_customer                   bool
dtype: object


In [9]:
# Summary of dirty data
dirty_data.describe(include = "all")

Unnamed: 0,order_id,customer_id,date,nearest_warehouse,shopping_cart,order_price,delivery_charges,customer_lat,customer_long,coupon_discount,order_total,season,is_expedited_delivery,distance_to_nearest_warehouse,latest_customer_review,is_happy_customer
count,500,500,500,500,500,500.0,500.0,500.0,500.0,500.0,500.0,500,500,500.0,499,500
unique,500,493,304,6,460,,,,,,,8,2,,499,2
top,ORD089659,ID0443530293,2019-05-25,Thompson,"[('Lucent 330S', 1), ('iAssist Line', 1)]",,,,,,,Summer,True,,great value hard to beat for the price.,True
freq,1,2,6,201,4,,,,,,,128,258,,1,372
mean,,,,,,13864.58,78.03112,-27.942558,135.095643,10.74,12598.74034,,,1.06467,,
std,,,,,,7781.162606,14.400599,41.353147,41.351948,8.449567,6961.477582,,,0.492255,,
min,,,,,,1375.0,46.4,-37.831769,-37.827219,0.0,1450.55,,,0.0319,,
25%,,,,,,7900.0,66.695,-37.81862,144.94883,5.0,7405.7,,,0.72735,,
50%,,,,,,12260.0,76.82,-37.812261,144.962138,10.0,11111.125,,,1.02575,,
75%,,,,,,19260.0,85.8025,-37.805632,144.978927,15.0,16934.4225,,,1.3592,,


#### 1.2.1.1 Individual Column EDA

In [10]:
# date
dirty_data["date"] = pd.to_datetime(dirty_data["date"], errors="coerce")

# Invalid date
print(f"Invalid date: {dirty_data['date'].isna().sum()}")
invalid_date = dirty_data[dirty_data["date"].isna()]
print(invalid_date[["order_id", "customer_id", "date", "season"]].to_string(index=False))

# Date distribution
valid_dates = dirty_data["date"].dropna()
print(f"Min date: {valid_dates.min()}")
print(f"Max date: {valid_dates.max()}")
print(f"Number of valid dates: {valid_dates.count()}")
print(f"Number of unique dates: {valid_dates.nunique()}")

Invalid date: 27
 order_id  customer_id date season
ORD164387 ID0289597227  NaT Summer
ORD066446 ID0145235264  NaT Winter
ORD312565 ID0638050574  NaT Summer
ORD181051 ID0709970691  NaT Summer
ORD046408 ID5402876538  NaT Spring
ORD219265 ID0055722470  NaT Winter
ORD006455 ID0634777174  NaT Spring
ORD084861 ID3094966833  NaT Winter
ORD438655 ID2705184152  NaT Summer
ORD234563 ID2383211199  NaT Spring
ORD199817 ID0650275823  NaT Summer
ORD113549 ID0634780047  NaT Autumn
ORD489756 ID0846548135  NaT Spring
ORD273300 ID0052599838  NaT Spring
ORD194653 ID0575428932  NaT Summer
ORD480775 ID4655129040  NaT Spring
ORD160619 ID0746917821  NaT Winter
ORD402436 ID2621587173  NaT Autumn
ORD311888 ID1463620717  NaT Autumn
ORD491911 ID2237521759  NaT Winter
ORD265708 ID1889073821  NaT Summer
ORD060082 ID0576834725  NaT Summer
ORD469475 ID0767665017  NaT Summer
ORD461231 ID0126934555  NaT Winter
ORD047863 ID0638044384  NaT Autumn
ORD499923 ID6197211200  NaT Spring
ORD036565 ID0616939377  NaT Spring
Min

In [11]:
# nearest_warehouse
# Invalid nearest_warehouse names
invalid_warehouse_name = dirty_data[dirty_data["nearest_warehouse"].isna()]
print("Invalid season found:", len(invalid_warehouse_name))

# Unique values
print(dirty_data["nearest_warehouse"].unique())

# Invalid nearest_warehouse value
invalid_warehouses = dirty_data[~dirty_data["nearest_warehouse"].isin(warehouse_data["names"])]
print("Number of invalid nearest warehouse names:", len(invalid_warehouses))
print(dirty_data["nearest_warehouse"].value_counts(dropna=False))
print(invalid_warehouses[["order_id", "nearest_warehouse"]].to_string(index=False))

Invalid season found: 0
['Thompson' 'Nickolson' 'Bakers' 'thompson' 'nickolson' 'bakers']
Number of invalid nearest warehouse names: 20
nearest_warehouse
Thompson     201
Nickolson    180
Bakers        99
thompson       9
nickolson      7
bakers         4
Name: count, dtype: int64
 order_id nearest_warehouse
ORD256861          thompson
ORD014442          thompson
ORD052629         nickolson
ORD166717          thompson
ORD461915         nickolson
ORD461426            bakers
ORD015774         nickolson
ORD169718         nickolson
ORD120949            bakers
ORD138742            bakers
ORD025929            bakers
ORD393258          thompson
ORD201338         nickolson
ORD224300         nickolson
ORD016571         nickolson
ORD471229          thompson
ORD164906          thompson
ORD118183          thompson
ORD300512          thompson
ORD099672          thompson


In [12]:
# shopping_cart (item ordered)
# Extract the necessary columns
shopping_cart_check = dirty_data[["order_id", "shopping_cart", "order_total"]].copy()

# Parses the shopping_cart value to get the item ordered
shopping_cart_check["shopping_cart_parsed"] = shopping_cart_check["shopping_cart"].apply(ast.literal_eval)
branded_items = [item for cart in shopping_cart_check["shopping_cart_parsed"] for (item, qty) in cart]

# Invalid value
invalid_cart = shopping_cart_check[shopping_cart_check["shopping_cart"].isna() | shopping_cart_check["shopping_cart"].eq("[]")]
print("Number of invalid values:", len(invalid_cart))
invalid_items = [item for item in branded_items if pd.isna(item) or item is None]
print("Number of invalid items:", len(invalid_items))
invalid_duplicates = shopping_cart_check[shopping_cart_check["shopping_cart_parsed"].apply(lambda cart: len({item for item, qty in cart}) != len(cart))]
print("Number of duplicated items:", len(invalid_duplicates))

# Count and value of unique branded items
unique_branded_items = pd.Series(branded_items).unique()
print(f"Number of unique branded items:", len(unique_branded_items))
print(f"Unique branded items:", unique_branded_items)

# Frequency distribution
print(pd.Series(branded_items).value_counts(dropna=False))

Number of invalid values: 0
Number of invalid items: 0
Number of duplicated items: 0
Number of unique branded items: 10
Unique branded items: ['iAssist Line' 'Alcon 10' 'Universe Note' 'pearTV' 'Toshika 750'
 'Candle Inferno' 'Thunder line' 'iStream' 'Olivia x460' 'Lucent 330S']
iAssist Line      169
Toshika 750       163
Lucent 330S       154
Alcon 10          153
Thunder line      151
pearTV            147
Candle Inferno    147
Olivia x460       146
iStream           141
Universe Note     134
Name: count, dtype: int64


In [13]:
# Longitude: customer_lat, customer_long
invalid_coords = dirty_data[dirty_data["customer_lat"].isna() |
                            dirty_data["customer_long"].isna() |
                            ~dirty_data["customer_lat"].between(-90, 90) |
                            ~dirty_data["customer_long"].between(-180, 180)]
print("Invalid coordinates found:", len(invalid_coords))
print(invalid_coords[["order_id", "customer_id", "customer_lat", "customer_long"]].to_string(index=False))

Invalid coordinates found: 27
 order_id  customer_id  customer_lat  customer_long
ORD091929 ID0145235237    144.959364     -37.815878
ORD299508 ID0260907252    145.009445     -37.823816
ORD074143 ID6167247310    144.960234     -37.819701
ORD392203 ID0588197234    144.973944     -37.812101
ORD208957 ID6245731092    144.977354     -37.810785
ORD090831 ID1519470918    144.993262     -37.797624
ORD062280 ID6167489480    144.961790     -37.816990
ORD155978 ID0452381032    144.949411     -37.824991
ORD493957 ID0361227457    144.976899     -37.801182
ORD083244 ID0577458190    144.977813     -37.818479
ORD373348 ID1888340704    144.985178     -37.793879
ORD055195 ID2399230968    144.983469     -37.806415
ORD125480 ID2190483590    144.961303     -37.810368
ORD285476 ID2141904233    144.935968     -37.802954
ORD349254 ID0387153047    144.947587     -37.805420
ORD048052 ID0207093528    145.004517     -37.801171
ORD479919 ID1449297346    144.977507     -37.815613
ORD326763 ID0581709069    144.9282

In [14]:
# distance_to_nearest_warehouse
invalid_distance = dirty_data[dirty_data["distance_to_nearest_warehouse"].isna() |
                              dirty_data["distance_to_nearest_warehouse"] <= 0]
print("Invalid distance found:", len(invalid_distance))

Invalid distance found: 0


In [15]:
# order_total
invalid_total = dirty_data[dirty_data["order_total"].isna() |
                           dirty_data["order_total"] <= 0 |
                           (dirty_data["order_total"].apply(lambda x: len(str(x).split('.')[-1]) > 2 if '.' in str(x) else False))]
print("Invalid order total found:", len(invalid_total))

Invalid order total found: 0


In [16]:
# season
# Invalid season names
invalid_season = dirty_data[dirty_data["season"].isna()]
print("Invalid season found:", len(invalid_season))

# Unique values
print(dirty_data["season"].unique())

# Invalid letter case
invalid_case_season = dirty_data[~dirty_data["season"].str.match(r"^[A-Z][a-z]+$", na=False)]
print("Number of seasons not capitalised correctly:", len(invalid_case_season))
print(dirty_data["season"].value_counts(dropna=False))
print(invalid_case_season[["order_id", "season"]].to_string(index=False))

Invalid season found: 0
['Winter' 'Summer' 'spring' 'Autumn' 'Spring' 'autumn' 'winter' 'summer']
Number of seasons not capitalised correctly: 21
season
Summer    128
Winter    124
Spring    120
Autumn    107
spring      9
summer      6
autumn      3
winter      3
Name: count, dtype: int64
 order_id season
ORD209240 spring
ORD277275 spring
ORD123382 autumn
ORD209230 winter
ORD132264 spring
ORD192930 spring
ORD172628 spring
ORD103002 summer
ORD344249 summer
ORD222038 summer
ORD026633 summer
ORD455600 summer
ORD468030 spring
ORD478782 winter
ORD345384 summer
ORD493849 spring
ORD492808 winter
ORD385926 spring
ORD462038 autumn
ORD434426 spring
ORD387883 autumn


In [17]:
# is_expedited_delivery
invalid_expedited = dirty_data[dirty_data["is_expedited_delivery"].isna()]
print("Invalid expedited delivery found:", len(invalid_expedited))
print(dirty_data["is_expedited_delivery"].value_counts(dropna=False))

Invalid expedited delivery found: 0
is_expedited_delivery
True     258
False    242
Name: count, dtype: int64


In [18]:
# is_happy_customer
invalid_happy = dirty_data[dirty_data["is_happy_customer"].isna()]
print("Invalid positive customer response found:", len(invalid_happy))
print(dirty_data["is_happy_customer"].value_counts(dropna=False))

Invalid positive customer response found: 0
is_happy_customer
True     372
False    128
Name: count, dtype: int64


#### 1.2.1.2 Cross-Column EDA

In [19]:
# Check date and season pairing
# Extract the necessary columns and exclude NA values
date_season = dirty_data[dirty_data["date"].notna()][["order_id", "date", "season"]].copy()

# Standardise season name
date_season["season_clean"] = date_season["season"].str.strip().str.lower()

# Get the months from date column
date_season["month"] = date_season["date"].dt.month

# Mapping of months to seasons
season_months = {"summer": [12, 1, 2],
                 "autumn": [3, 4, 5],
                 "winter": [6, 7, 8],
                 "spring": [9, 10, 11]}

# Valid season check
def valid_season (row):
  season = row["season_clean"]
  month = row["month"]
  if season in season_months:
    return month in season_months[season]
  else:
    return False

# Apply valid season check to the months and season
date_season["season_match"] = date_season.apply(valid_season, axis=1)

invalid_date_season = date_season[~date_season["season_match"]]
invalid_case = invalid_date_season[~invalid_date_season["season"].str.match(r"^[A-Z][a-z]+$", na=False)]

print("Number of mismatched date-season pairs:", len(invalid_date_season))
print("Number of seasons not capitalised correctly in date-season pairs:", len(invalid_case))
print(invalid_date_season[["order_id", "date", "month", "season"]].to_string(index=False))

Number of mismatched date-season pairs: 21
Number of seasons not capitalised correctly in date-season pairs: 15
 order_id       date  month season
ORD209240 2019-02-07      2 spring
ORD419503 2019-09-21      9 Autumn
ORD277275 2019-12-26     12 spring
ORD123382 2019-01-06      1 autumn
ORD209230 2019-09-03      9 winter
ORD192930 2019-08-30      8 spring
ORD172628 2019-03-30      3 spring
ORD103002 2019-10-17     10 summer
ORD040501 2019-10-17     10 Summer
ORD344249 2019-10-23     10 summer
ORD222038 2019-11-04     11 summer
ORD377228 2019-09-24      9 Autumn
ORD468030 2019-05-06      5 spring
ORD363647 2019-12-01     12 Winter
ORD345384 2019-11-25     11 summer
ORD127439 2019-07-25      7 Spring
ORD493849 2019-04-24      4 spring
ORD495324 2019-07-10      7 Summer
ORD462038 2019-07-10      7 autumn
ORD434426 2019-06-14      6 spring
ORD387883 2019-10-20     10 autumn


In [20]:
# Check if distance_to_nearest_warehouse is correct using Haversine distance
# Extract the necessary columns
warehouse_dist = dirty_data[["order_id", "nearest_warehouse", "distance_to_nearest_warehouse", "customer_lat", "customer_long"]].copy()

# Standardise warehouses name
warehouse_dist["nearest_warehouse_clean"] = warehouse_dist["nearest_warehouse"].str.strip().str.title()

# Swap the invalid customer_lat and customer_long
invalid_coords_tmp = warehouse_dist["customer_lat"].between(-180, 180) & warehouse_dist["customer_long"].between(-90, 90)
warehouse_dist.loc[invalid_coords_tmp, ["customer_lat", "customer_long"]] = warehouse_dist.loc[invalid_coords_tmp, ["customer_long", "customer_lat"]].values

# Check warehouse_data data types
print(warehouse_data.dtypes)

# Merge both datasets
merge_warehouse_dist = warehouse_dist.merge(warehouse_data,
                                            left_on="nearest_warehouse_clean",
                                            right_on="names",
                                            how="left")

# Apply Haversine distance function to the coordinates
merge_warehouse_dist["haversine_dist"] = merge_warehouse_dist.apply(lambda row:
                                                                    haversine_dist(row["customer_lat"],
                                                                             row["customer_long"],
                                                                             row["lat"],
                                                                             row["lon"]),
                                                                    axis=1)

# Compare the distances
merge_warehouse_dist["dist_diff"] = (merge_warehouse_dist["haversine_dist"] - merge_warehouse_dist["distance_to_nearest_warehouse"]).abs()

invalid_distance = merge_warehouse_dist[merge_warehouse_dist["dist_diff"] > 0.1]
print("Number of mismatched distances:", len(invalid_distance))
print(invalid_distance[["order_id", "nearest_warehouse_clean", "customer_lat", "customer_long", "lat", "lon", "distance_to_nearest_warehouse", "haversine_dist", "dist_diff"]].to_string(index=False))

names     object
lat      float64
lon      float64
dtype: object
Number of mismatched distances: 43
 order_id nearest_warehouse_clean  customer_lat  customer_long        lat        lon  distance_to_nearest_warehouse  haversine_dist  dist_diff
ORD418280               Nickolson    -37.822991     144.976257 -37.818595 144.969551                         1.9037        0.766301   1.137399
ORD237879               Nickolson    -37.800113     144.935310 -37.818595 144.969551                         1.7391        3.647097   1.907997
ORD020055                  Bakers    -37.817347     145.008734 -37.809996 144.995232                         0.6764        1.442062   0.765662
ORD483341                Thompson    -37.820155     144.952118 -37.812673 144.947069                         0.7995        0.943856   0.144356
ORD106152               Nickolson    -37.820522     144.982869 -37.818595 144.969551                         0.7663        1.190577   0.424277
ORD052629               Nickolson    -37.8

In [21]:
# Check if nearest_warehouse is correct using Haversine distance
# Calculate the distance to all three warehouses
for _, row in warehouse_data.iterrows():
  wh = row["names"]
  lat = row["lat"]
  lon = row["lon"]

  merge_warehouse_dist[f"haversine_dist_{wh}"] = merge_warehouse_dist.apply(lambda row:
                                                                    haversine_dist(row["customer_lat"],
                                                                             row["customer_long"],
                                                                             lat,
                                                                             lon),
                                                                  axis=1)

# Get the nearest warehouse name
distance_cols = [f"haversine_dist_{wh}" for wh in warehouse_data["names"]]
merge_warehouse_dist[distance_cols] = merge_warehouse_dist[distance_cols].apply(lambda x: np.around(x, 4))
merge_warehouse_dist["actual_nearest_distance"] = merge_warehouse_dist[distance_cols].min(axis=1)
merge_warehouse_dist["actual_nearest_warehouse"] = (merge_warehouse_dist[distance_cols].idxmin(axis=1).str.replace("haversine_dist_", ""))

# Check if the nearest_warehouse is correct
merge_warehouse_dist["is_correct"] = (merge_warehouse_dist["nearest_warehouse_clean"] == merge_warehouse_dist["actual_nearest_warehouse"])

invalid_nearest_warehouse = merge_warehouse_dist[~merge_warehouse_dist["is_correct"]]
print("Number of invalid nearest warehouse:", len(invalid_nearest_warehouse))
print(invalid_nearest_warehouse[["order_id", "nearest_warehouse_clean", "distance_to_nearest_warehouse", "actual_nearest_warehouse", "actual_nearest_distance", "haversine_dist_Thompson", "haversine_dist_Nickolson", "haversine_dist_Bakers"]].to_string(index=False))

# Check if distance_to_nearest_warehouse is exactly the same as actual_nearest_distance
distance_check = invalid_nearest_warehouse["distance_to_nearest_warehouse"] == invalid_nearest_warehouse["actual_nearest_distance"]
print("Number of incorrect nearest warehouse distance:", len(distance_check[~distance_check]))

Number of invalid nearest warehouse: 20
 order_id nearest_warehouse_clean  distance_to_nearest_warehouse actual_nearest_warehouse  actual_nearest_distance  haversine_dist_Thompson  haversine_dist_Nickolson  haversine_dist_Bakers
ORD237879               Nickolson                         1.7391                 Thompson                   1.7391                   1.7391                    3.6471                 5.3839
ORD052629               Nickolson                         1.3297                   Bakers                   1.3297                   4.6427                    2.5795                 1.3297
ORD461915               Nickolson                         0.2395                 Thompson                   0.2395                   0.2395                    1.8613                 4.0736
ORD069936                Thompson                         1.3319                Nickolson                   1.3319                   1.7608                    1.3319                 2.6129
ORD461426      

In [22]:
# Check the overlapping order_id
setA = set(invalid_warehouses["order_id"]) # inconsistent nearest_warehouse naming
setB = set(invalid_nearest_warehouse["order_id"]) # incorrect nearest_warehouse
setC = set(invalid_distance["order_id"]) # incorrect distance_to_nearest_warehouse

# Number of invalid data in A, B, and C
print(f"|A|={len(setA)}  |B|={len(setB)}  |C|={len(setC)}")

# Exclusive
onlyA = setA - (setB | setC)
onlyB = setB - (setA | setC)
onlyC = setC - (setA | setB)

print("\nExclusive to A:", len(onlyA))
print(sorted(list(onlyA)))
print("\nExclusive to B:", len(onlyB))
print(sorted(list(onlyB)))
print("\nExclusive to C:", len(onlyC))
print(sorted(list(onlyC)))

# Overlapping
overlap_AB = (setA & setB) - setC
overlap_AC = (setA & setC) - setB
overlap_BC = (setB & setC) - setA
overlap_ABC = setA & setB & setC
print("\nOverlap A∩B only:", len(overlap_AB))
print(sorted(list(overlap_AB)))
print("\nOverlap A∩C only:", len(overlap_AC))
print(sorted(list(overlap_AC)))
print("\nOverlap B∩C only:", len(overlap_BC))
print(sorted(list(overlap_BC)))
print("\nOverlap A∩B∩C:", len(overlap_ABC))
print(sorted(list(overlap_ABC)))

|A|=20  |B|=20  |C|=43

Exclusive to A: 7
['ORD014442', 'ORD016571', 'ORD099672', 'ORD166717', 'ORD169718', 'ORD256861', 'ORD393258']

Exclusive to B: 0
[]

Exclusive to C: 23
['ORD005273', 'ORD020055', 'ORD041333', 'ORD048269', 'ORD049973', 'ORD106152', 'ORD135624', 'ORD175303', 'ORD177750', 'ORD205213', 'ORD223207', 'ORD227802', 'ORD307211', 'ORD321986', 'ORD378423', 'ORD385052', 'ORD393540', 'ORD416698', 'ORD418280', 'ORD440634', 'ORD452867', 'ORD471922', 'ORD483341']

Overlap A∩B only: 0
[]

Overlap A∩C only: 0
[]

Overlap B∩C only: 7
['ORD069936', 'ORD081635', 'ORD216010', 'ORD237879', 'ORD334316', 'ORD426908', 'ORD436885']

Overlap A∩B∩C: 13
['ORD015774', 'ORD025929', 'ORD052629', 'ORD118183', 'ORD120949', 'ORD138742', 'ORD164906', 'ORD201338', 'ORD224300', 'ORD300512', 'ORD461426', 'ORD461915', 'ORD471229']


In [23]:
# Extract the necessary columns and replace missing values with empty string
is_happy_check = dirty_data[["order_id", "is_happy_customer", "latest_customer_review"]].copy()
is_happy_check["latest_customer_review"] = is_happy_check["latest_customer_review"].fillna("").astype(str)

# Apply sentiment analysis to each row of latest_review_customer and return the compound_score but None if it is an empty string
is_happy_check["compound_score"] = is_happy_check["latest_customer_review"].apply(lambda row: sia.polarity_scores(row)["compound"]
                                                                                  if row.strip() != ""
                                                                                  else None)

# Create a column to store True when compound_score is more than 0.05 or is an empty string
is_happy_check["happy_prediction"] = is_happy_check.apply(lambda row: True
                                                          if row["latest_customer_review"].strip() == ""
                                                          else row["compound_score"] >= 0.05,
                                                          axis=1)

# Check if is_happy_customer aligns with latest_review_customer
is_happy_check["is_correct"] = is_happy_check["happy_prediction"] == is_happy_check["is_happy_customer"]

invalid_sentiment = is_happy_check[~is_happy_check["is_correct"]]
print("Number of invalid sentiment labels", len(invalid_sentiment))
print(invalid_sentiment[["order_id", "latest_customer_review", "is_happy_customer", "compound_score", "happy_prediction", "is_correct"]].to_string(index=False))

Number of invalid sentiment labels 27
 order_id                                                                                                                                                                                                                                                      latest_customer_review  is_happy_customer  compound_score  happy_prediction  is_correct
ORD216249                                                                                                          battery runs low fast the phone works fine, although the battery runs out fast and you have to charge it a lot .i like the phone but wish the battery didn't suck.              False          0.8052              True       False
ORD412492                                                                                                                                                                                                                                      nice i don't see any problems with it

#### 1.2.1.3 Dirty Data Fix

In [24]:
# Create a duplicate of dirty_data to clean
clean_data = dirty_data.copy()

# Create fix log
fix_log = pd.DataFrame(columns=["order_id", "error_type"])

In [25]:
# Fix season and casing based on date
# Filter out the unfixed data
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])]

unfixed_data["date"] = pd.to_datetime(unfixed_data["date"], errors="coerce")
unfixed_data["month"]= unfixed_data["date"].dt.month

season_map = {12: "Summer", 1: "Summer", 2: "Summer",
              3: "Autumn", 4: "Autumn", 5: "Autumn",
              6: "Winter", 7: "Winter", 8: "Winter",
              9: "Spring", 10: "Spring", 11: "Spring"}

unfixed_data["season_clean"] = unfixed_data["month"].map(season_map)

unfixed_data["season_clean"] = np.where(
    unfixed_data["month"].isna(),
    unfixed_data["season"],
    unfixed_data["season_clean"]
)

season_mismatch = (unfixed_data["month"].notna() &
                   (unfixed_data["season"] != unfixed_data["season_clean"]))
print("Number of mismatched seasons:", season_mismatch.sum())

clean_data.loc[season_mismatch, "season"] = unfixed_data.loc[season_mismatch, "season_clean"]

fixed_ids = unfixed_data.loc[season_mismatch, "order_id"]
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Incorrect season"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[unfixed_data.loc[season_mismatch].index, ["order_id", "date", "season"]].to_string(index=False))

Number of mismatched seasons: 27
Fixed 27 rows.
 order_id       date season
ORD209240 2019-02-07 Summer
ORD419503 2019-09-21 Spring
ORD277275 2019-12-26 Summer
ORD123382 2019-01-06 Summer
ORD209230 2019-09-03 Spring
ORD132264 2019-10-14 Spring
ORD192930 2019-08-30 Winter
ORD172628 2019-03-30 Autumn
ORD103002 2019-10-17 Spring
ORD040501 2019-10-17 Spring
ORD344249 2019-10-23 Spring
ORD222038 2019-11-04 Spring
ORD026633 2019-01-16 Summer
ORD377228 2019-09-24 Spring
ORD455600 2019-02-18 Summer
ORD468030 2019-05-06 Autumn
ORD478782 2019-07-26 Winter
ORD363647 2019-12-01 Summer
ORD345384 2019-11-25 Spring
ORD127439 2019-07-25 Winter
ORD493849 2019-04-24 Autumn
ORD492808 2019-08-07 Winter
ORD495324 2019-07-10 Winter
ORD385926 2019-09-10 Spring
ORD462038 2019-07-10 Winter
ORD434426 2019-06-14 Winter
ORD387883 2019-10-20 Spring


In [26]:
# Fix date
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()

date_map = {
    "Summer": "2019-01-15",
    "Autumn": "2019-04-15",
    "Winter": "2019-07-15",
    "Spring": "2019-10-15"
}

missing_date = unfixed_data["date"].isna()
print("Number of missing dates:", missing_date.sum())

valid_season = unfixed_data["season"].isin(date_map)

date_index = unfixed_data.index[missing_date & valid_season]

correct_date = pd.to_datetime(unfixed_data.loc[date_index, "season"].map(date_map), errors="coerce")

clean_data.loc[date_index, "date"] = correct_date

fixed_ids = unfixed_data.loc[date_index, "order_id"]
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Missing date"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[date_index, ["order_id", "date", "season"]].to_string(index=False))

Number of missing dates: 27
Fixed 27 rows.
 order_id       date season
ORD164387 2019-01-15 Summer
ORD066446 2019-07-15 Winter
ORD312565 2019-01-15 Summer
ORD181051 2019-01-15 Summer
ORD046408 2019-10-15 Spring
ORD219265 2019-07-15 Winter
ORD006455 2019-10-15 Spring
ORD084861 2019-07-15 Winter
ORD438655 2019-01-15 Summer
ORD234563 2019-10-15 Spring
ORD199817 2019-01-15 Summer
ORD113549 2019-04-15 Autumn
ORD489756 2019-10-15 Spring
ORD273300 2019-10-15 Spring
ORD194653 2019-01-15 Summer
ORD480775 2019-10-15 Spring
ORD160619 2019-07-15 Winter
ORD402436 2019-04-15 Autumn
ORD311888 2019-04-15 Autumn
ORD491911 2019-07-15 Winter
ORD265708 2019-01-15 Summer
ORD060082 2019-01-15 Summer
ORD469475 2019-01-15 Summer
ORD461231 2019-07-15 Winter
ORD047863 2019-04-15 Autumn
ORD499923 2019-10-15 Spring
ORD036565 2019-10-15 Spring


In [27]:
# Fix customer_lat and customer_long
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()

swapped_coords = (unfixed_data["customer_lat"].between(-180, 180) &
                  unfixed_data["customer_long"].between(-90, 90))
print("Number of swapped coordinates:", swapped_coords.sum())

swapped_coords_index = unfixed_data.index[swapped_coords]

clean_data.loc[swapped_coords_index, ["customer_lat", "customer_long"]] = (clean_data.loc[swapped_coords_index, ["customer_long", "customer_lat"]].to_numpy())

fixed_ids = clean_data.loc[swapped_coords_index, "order_id"].tolist()
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Swapped coordinates"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[swapped_coords_index, ["order_id", "customer_lat", "customer_long"]].to_string(index=False))

Number of swapped coordinates: 27
Fixed 27 rows.
 order_id  customer_lat  customer_long
ORD091929    -37.815878     144.959364
ORD299508    -37.823816     145.009445
ORD074143    -37.819701     144.960234
ORD392203    -37.812101     144.973944
ORD208957    -37.810785     144.977354
ORD090831    -37.797624     144.993262
ORD062280    -37.816990     144.961790
ORD155978    -37.824991     144.949411
ORD493957    -37.801182     144.976899
ORD083244    -37.818479     144.977813
ORD373348    -37.793879     144.985178
ORD055195    -37.806415     144.983469
ORD125480    -37.810368     144.961303
ORD285476    -37.802954     144.935968
ORD349254    -37.805420     144.947587
ORD048052    -37.801171     145.004517
ORD479919    -37.815613     144.977507
ORD326763    -37.805420     144.928230
ORD442562    -37.820067     144.968618
ORD063341    -37.827219     144.988143
ORD012510    -37.799791     144.954950
ORD070213    -37.801354     144.946328
ORD481618    -37.816652     144.988204
ORD202182    -3

In [28]:
# Fix nearest_warehouse
# 1. Actual nearest warehouse
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()
unfixed_index = unfixed_data.index

ori_wh = merge_warehouse_dist.loc[unfixed_index, "nearest_warehouse_clean"].astype(str).str.strip()
actual_wh = merge_warehouse_dist.loc[unfixed_index, "actual_nearest_warehouse"].astype(str).str.strip()

incorrect_wh = (ori_wh != actual_wh)
correct_dist = ((merge_warehouse_dist.loc[unfixed_index, "distance_to_nearest_warehouse"] -
                 merge_warehouse_dist.loc[unfixed_index, "actual_nearest_distance"]).abs() <= 0)

wh_fix = incorrect_wh & correct_dist
incorrect_wh_index = merge_warehouse_dist.loc[unfixed_index].index[wh_fix]
print("Number of incorrect nearest warehouse name:", len(incorrect_wh_index))

clean_data.loc[incorrect_wh_index, "nearest_warehouse"] = (merge_warehouse_dist.loc[incorrect_wh_index, "actual_nearest_warehouse"].values)

fixed_ids = clean_data.loc[incorrect_wh_index, "order_id"].values
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Incorrect nearest_warehouse"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[incorrect_wh_index, ["order_id", "nearest_warehouse"]].to_string(index=False))

Number of incorrect nearest warehouse name: 20
Fixed 20 rows.
 order_id nearest_warehouse
ORD237879          Thompson
ORD052629            Bakers
ORD461915          Thompson
ORD069936         Nickolson
ORD461426          Thompson
ORD015774          Thompson
ORD120949          Thompson
ORD216010            Bakers
ORD138742          Thompson
ORD334316         Nickolson
ORD081635         Nickolson
ORD025929          Thompson
ORD201338          Thompson
ORD436885            Bakers
ORD224300          Thompson
ORD471229         Nickolson
ORD164906         Nickolson
ORD118183            Bakers
ORD300512            Bakers
ORD426908          Thompson


In [29]:
# Fix distance_to_nearest_warehouse
# Haversine distance
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()
unfixed_index = unfixed_data.index

ori_wh = merge_warehouse_dist.loc[unfixed_index, "nearest_warehouse_clean"].astype(str).str.strip()
actual_wh = merge_warehouse_dist.loc[unfixed_index, "actual_nearest_warehouse"].astype(str).str.strip()

correct_wh = (ori_wh == actual_wh)

actual_nearest_distance = np.around(merge_warehouse_dist.loc[unfixed_index, "actual_nearest_distance"].values, 4)

ori_distance = merge_warehouse_dist.loc[unfixed_index, "distance_to_nearest_warehouse"].values
mismatch_dist = np.abs(ori_distance - actual_nearest_distance) > 0.1

dist_fix = correct_wh & mismatch_dist
incorrect_dist_index = merge_warehouse_dist.loc[unfixed_index].index[dist_fix]
print("Number of incorrect distance to nearest warehouse:", len(incorrect_dist_index))

clean_data.loc[incorrect_dist_index, "distance_to_nearest_warehouse"] = (merge_warehouse_dist.loc[incorrect_dist_index, "actual_nearest_distance"].values)

fixed_ids = clean_data.loc[incorrect_dist_index, "order_id"].values
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Incorrect distance_to_nearest_warehouse"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[incorrect_dist_index, ["order_id", "distance_to_nearest_warehouse"]].to_string(index=False))

Number of incorrect distance to nearest warehouse: 23
Fixed 23 rows.
 order_id  distance_to_nearest_warehouse
ORD418280                         0.7663
ORD020055                         1.4421
ORD483341                         0.9439
ORD106152                         1.1906
ORD177750                         1.8275
ORD452867                         0.8989
ORD393540                         1.6016
ORD135624                         1.7146
ORD471922                         1.8650
ORD385052                         0.8499
ORD307211                         1.1409
ORD227802                         0.8424
ORD041333                         0.6523
ORD378423                         0.9532
ORD321986                         1.1383
ORD205213                         0.3711
ORD440634                         0.7465
ORD416698                         2.7735
ORD049973                         0.8999
ORD005273                         0.8712
ORD048269                         0.9092
ORD175303                    

In [30]:
# Fix nearest_warehouse
# 2. Naming inconsistency
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()

valid_names = warehouse_data["names"].str.strip().str.title()
invalid_names = ~unfixed_data["nearest_warehouse"].astype(str).str.strip().isin(valid_names)

invalid_name_index = unfixed_data.index[invalid_names]
print("Number of invalid warehouse names:", len(invalid_name_index))

clean_data.loc[invalid_name_index, "nearest_warehouse"] = (clean_data.loc[invalid_name_index, "nearest_warehouse"].astype(str).str.strip().str.title())

fixed_ids = clean_data.loc[invalid_name_index, "order_id"].values
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Inconsistent nearest_warehouse naming"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[invalid_name_index, ["order_id", "nearest_warehouse"]].to_string(index=False))

Number of invalid warehouse names: 7
Fixed 7 rows.
 order_id nearest_warehouse
ORD256861          Thompson
ORD014442          Thompson
ORD166717          Thompson
ORD169718         Nickolson
ORD393258          Thompson
ORD016571         Nickolson
ORD099672          Thompson


In [31]:
# Individual item price
fixed_data = clean_data[clean_data["order_id"].isin(set(fix_log["order_id"]))].copy()

cart_map = shopping_cart_check.set_index("order_id")["shopping_cart_parsed"]
fixed_data["cart_parsed"] = fixed_data["order_id"].map(cart_map)

all_items = [item for cart in fixed_data["cart_parsed"] for (item, qty) in cart]
item_names = pd.Series(all_items).value_counts().index.tolist()

index = {n: i for i, n in enumerate(item_names)}
K = len(item_names)

rows, targets = [], []
for _, r in fixed_data.iterrows():
    v = np.zeros(K)
    for (name, qty) in r["cart_parsed"]:
        if name in index:
            v[index[name]] += float(qty)
            rows.append(v)
            targets.append(float(r["order_price"]))

A = np.vstack(rows)
b = np.array(targets, dtype=float)

unit_prices, *_ = np.linalg.lstsq(A, b, rcond=None)
price_map = pd.Series(unit_prices, index=item_names).round(2)

print(price_map.sort_index())

Alcon 10          8950.0
Candle Inferno     430.0
Lucent 330S       1230.0
Olivia x460       1225.0
Thunder line      2180.0
Toshika 750       4320.0
Universe Note     3450.0
iAssist Line      2225.0
iStream            150.0
pearTV            6310.0
dtype: float64


In [32]:
# Check shopping_cart, order_price, order_total is correct
unfixed_data = clean_data[~clean_data["order_id"].isin(set(fix_log["order_id"]))].copy()
unfixed_data["cart_parsed"] = unfixed_data["order_id"].map(shopping_cart_check.set_index("order_id")["shopping_cart_parsed"])

def order_price_check(cart):
  '''
  Calculates the order_price based on individual item price
  '''
  return round(sum(price_map[item]*float(qty) for item, qty in cart), 2)

# Compute expected values based on individual item price
unfixed_data["expected_order_price"] = unfixed_data["cart_parsed"].apply(order_price_check)
unfixed_data["expected_order_total"] = (unfixed_data["expected_order_price"] * (1 - unfixed_data["coupon_discount"]/100) + unfixed_data["delivery_charges"]).round(2)

# Compute the difference for order_price and order_total
unfixed_data["price_diff"] = (unfixed_data["expected_order_price"] - unfixed_data["order_price"]).abs().round(2)
unfixed_data["total_diff"] = (unfixed_data["expected_order_total"] - unfixed_data["order_total"]).abs().round(2)

# Compare the expected order_price and order_total with the record and check if it matches
unfixed_data["price_match"] = unfixed_data["price_diff"] <= 0
unfixed_data["total_match"] = unfixed_data["total_diff"] <= 0

# Mismatched rows
mismatch_order = unfixed_data[(~unfixed_data["price_match"]) | (~unfixed_data["total_match"])].copy()

# Output
output = ["order_id", "shopping_cart", "order_price",
        "expected_order_price", "price_diff", "price_match",
        "order_total", "expected_order_total", "total_diff", "total_match"]

print(f"Rows mismatch order:", len(mismatch_order))
print(mismatch_order[output].to_string(index=False))

Rows mismatch order: 81
 order_id                                                                        shopping_cart  order_price  expected_order_price  price_diff  price_match  order_total  expected_order_total  total_diff  total_match
ORD012542                                            [('Thunder line', 1), ('Toshika 750', 2)]         6740               10820.0      4080.0        False      9809.79               9809.79        0.00         True
ORD136268                                               [('Alcon 10', 2), ('iAssist Line', 2)]         2600               22350.0     19750.0        False     21331.40              21331.40        0.00         True
ORD396615                              [('Olivia x460', 1), ('Lucent 330S', 1), ('pearTV', 2)]        14010               15075.0      1065.0        False     15151.80              15151.80        0.00         True
ORD038061                               [('Candle Inferno', 2), ('pearTV', 1), ('iStream', 1)]         8920         

In [33]:
# Classify the mismatched rows into shopping_cart error, order_price error, order_total error
# List out the item names
item_names = list(price_map.index)

# Function to check if row will be correct by swapping item
def swap_item(row):
  '''
  Check if order_price and order_total can be correct by swapping between each item in the cart
  '''
  cart = row["cart_parsed"]

  price_diff = float(row["order_price"]  - row["expected_order_price"])
  total_diff = float(row["order_total"]  - row["expected_order_total"])
  discount  = 1 - float(row["coupon_discount"])/100

  for old_name, qty in cart:
    old_p = float(price_map[old_name])
    for new_name in item_names:
      if new_name == old_name:
        continue
      new_p = float(price_map[new_name])
      delta_price = (new_p - old_p) * float(qty)
      if (round(abs(delta_price - price_diff), 2)<= 0 and
          round(abs(delta_price * discount - total_diff), 2) <= 0):
        return True, {"incorrect item": old_name, "correct item": new_name, "qty": float(qty)}
  return False, {}

# Classify the mismatched rows to their error type
error_cat = []
error_details = []

for _, r in mismatch_order.iterrows():
    # shopping_cart error
    can_swap, swap_info = swap_item(r)
    if can_swap:
        error_cat.append("shopping_cart_error")
        error_details.append(swap_info)
        continue

    # order_price error
    incorrect_price = abs(float(r["expected_order_price"] - r["order_price"])) > 0
    correct_total = abs(float(r["expected_order_total"] - r["order_total"])) <= 0
    if incorrect_price and correct_total:
        error_cat.append("order_price_error")
        error_details.append({})
        continue

    # order_total error
    correct_price  = not incorrect_price
    incorrect_total = not correct_total
    if correct_price and incorrect_total:
        error_cat.append("order_total_error")
        error_details.append({})
        continue

    # unknown error
    error_cat.append("unknown_error")
    error_details.append({})

# Create new columns to store error_type and error_details
mismatch_order["error_cat"] = error_cat
mismatch_order["error_details"]  = error_details

# Summary for error type
error_summary = (mismatch_order["error_cat"]
                 .value_counts()
                 .rename_axis("error category")
                 .reset_index(name="rows"))

# Print out the order_id and error_details for shopping_cart error
print(error_summary.to_string(index=False))
print("Detected shopping_cart error:")
print(mismatch_order.loc[mismatch_order["error_cat"]=="shopping_cart_error", ["order_id","error_details"]].to_string(index=False))

     error category  rows
  order_price_error    27
shopping_cart_error    27
  order_total_error    27
Detected shopping_cart error:
 order_id                                                                     error_details
ORD038061   {'incorrect item': 'Candle Inferno', 'correct item': 'Lucent 330S', 'qty': 2.0}
ORD157781       {'incorrect item': 'iStream', 'correct item': 'Candle Inferno', 'qty': 2.0}
ORD135183     {'incorrect item': 'Lucent 330S', 'correct item': 'Thunder line', 'qty': 1.0}
ORD108147        {'incorrect item': 'pearTV', 'correct item': 'Candle Inferno', 'qty': 2.0}
ORD292260    {'incorrect item': 'Lucent 330S', 'correct item': 'Universe Note', 'qty': 2.0}
ORD286146   {'incorrect item': 'Toshika 750', 'correct item': 'Candle Inferno', 'qty': 1.0}
ORD258960         {'incorrect item': 'Olivia x460', 'correct item': 'Alcon 10', 'qty': 2.0}
ORD489080        {'incorrect item': 'iAssist Line', 'correct item': 'Alcon 10', 'qty': 1.0}
ORD063030  {'incorrect item': 'Thunder

In [34]:
# Fix shopping_cart error
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()
unfixed_index = unfixed_data.index

cart_error = (mismatch_order["error_cat"] == "shopping_cart_error")
cart_error_rows = mismatch_order.loc[cart_error, ["order_id", "error_details"]].copy()

cart_error_swap = dict(zip(cart_error_rows["order_id"], cart_error_rows["error_details"]))

def swap_shopping_cart(cart_list, old_name, new_name, qty):
  new_list = []
  swapped = False
  for name, q in cart_list:
    if (not swapped) and (name == old_name) and (float(q) == float(qty)):
      new_list.append((new_name, q))
      swapped = True
    else:
      new_list.append((name, q))
  if not swapped:
    for i, (n, q) in enumerate(new_list):
      if n == old_name:
        new_list[i] == (new_name , q)
        swapped = True
        break
  return new_list, swapped

cart_fix = clean_data["order_id"].isin(cart_error_swap.keys())
cart_index = clean_data.index[cart_fix & clean_data.index.isin(unfixed_index)]

for i in cart_index:
  old_id = clean_data.at[i, "order_id"]
  info = cart_error_swap[old_id]
  old_name = info["incorrect item"]
  new_name = info["correct item"]
  qty = info["qty"]

  raw = clean_data.at[i, "shopping_cart"]

  cart_list = list(ast.literal_eval(raw))

  new_cart, did_swap = swap_shopping_cart(cart_list, old_name, new_name, qty)
  if did_swap:
    clean_data.at[i, "shopping_cart"] = repr(new_cart)

fixed_ids = clean_data.loc[cart_index, "order_id"].values
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Incorrect shopping_cart"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[cart_index, ["order_id", "shopping_cart"]].to_string(index=False))

Fixed 27 rows.
 order_id                                                                       shopping_cart
ORD038061                                 [('Lucent 330S', 2), ('pearTV', 1), ('iStream', 1)]
ORD157781                                         [('Candle Inferno', 2), ('Olivia x460', 2)]
ORD135183           [('pearTV', 2), ('Olivia x460', 2), ('Alcon 10', 1), ('Thunder line', 1)]
ORD108147                                       [('Universe Note', 2), ('Candle Inferno', 2)]
ORD292260         [('Universe Note', 2), ('Toshika 750', 2), ('iStream', 1), ('Alcon 10', 2)]
ORD286146                                       [('Candle Inferno', 1), ('Universe Note', 2)]
ORD258960                              [('Alcon 10', 2), ('pearTV', 1), ('Universe Note', 2)]
ORD489080                             [('Alcon 10', 1), ('Candle Inferno', 2), ('pearTV', 1)]
ORD063030                                            [('Candle Inferno', 2), ('Alcon 10', 2)]
ORD035025                           [('Thunde

In [35]:
# Fix order_price
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()

price_error = (mismatch_order["error_cat"] == "order_price_error")
price_error_rows = set(mismatch_order.loc[price_error, "order_id"])

price_error_unfixed = mismatch_order.loc[mismatch_order["order_id"].isin(price_error_rows), ["order_id", "expected_order_price"]]
expected_price_map = dict(zip(price_error_unfixed["order_id"], price_error_unfixed["expected_order_price"]))

price_fix = clean_data["order_id"].isin(expected_price_map.keys())
price_index = clean_data.index[price_fix]

clean_data.loc[price_index, "order_price"] = (clean_data.loc[price_index, "order_id"].map(expected_price_map))

fixed_ids = clean_data.loc[price_index, "order_id"].values
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Incorrect order_price"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[price_index, ["order_id", "order_price"]].to_string(index=False))

Fixed 27 rows.
 order_id  order_price
ORD012542        10820
ORD136268        22350
ORD396615        15075
ORD276861        22380
ORD097415        22010
ORD282385        20045
ORD368942         7130
ORD181929        21520
ORD377946        10210
ORD431838        11700
ORD189464        10315
ORD221277        19275
ORD294135         9500
ORD007117        10165
ORD354953         2330
ORD455667        20470
ORD203938        13870
ORD267973        25710
ORD486573        24225
ORD411297         5550
ORD470970        11400
ORD363756        15685
ORD113541         7970
ORD291184        23655
ORD199032        15970
ORD208094         7810
ORD023631        11990


In [36]:
# Fix order_total
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()

total_error = (mismatch_order["error_cat"] == "order_total_error")
total_error_rows = set(mismatch_order.loc[total_error, "order_id"])

total_error_unfixed = mismatch_order.loc[mismatch_order["order_id"].isin(total_error_rows), ["order_id", "expected_order_total"]]
expected_total_map = dict(zip(total_error_unfixed["order_id"], total_error_unfixed["expected_order_total"]))

total_fix = clean_data["order_id"].isin(expected_total_map.keys())
total_index = clean_data.index[total_fix]

clean_data.loc[total_index, "order_total"] = (clean_data.loc[total_index, "order_id"].map(expected_total_map))

fixed_ids = clean_data.loc[total_index, "order_id"].values
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Incorrect order_total"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[total_index, ["order_id", "order_total"]].to_string(index=False))

Fixed 27 rows.
 order_id  order_total
ORD159650      3028.94
ORD105180     11957.17
ORD390068     11024.52
ORD098805     13718.93
ORD371831      7534.92
ORD115991      9139.41
ORD460642      3745.31
ORD293767     11196.19
ORD330007     22399.00
ORD319958     22753.58
ORD069151     24451.51
ORD185952      6195.12
ORD052541     23890.43
ORD372162     16792.73
ORD200848     20273.34
ORD405673      8228.93
ORD455855     31816.16
ORD022783     13428.47
ORD153936     12820.31
ORD421787     23579.36
ORD034671      5499.14
ORD421351      2243.80
ORD187672     15613.34
ORD153201     14470.76
ORD496180     14523.18
ORD230545     12202.81
ORD089659      2051.85


In [37]:
# Fix is_happy_customer
unfixed_data = clean_data[~clean_data["order_id"].isin(fix_log["order_id"])].copy()
unfixed_index = unfixed_data.index

incorrect_happy_rows = is_happy_check.loc[~is_happy_check["is_correct"], ["order_id", "happy_prediction"]]

incorrect_happy_id = incorrect_happy_rows["order_id"]

happy_index = clean_data.index[clean_data["order_id"].isin(incorrect_happy_id) & clean_data.index.isin(unfixed_index)]

happy_target = clean_data.loc[happy_index, "order_id"]
happy_fix = (is_happy_check.set_index("order_id").reindex(happy_target)["happy_prediction"].values)

clean_data.loc[happy_index, "is_happy_customer"] = happy_fix

fixed_ids = clean_data.loc[happy_index, "order_id"].values
fix_log = pd.concat([fix_log,
                     pd.DataFrame({"order_id": fixed_ids,
                                   "error_type": "Incorrect is_happy_customer"})],
                    ignore_index=True)

print(f"Fixed {len(fixed_ids)} rows.")
print(clean_data.loc[happy_index, ["order_id", "is_happy_customer"]].to_string(index=False))

Fixed 27 rows.
 order_id  is_happy_customer
ORD216249               True
ORD412492               True
ORD457652               True
ORD066764               True
ORD352239               True
ORD268941               True
ORD478343               True
ORD064373               True
ORD370767               True
ORD252102              False
ORD330702               True
ORD408565               True
ORD480194               True
ORD494528               True
ORD083198              False
ORD241933              False
ORD246197               True
ORD208028               True
ORD251878               True
ORD405488               True
ORD115461               True
ORD363854               True
ORD435481               True
ORD256544               True
ORD319183               True
ORD102139               True
ORD366292              False


In [38]:
# # Check if is_expedited_delivery is correct with linear model

# # Plot a boxplot for distribution
# sns.boxplot(data=expedited_check,
#             x="is_expedited_delivery",
#             y="delivery_charges",
#             hue="season_clean")

# # Run linear model to check the R2 score
# cols = ["is_expedited_delivery", "delivery_charges", "season"]

1.2.2 Outlier Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# --- Make a backup of outlier dataframe ---
df = outlier_data.copy()

# --- Step 1: Prepare features ---
X = df[['distance_to_nearest_warehouse', 'is_expedited_delivery', 'is_happy_customer', 'season']]
y = df['delivery_charges']

# Convert boolean to integer
X['is_expedited_delivery'] = X['is_expedited_delivery'].astype(int)
X['is_happy_customer'] = X['is_happy_customer'].astype(int)

# One-hot encode season (drop first to avoid multicollinearity)
encoder = OneHotEncoder(drop='first', sparse_output=False)
season_encoded = encoder.fit_transform(X[['season']])
season_encoded_df = pd.DataFrame(season_encoded, columns=encoder.get_feature_names_out(['season']))

# Combine base features and season dummies
X_base = X.drop(columns=['season']).reset_index(drop=True)
X_encoded = pd.concat([X_base, season_encoded_df], axis=1)

# --- Step 2: Add interaction terms between season and numeric predictors ---
for season_col in season_encoded_df.columns:
    for feature in ['distance_to_nearest_warehouse', 'is_expedited_delivery', 'is_happy_customer']:
        interaction_name = f"{feature}_x_{season_col}"
        X_encoded[interaction_name] = X_base[feature] * season_encoded_df[season_col]

# --- Step 3: Fit linear model ---
model = LinearRegression()
model.fit(X_encoded, y)

# --- Step 4: Compute predictions and residuals ---
df['predicted_delivery_charge'] = model.predict(X_encoded)
df['residual'] = df['delivery_charges'] - df['predicted_delivery_charge']

# --- Step 5: Robust Z-score (Median + MAD) of residuals ---
median_resid = df['residual'].median()
mad_resid = np.median(np.abs(df['residual'] - median_resid))
robust_z = 0.6745 * (df['residual'] - median_resid) / mad_resid

outlier_robust_mask = np.abs(robust_z) > 3.5   # Common robust z-score threshold
outliers_robust = df[outlier_robust_mask]

print("\n=== Outliers using robust z-score ===")
print(outliers_robust[['order_id', 'delivery_charges', 'predicted_delivery_charge', 'residual',]])

# --- Step 6: Filter outliers ---
filtered_robust = df[~outlier_robust_mask].reset_index(drop=True)

print(f"Removed {outliers_robust.shape[0]} outliers (robust-based), remaining: {filtered_robust.shape[0]}")
filtered_robust.to_csv('Group_035_outlier_data_solution.csv', index=False, na_rep="NaN")



=== Outliers using robust z-score ===
      order_id  delivery_charges  predicted_delivery_charge   residual
17   ORD089831           144.450                  96.864118  47.585882
28   ORD322216            71.475                  49.605252  21.869748
93   ORD276933            39.570                  76.640083 -37.070083
98   ORD142753            39.010                  73.782575 -34.772575
104  ORD317663            73.815                  51.904774  21.910226
109  ORD233575           119.955                  80.004734  39.950266
123  ORD099918            39.790                  80.272405 -40.482405
131  ORD171684           102.555                  72.340548  30.214452
137  ORD272158            33.335                  64.123499 -30.788499
143  ORD414922           143.385                  97.169650  46.215350
145  ORD482557           164.775                 112.698279  52.076721
167  ORD141312           126.705                  88.620992  38.084008
168  ORD109768            26.515      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['is_expedited_delivery'] = X['is_expedited_delivery'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['is_happy_customer'] = X['is_happy_customer'].astype(int)


1.2.3 Missing Data

In [40]:
# Show how much missing data there is
print(missing_data.isna().sum())

order_id                          0
customer_id                       0
date                              0
nearest_warehouse                55
shopping_cart                     0
order_price                      15
delivery_charges                 40
customer_lat                      0
customer_long                     0
coupon_discount                   0
order_total                      15
season                            0
is_expedited_delivery             0
distance_to_nearest_warehouse    31
latest_customer_review            0
is_happy_customer                40
dtype: int64


In [None]:
# # Build catalog of items and their respective prices
# import ast
# import numpy as np


# # Collect unique items
# all_items = sorted({item for cart in missing_data['parsed_cart'] for item, _ in cart})
# item_index = {item: i for i, item in enumerate(all_items)}

# # Build system of equations A x = b
# rows, b = [], []
# for _, row in missing_data.iterrows():
#     if pd.notna(row['order_price']) and row['parsed_cart']:
#         vec = np.zeros(len(all_items))
#         for item, qty in row['parsed_cart']:
#             vec[item_index[item]] += qty
#         rows.append(vec)
#         b.append(row['order_price'])

# A = np.vstack(rows)
# b = np.array(b)

# # Solve least squares for unit prices
# x, _, _, _ = np.linalg.lstsq(A, b, rcond=None)

# # Round to 2 decimals for catalog - prices should not have more than 1 cent precision
# x = np.round(x, 2)

# # Build catalog
# catalog = {item: price for item, price in zip(all_items, x)}


In [None]:
# Impute missing data
# Parse shopping_cart into list of tuples
def parse_cart(cart_str):
    if pd.isna(cart_str):
        return []
    try:
        return ast.literal_eval(cart_str)
    except Exception:
        return []

missing_data['parsed_cart'] = missing_data['shopping_cart'].apply(parse_cart)
# --- Imputation logic ---
# We determine the values to impute from the columns from which they are dependent on.
# This is imputation by rule calculation, and is definitively correct assuming the relationships and the other columns are correct.
def impute_row(row, warehouses, price_map):
    # --- Impute nearest_warehouse ---
    if pd.isna(row['nearest_warehouse']) and pd.notna(row['customer_lat']) and pd.notna(row['customer_long']):
        distances = warehouses.apply(
            lambda wh: haversine_dist(row['customer_lat'], row['customer_long'], wh['lat'], wh['lon']), axis=1
        )
        nearest_idx = distances.idxmin()
        row['nearest_warehouse'] = warehouses.loc[nearest_idx, 'names']
        row['distance_to_nearest_warehouse'] = distances.min()

    # --- Impute order_price ---
    if pd.isna(row['order_price']):
        if pd.notna(row['order_total']) and pd.notna(row['delivery_charges']) and pd.notna(row['coupon_discount']):
            denom = (100 - row['coupon_discount']) / 100
            if denom != 0:
                row['order_price'] = (row['order_total'] - row['delivery_charges']) / denom
        elif pd.isna(row['order_total']) and row['parsed_cart']:  # fallback to catalog
            if row['parsed_cart']:
                items, qtys = zip(*row['parsed_cart'])
                prices = price_map.reindex(items).fillna(0).values
                row['order_price'] = np.dot(prices, qtys)

    # --- Impute delivery_charges ---
    if pd.isna(row['delivery_charges']):
        if pd.notna(row['order_total']) and pd.notna(row['order_price']) and pd.notna(row['coupon_discount']):
            denom = (100 - row['coupon_discount']) / 100
            row['delivery_charges'] = row['order_total'] - row['order_price'] * denom

    # --- Impute order_total ---
    if pd.isna(row['order_total']):
        if pd.notna(row['order_price']) and pd.notna(row['coupon_discount']) and pd.notna(row['delivery_charges']):
            denom = (100 - row['coupon_discount']) / 100
            row['order_total'] = row['order_price'] * denom + row['delivery_charges']

    # --- Impute distance_to_nearest_warehouse ---
    if pd.isna(row['distance_to_nearest_warehouse']):
        if pd.notna(row['nearest_warehouse']) and pd.notna(row['customer_lat']) and pd.notna(row['customer_long']):
            wh = warehouses.loc[warehouses['names'] == row['nearest_warehouse']].iloc[0]
            row['distance_to_nearest_warehouse'] = haversine_dist(row['customer_lat'], row['customer_long'], wh['lat'], wh['lon'])

    # --- Impute is_happy_customer from sentiment---
    if pd.isna(row['is_happy_customer']) and pd.notna(row['latest_customer_review']):
        sentiment = sia.polarity_scores(str(row['latest_customer_review']))
        row['is_happy_customer'] = sentiment['compound'] >= 0.05

    return row

# Apply
missing_data_imputed = missing_data.apply(lambda r: impute_row(r, warehouse_data, price_map), axis=1)
missing_data_imputed.to_csv('Group_035_missing_data_solution.csv', index=False, na_rep="NaN")

In [None]:
# price_map
# # Individual item price
# fixed_data = clean_data[clean_data["order_id"].isin(set(fix_log["order_id"]))].copy()

# cart_map = shopping_cart_check.set_index("order_id")["shopping_cart_parsed"]
# fixed_data["cart_parsed"] = fixed_data["order_id"].map(cart_map)

# all_items = [item for cart in fixed_data["cart_parsed"] for (item, qty) in cart]
# item_names = pd.Series(all_items).value_counts().index.tolist()

# index = {n: i for i, n in enumerate(item_names)}
# K = len(item_names)

# rows, targets = [], []
# for _, r in fixed_data.iterrows():
#     v = np.zeros(K)
#     for (name, qty) in r["cart_parsed"]:
#         if name in index:
#             v[index[name]] += float(qty)
#             rows.append(v)
#             targets.append(float(r["order_price"]))

# A = np.vstack(rows)
# b = np.array(targets, dtype=float)

# unit_prices, *_ = np.linalg.lstsq(A, b, rcond=None)
# price_map = pd.Series(unit_prices, index=item_names).round(2)

# print(price_map.sort_index())

In [45]:
# Verify imputation caught all missing data
if sum(missing_data_imputed.isna().sum()) == 0:
    print("No missing data remaining\n")
else:
    print(missing_data_imputed.isna().sum())
    rows_with_nulls = missing_data_imputed[missing_data_imputed.isnull().any(axis=1)]
    print(rows_with_nulls)

# print("Catalog item price:")
# for item in catalog:
#     print(f"{item} price: {catalog[item]}")

No missing data remaining



#### 1.1.3 Documentation
1. Date: There are 27 columns with NA values, which accounts for 5.4% of the dataset.