In [1]:
import pandas as pd

path = "project_dataset/python_raw_data/fake_orders_test_updated.csv/updated_dataset.csv"

df = pd.read_csv(path)
df.describe()

Unnamed: 0,order_id,store_address,products,products_total,purchase_total_price
count,60400.0,60400.0,60400.0,60400.0,60400.0
mean,32782650.0,41580.723692,2.811358,9.832693,10.703447
std,445362.7,21905.784563,2.431214,9.276841,10.781632
min,31503780.0,190.0,1.0,0.0,0.0
25%,32400910.0,21110.0,1.0,4.11,4.33
50%,32781100.0,45849.0,2.0,7.12,7.98
75%,33182790.0,61992.25,3.0,12.75,14.16
max,33560850.0,75236.0,39.0,221.48,265.76


In [2]:
df.head(10)

Unnamed: 0,order_id,activation_time_local,country_code,store_address,final_status,payment_status,products,products_total,purchase_total_price,authorized_or_not
0,31503775,2019-03-01T11:43:08.000+01:00,ES,15871,DeliveredStatus,PAID,1,1.85,14.02,under_authorized
1,31503965,2019-03-01T11:43:08.000+01:00,ES,15871,DeliveredStatus,PAID,3,6.15,12.21,under_authorized
2,31636675,2019-03-01T08:58:01.000+01:00,AR,61807,DeliveredStatus,PAID,4,1.18,9.76,under_authorized
3,31724509,2019-03-01T16:43:04.000+01:00,ES,16228,DeliveredStatus,PAID,5,11.07,12.52,under_authorized
4,31839133,2019-03-01T08:43:24.000+01:00,BR,20443,DeliveredStatus,PAID,1,1.29,11.19,under_authorized
5,31848304,2019-03-01T07:33:01.000+01:00,BR,49346,CanceledStatus,PAID,8,6.01,0.0,correctly_authorized
6,31867130,2019-03-01T10:03:11.000+01:00,ES,18683,DeliveredStatus,PAID,9,17.72,17.71,correctly_authorized
7,31867210,2019-03-01T12:03:07.000+01:00,IT,12347,CanceledStatus,PAID,2,18.0,0.0,correctly_authorized
8,31868928,2019-03-01T10:03:07.000+01:00,ES,29445,DeliveredStatus,PAID,18,24.65,31.68,under_authorized
9,31876071,2019-03-01T09:03:08.000+01:00,ES,16217,CanceledStatus,PAID,8,12.87,12.69,correctly_authorized


### 1. What percent of orders are under-authorized?

In [3]:
count_authorized_or_not = df['authorized_or_not'].value_counts()
count_authorized_or_not

total_data = count_authorized_or_not.sum() - ((df["final_status"] == "CanceledStatus").sum())

percentage_under_authorized =(count_authorized_or_not["under_authorized"]/total_data)*100

print(f"The percentage of under authorized orders is: {percentage_under_authorized:.2f}%")

The percentage of under authorized orders is: 64.53%


### 2. What percent of orders would be correctly authorized with incremental authorization (+20%) on the amount at checkout?

In [4]:
incremental_authorization = df["products_total"]*1.2
df["incremental_authorization"] = incremental_authorization

# compare the incremental_authorization price with purchase_total_price, using a filter
filter_1 = df["incremental_authorization"] > df["purchase_total_price"]

# count when filter returns true = when orders are incrementally authorized
new_auth_count = filter_1.sum()

# count percentage of orders authorized after incremental
count_original_authorised = (df["authorized_or_not"]=="correctly_authorized").sum()

new_percentage_order =((new_auth_count - count_original_authorised)/total_data) * 100
print(f"The percentage of orders being authorized after 20% incremental on price is:{new_percentage_order:.2f}%")


The percentage of orders being authorized after 20% incremental on price is:29.78%


### 3. Are there differences when split by country?

In [5]:
# define a funtion to calculate percentage of increased authorized orders by increment
def percentage_new_orders(df, count_original_auth):

    filter_1 = df["incremental_authorization"] > df["purchase_total_price"]
    total = len(df)
    increse_auth_order = ((filter_1.sum() - count_original_auth)/total) * 100

    return increse_auth_order

# extract country_code as unique
countries = df["country_code"].unique()

for m in countries:
    df_country = df[df["country_code"] == m]

    count_original_auth = (df_country["authorized_or_not"] == "correctly_authorized").sum()
    
    country_percentage = percentage_new_orders(df_country, count_original_auth)
    print(f"{m}: {country_percentage:.2f}% of incremental authorization")

# pass dictionary into function to return percentage of each country


ES: 33.65% of incremental authorization
AR: 26.95% of incremental authorization
BR: 0.00% of incremental authorization
IT: 25.25% of incremental authorization
FR: 32.91% of incremental authorization
RO: 18.60% of incremental authorization
UA: 21.29% of incremental authorization
TR: 19.52% of incremental authorization
EG: 28.50% of incremental authorization
MA: 18.20% of incremental authorization
PE: 33.92% of incremental authorization
CL: 0.00% of incremental authorization
EC: 26.75% of incremental authorization
GE: 20.15% of incremental authorization
KE: 29.79% of incremental authorization
PT: 22.00% of incremental authorization
DO: 25.60% of incremental authorization
PA: 27.64% of incremental authorization
CR: 24.75% of incremental authorization
UY: 22.11% of incremental authorization
GT: 12.87% of incremental authorization
PR: 30.30% of incremental authorization
CI: 0.00% of incremental authorization


### 4. For the remainder of orders that would be outside of incremental authorization, what values would be necessary to capture the remaining amount?

In [6]:
# create new DF that only contains under authorized orders after 20% incremental, and their original prices
filter_2 = df["incremental_authorization"] < df["purchase_total_price"]
df_under_auth = df.loc[filter_2,["order_id", "products_total", "purchase_total_price"]]

df_under_auth

Unnamed: 0,order_id,products_total,purchase_total_price
0,31503775,1.85,14.02
1,31503965,6.15,12.21
2,31636675,1.18,9.76
4,31839133,1.29,11.19
8,31868928,24.65,31.68
...,...,...,...
60374,33559487,6.16,12.31
60376,33559535,2.89,5.12
60380,33559739,6.32,11.43
60383,33559833,3.76,8.55


In [7]:
df_under_auth["incremental_required"] = (df_under_auth["purchase_total_price"] - df_under_auth["products_total"])/df_under_auth["products_total"]*100
df_under_auth

orders = df_under_auth["order_id"].unique()

for m in orders:
    value_required = df_under_auth.loc[df_under_auth["order_id"] == m, "incremental_required"].values[0]
    print(f"Order {m} requires {value_required:.2f}% of increase in order to be authorized")

Order 31503775 requires 657.84% of increase in order to be authorized
Order 31503965 requires 98.54% of increase in order to be authorized
Order 31636675 requires 727.12% of increase in order to be authorized
Order 31839133 requires 767.44% of increase in order to be authorized
Order 31868928 requires 28.52% of increase in order to be authorized
Order 31890972 requires 220.59% of increase in order to be authorized
Order 31908582 requires 84.39% of increase in order to be authorized
Order 31934292 requires 528.76% of increase in order to be authorized
Order 31949773 requires 84.54% of increase in order to be authorized
Order 31950805 requires 22.99% of increase in order to be authorized
Order 31965637 requires 100.29% of increase in order to be authorized
Order 31968802 requires 113.41% of increase in order to be authorized
Order 31969123 requires 65.38% of increase in order to be authorized
Order 31969645 requires 100.00% of increase in order to be authorized
Order 31972576 requires 76

### 5. Which stores are the most problematic in terms of orders and monetary value?

In [8]:
stores = df["store_address"].unique()
print(stores)

[15871 61807 16228 ... 73131 58894 52057]


In [9]:
cancellation_rate_data = {}

for m in stores:
    df_stores = df[df["store_address"] == m]
    
    count_cancelled_order = (df_stores["final_status"] == "CanceledStatus").sum()
    cancellation_rate = (count_cancelled_order/len(df_stores))*100
    
    cancellation_rate_data[m] = cancellation_rate
    
print(cancellation_rate_data)



{15871: 6.41025641025641, 61807: 6.666666666666667, 16228: 0.0, 20443: 0.0, 49346: 100.0, 18683: 4.3478260869565215, 12347: 100.0, 29445: 0.0, 16217: 5.88235294117647, 3160: 3.571428571428571, 70011: 0.0, 15659: 0.0, 21664: 17.391304347826086, 5273: 5.263157894736842, 16231: 6.451612903225806, 16276: 14.285714285714285, 29449: 2.0, 10706: 8.0, 15698: 0.0, 30123: 5.263157894736842, 71710: 0.0, 68820: 0.0, 34953: 0.0, 66297: 22.727272727272727, 66577: 0.0, 36481: 100.0, 66308: 22.22222222222222, 44528: 2.857142857142857, 47263: 20.0, 10696: 14.035087719298245, 52885: 3.8461538461538463, 69014: 8.108108108108109, 44473: 3.614457831325301, 58966: 0.0, 16202: 0.0, 59206: 14.285714285714285, 45651: 11.29032258064516, 47493: 0.0, 28725: 5.128205128205128, 34471: 8.0, 61693: 16.666666666666664, 59912: 6.666666666666667, 21293: 33.33333333333333, 39805: 0.0, 21292: 0.0, 68810: 0.0, 18682: 3.3333333333333335, 53878: 18.181818181818183, 34210: 33.33333333333333, 1463: 0.0, 26932: 33.3333333333333

In [10]:
df_cancellation_rate = pd.DataFrame(list(cancellation_rate_data.items()), columns=["store", "cancellation_rate"])

df_cancellation_rate_sorted = df_cancellation_rate.sort_values("cancellation_rate", ascending=False)

df_cancellation_rate_sorted.head()

Unnamed: 0,store,cancellation_rate
5754,52057,100.0
4086,57755,100.0
4093,32201,100.0
4121,62246,100.0
4123,66397,100.0


In [11]:
# ranking by number of under_authorized orders
under_authorized_list = []

for m in stores:
    df_stores = df[df["store_address"] == m]
    
    under_authorized_count = (df_stores["authorized_or_not"] == "under_authorized").sum()

    under_authorized_list.append([m, under_authorized_count])

print(under_authorized_list)
    


[[15871, 54], [61807, 14], [16228, 1], [20443, 1], [49346, 0], [18683, 16], [12347, 0], [29445, 30], [16217, 10], [3160, 27], [70011, 8], [15659, 5], [21664, 10], [5273, 11], [16231, 19], [16276, 4], [29449, 35], [10706, 36], [15698, 2], [30123, 37], [71710, 0], [68820, 3], [34953, 3], [66297, 15], [66577, 8], [36481, 0], [66308, 1], [44528, 28], [47263, 17], [10696, 40], [52885, 17], [69014, 8], [44473, 65], [58966, 3], [16202, 8], [59206, 10], [45651, 21], [47493, 12], [28725, 6], [34471, 13], [61693, 3], [59912, 10], [21293, 0], [39805, 7], [21292, 2], [68810, 2], [18682, 25], [53878, 5], [34210, 1], [1463, 19], [26932, 2], [72651, 19], [528, 38], [3063, 16], [64309, 12], [44520, 58], [53437, 0], [55159, 2], [62504, 1], [50175, 1], [14455, 179], [33453, 8], [63293, 0], [62623, 10], [57711, 3], [62524, 2], [52671, 2], [13555, 0], [4394, 9], [61798, 3], [55115, 83], [52981, 0], [17323, 47], [25709, 22], [33382, 5], [50374, 7], [29774, 14], [70008, 3], [59011, 12], [72880, 3], [21467, 

In [12]:
df_under_auth = pd.DataFrame(under_authorized_list, columns=["stores", "count_no_authorized"])

df_under_auth.sort_values("count_no_authorized", ascending=False).head(10)

Unnamed: 0,stores,count_no_authorized
730,28671,448
767,28712,217
803,28286,204
60,14455,179
1582,11694,160
814,28669,158
559,12513,126
966,55206,120
1827,27635,116
239,62935,111


### 6. For under-authorized orders, is there a correlation between the difference in the prices and the cancellation of the order? In other words: Is an order more likely to be cancelled as the price difference increases?

In [33]:
# extract columns of products_total, purchase_total_price and final_status
filter_3 = df["authorized_or_not"] == "under_authorized"
df_extract = df.loc[filter_3, ["products_total", "purchase_total_price", "final_status"]]

# add a new column of price diffference
df_extract["price_differ"] = df_extract["purchase_total_price"] - df["products_total"]

# assign alias for final_status: DeliveredStatus-1, CanceledStatus-0
alias = {"DeliveredStatus": 1, "CanceledStatus": 0}

df_extract["final_status"] = df_extract["final_status"].map(alias)

df_extract

Unnamed: 0,products_total,purchase_total_price,final_status,price_differ
0,1.85,14.02,1,12.17
1,6.15,12.21,1,6.06
2,1.18,9.76,1,8.58
3,11.07,12.52,1,1.45
4,1.29,11.19,1,9.90
...,...,...,...,...
60389,8.65,10.59,1,1.94
60390,6.05,7.26,1,1.21
60391,6.59,7.39,1,0.80
60393,2.69,3.09,1,0.40


In [42]:
# check if there are null value
df_extract["final_status"].isnull().sum()


# calculate correlation between column "final_status" = CanceledStatus and "price_differ"
# print(df_extract["price_differ"]).corr(df_extract["final_status"])
correlation = df_extract["price_differ"].corr(df_extract["final_status"])
correlation

-0.12372105249553941