# Different approach for analysing dataframe

In [153]:
import pandas as pd
import json

with open('dataset/data.ndjson', 'r') as file:
    data = [json.loads(line) for line in file]

# Normalize the data and handle nested structures
df = pd.json_normalize(data, sep='_')



# Flatten the 'products' column with a prefix
df_products = pd.json_normalize(data, record_path='products', meta=['id', 'created', ['user', 'id'], ['user', 'name'], ['user', 'city']],
                                 sep='_', errors='ignore', meta_prefix='order_')

# Display the resulting DataFrame
df_products


Unnamed: 0,id,name,price,order_id,order_created,order_user_id,order_user_name,order_user_city
0,0,Product A,160,0,1542328144,3,User D,Sydney
1,8,Product I,80,0,1542328144,3,User D,Sydney
2,3,Product D,130,1,1544115833,0,User A,Prague
3,2,Product C,140,1,1544115833,0,User A,Prague
4,13,Product N,30,1,1544115833,0,User A,Prague
...,...,...,...,...,...,...,...,...
549140,3,Product D,130,99999,1539896610,7,User H,Kuala Lumpur
549141,12,Product M,40,99999,1539896610,7,User H,Kuala Lumpur
549142,4,Product E,120,99999,1539896610,7,User H,Kuala Lumpur
549143,0,Product A,160,99999,1539896610,7,User H,Kuala Lumpur


### This time I'm following my sql solutions, thus I'm not using quantity column, but rather for each order_id I'm adding multiple rows of products, even if it is the same product.

In [155]:
df_result = df_products
# Convert 'created' to datetime.dt.tz_localize(None)
df_result['order_created'] = pd.to_datetime(df_result['order_created'], unit='s', utc=True)
df_result['order_created'] = df_result['order_created'].dt.tz_convert('Europe/Prague')

# Rename columns for clarity
df_result = df_result.rename(columns={'order_id': 'order_id', 'order_created': 'created', 'id': 'product_id', 'order_user_id': 'user_id', 'order_user_name': 'user_name', 'order_user_city': 'user_city'})

# Display the resulting DataFrame
df_result


Unnamed: 0,product_id,name,price,order_id,created,user_id,user_name,user_city
0,0,Product A,160,0,2018-11-16 01:29:04+01:00,3,User D,Sydney
1,8,Product I,80,0,2018-11-16 01:29:04+01:00,3,User D,Sydney
2,3,Product D,130,1,2018-12-06 18:03:53+01:00,0,User A,Prague
3,2,Product C,140,1,2018-12-06 18:03:53+01:00,0,User A,Prague
4,13,Product N,30,1,2018-12-06 18:03:53+01:00,0,User A,Prague
...,...,...,...,...,...,...,...,...
549140,3,Product D,130,99999,2018-10-18 23:03:30+02:00,7,User H,Kuala Lumpur
549141,12,Product M,40,99999,2018-10-18 23:03:30+02:00,7,User H,Kuala Lumpur
549142,4,Product E,120,99999,2018-10-18 23:03:30+02:00,7,User H,Kuala Lumpur
549143,0,Product A,160,99999,2018-10-18 23:03:30+02:00,7,User H,Kuala Lumpur


In [156]:
# Selecting only Fridays
df_fridays = df_result[df_result['created'].dt.dayofweek == 4]  # Monday is 0, Sunday is 6
df_fridays

Unnamed: 0,product_id,name,price,order_id,created,user_id,user_name,user_city
0,0,Product A,160,0,2018-11-16 01:29:04+01:00,3,User D,Sydney
1,8,Product I,80,0,2018-11-16 01:29:04+01:00,3,User D,Sydney
16,10,Product K,60,4,2018-11-30 10:45:30+01:00,5,User F,Jakarta
17,3,Product D,130,4,2018-11-30 10:45:30+01:00,5,User F,Jakarta
18,2,Product C,140,4,2018-11-30 10:45:30+01:00,5,User F,Jakarta
...,...,...,...,...,...,...,...,...
549064,14,Product O,20,99979,2018-11-16 08:34:59+01:00,2,User C,Singapore
549087,15,Product P,10,99986,2018-11-09 14:00:16+01:00,1,User B,Brno
549088,14,Product O,20,99986,2018-11-09 14:00:16+01:00,1,User B,Brno
549089,1,Product B,150,99986,2018-11-09 14:00:16+01:00,1,User B,Brno


In [157]:
df_fridays

# Calculate the total spending for each order
df_fridays['total_spending'] = df_fridays['price']

# Group by user_id, order_id, and sum the total spending
total_spending_per_user_order_friday = df_fridays.groupby(['user_id', 'order_id'])['total_spending'].sum().reset_index()

# Group by user_id and sum the total spending
total_spending_per_user_friday = total_spending_per_user_order_friday.groupby('user_id')['total_spending'].sum().reset_index()

# Display the resulting DataFrames with the total spending for each user and each order on Friday
print("Total Spending Per User and Order on Friday:")
print(total_spending_per_user_order_friday)

print("\nTotal Spending Per User on Friday:")
print(total_spending_per_user_friday)

Total Spending Per User and Order on Friday:
       user_id  order_id  total_spending
0            0         8             730
1            0        15             120
2            0        92             600
3            0       175             140
4            0       177             720
...        ...       ...             ...
14386        7     99643            1020
14387        7     99659             780
14388        7     99722             280
14389        7     99873             120
14390        7     99887             890

[14391 rows x 3 columns]

Total Spending Per User on Friday:
   user_id  total_spending
0        0          814700
1        1          862980
2        2          835880
3        3          831690
4        4          869810
5        5          855470
6        6          825350
7        7          825720


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fridays['total_spending'] = df_fridays['price']


In [158]:
total_spending_per_user_friday.sort_values(by='total_spending', ascending=False, inplace=True)
total_spending_per_user_friday

Unnamed: 0,user_id,total_spending
4,4,869810
1,1,862980
5,5,855470
2,2,835880
3,3,831690
7,7,825720
6,6,825350
0,0,814700


In [159]:

# Group by user and calculate the total spending for each user
user_total_spending = df_fridays.groupby(['user_id', 'user_name', 'user_city'])['total_spending'].sum().reset_index()

# Display the result
user_total_spending.sort_values(by='total_spending', ascending=False, inplace=True)
user_total_spending.rename(columns={'total_spending': 'total_spending_on_friday'}, inplace=True)
user_total_spending.head(10)


Unnamed: 0,user_id,user_name,user_city,total_spending_on_friday
4,4,User E,Melbourne,869810
1,1,User B,Brno,862980
5,5,User F,Jakarta,855470
2,2,User C,Singapore,835880
3,3,User D,Sydney,831690
7,7,User H,Kuala Lumpur,825720
6,6,User G,Hong Kong,825350
0,0,User A,Prague,814700


## Most money spent in all Fridays was by user E with id = 4, out of city Melbourne when purchasing multiple products in total cumulative price of 869810 (in corresponding currency)

In [165]:
grouped_data = df_result.groupby(['user_id', 'user_city', 'product_id'])

In [166]:
df_grouped = grouped_data.size().reset_index(name='quantity')

In [167]:
sorted_data = df_grouped.sort_values(['user_city', 'quantity'], ascending=[True, False])


In [170]:
result_data = sorted_data.groupby('user_city').head(3)

## Best 3 products for each city based on quantity with specification of product_name and product_price

In [171]:
print(result_data)

     user_id     user_city  product_id  quantity
27         1          Brno          11      4438
30         1          Brno          14      4413
29         1          Brno          13      4386
104        6     Hong Kong           8      4415
111        6     Hong Kong          15      4397
110        6     Hong Kong          14      4369
83         5       Jakarta           3      4328
89         5       Jakarta           9      4309
82         5       Jakarta           2      4305
113        7  Kuala Lumpur           1      4446
123        7  Kuala Lumpur          11      4338
121        7  Kuala Lumpur           9      4336
73         4     Melbourne           9      4438
67         4     Melbourne           3      4400
72         4     Melbourne           8      4391
2          0        Prague           2      4400
11         0        Prague          11      4388
6          0        Prague           6      4362
47         2     Singapore          15      4479
45         2     Sin