# Data analysis via pandas

## Loading data in jupyter notebook

In [1]:
# First I'll import libraries I'm going to use
import pandas as pd
import json

In [2]:
data = []
with open ('dataset/data.ndjson', "r") as file:
    for line in file:
        json_data = json.loads(line)
        data.append(json_data)    

In [3]:
df=pd.DataFrame(data)

In [4]:
# Control data
df.head()

Unnamed: 0,id,created,products,user
0,0,1542328144,"[{'id': 0, 'name': 'Product A', 'price': 160},...","{'id': 3, 'name': 'User D', 'city': 'Sydney'}"
1,1,1544115833,"[{'id': 3, 'name': 'Product D', 'price': 130},...","{'id': 0, 'name': 'User A', 'city': 'Prague'}"
2,2,1540143218,"[{'id': 6, 'name': 'Product G', 'price': 100},...","{'id': 0, 'name': 'User A', 'city': 'Prague'}"
3,3,1542051492,"[{'id': 6, 'name': 'Product G', 'price': 100},...","{'id': 2, 'name': 'User C', 'city': 'Singapore'}"
4,4,1543571130,"[{'id': 10, 'name': 'Product K', 'price': 60},...","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}"


In [5]:
pd.set_option('display.max_colwidth', 1)

In [6]:
# In the data I can see I have a nested structure inside. Also, column created needs to be converted and adjusted. I continue with simple exploratory analysis
df

Unnamed: 0,id,created,products,user
0,0,1542328144,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 3, 'name': 'User D', 'city': 'Sydney'}"
1,1,1544115833,"[{'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 13, 'name': 'Product N', 'price': 30}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}"
2,2,1540143218,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 7, 'name': 'Product H', 'price': 90}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}"
3,3,1542051492,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 7, 'name': 'Product H', 'price': 90}, {'id': 14, 'name': 'Product O', 'price': 20}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 0, 'name': 'Product A', 'price': 160}, {'id': 5, 'name': 'Product F', 'price': 110}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 10, 'name': 'Product K', 'price': 60}]","{'id': 2, 'name': 'User C', 'city': 'Singapore'}"
4,4,1543571130,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}"
...,...,...,...,...
99995,99995,1537263824,"[{'id': 2, 'name': 'Product C', 'price': 140}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}"
99996,99996,1539411617,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 6, 'name': 'Product G', 'price': 100}]","{'id': 4, 'name': 'User E', 'city': 'Melbourne'}"
99997,99997,1539889882,"[{'id': 15, 'name': 'Product P', 'price': 10}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 1, 'name': 'Product B', 'price': 150}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 4, 'name': 'Product E', 'price': 120}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}"
99998,99998,1539265746,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}"


In [7]:
df.columns

Index(['id', 'created', 'products', 'user'], dtype='object')

In [8]:
df.dtypes

id          int64 
created     int64 
products    object
user        object
dtype: object

In [9]:
# Check for missing data
print(df.isnull().sum())

id          0
created     0
products    0
user        0
dtype: int64


<p> Right now, this is enough. I'll execute preprocesseing and then I continue exploring the data.</p>

## Convert column "created" to daytime and determine if day is friday
<b> In this case, I decided to convert column created into datetime format.

In [11]:
df['created'] = pd.to_datetime(df['created'], unit='s', utc=True)
df['created'] = df['created'].dt.tz_convert('Europe/Prague')

<b>Add column for friday

In [13]:
df['is_friday'] = df['created'].dt.dayofweek == 4
friday_purchase = df[df['is_friday']]
print(friday_purchase)

          id                   created  \
0      0     2018-11-16 01:29:04+01:00   
4      4     2018-11-30 10:45:30+01:00   
8      8     2018-09-21 02:11:06+02:00   
15     15    2018-11-16 14:09:34+01:00   
17     17    2018-10-19 15:33:17+02:00   
...    ..                          ...   
99959  99959 2018-11-09 21:10:09+01:00   
99961  99961 2018-12-07 22:35:11+01:00   
99970  99970 2018-11-23 14:03:01+01:00   
99979  99979 2018-11-16 08:34:59+01:00   
99986  99986 2018-11-09 14:00:16+01:00   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                          products  \
0      [{'id': 0, 'name':

In [14]:
# Control if original df is updated
df

Unnamed: 0,id,created,products,user,is_friday
0,0,2018-11-16 01:29:04+01:00,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True
1,1,2018-12-06 18:03:53+01:00,"[{'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 13, 'name': 'Product N', 'price': 30}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False
2,2,2018-10-21 19:33:38+02:00,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 7, 'name': 'Product H', 'price': 90}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False
3,3,2018-11-12 20:38:12+01:00,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 7, 'name': 'Product H', 'price': 90}, {'id': 14, 'name': 'Product O', 'price': 20}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 0, 'name': 'Product A', 'price': 160}, {'id': 5, 'name': 'Product F', 'price': 110}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 10, 'name': 'Product K', 'price': 60}]","{'id': 2, 'name': 'User C', 'city': 'Singapore'}",False
4,4,2018-11-30 10:45:30+01:00,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",True
...,...,...,...,...,...
99995,99995,2018-09-18 11:43:44+02:00,"[{'id': 2, 'name': 'Product C', 'price': 140}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False
99996,99996,2018-10-13 08:20:17+02:00,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 6, 'name': 'Product G', 'price': 100}]","{'id': 4, 'name': 'User E', 'city': 'Melbourne'}",False
99997,99997,2018-10-18 21:11:22+02:00,"[{'id': 15, 'name': 'Product P', 'price': 10}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 1, 'name': 'Product B', 'price': 150}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 4, 'name': 'Product E', 'price': 120}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",False
99998,99998,2018-10-11 15:49:06+02:00,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",False


## Extract data from nested structure into separate columns

Nested structure of columns products and user will be extracted into separated columns in order to easily access the information and continue the analysis

In [15]:
# Mapping the products
df['product_id'] = df['products'].apply(lambda x: [product['id'] for product in x])
df['product_name'] = df['products'].apply(lambda x: [product['name'] for product in x])
df['product_price'] = df['products'].apply(lambda x: [product['price'] for product in x])

In [16]:
df

Unnamed: 0,id,created,products,user,is_friday,product_id,product_name,product_price
0,0,2018-11-16 01:29:04+01:00,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,"[0, 8]","[Product A, Product I]","[160, 80]"
1,1,2018-12-06 18:03:53+01:00,"[{'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 13, 'name': 'Product N', 'price': 30}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[3, 2, 13]","[Product D, Product C, Product N]","[130, 140, 30]"
2,2,2018-10-21 19:33:38+02:00,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 7, 'name': 'Product H', 'price': 90}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[6, 7]","[Product G, Product H]","[100, 90]"
3,3,2018-11-12 20:38:12+01:00,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 7, 'name': 'Product H', 'price': 90}, {'id': 14, 'name': 'Product O', 'price': 20}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 0, 'name': 'Product A', 'price': 160}, {'id': 5, 'name': 'Product F', 'price': 110}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 10, 'name': 'Product K', 'price': 60}]","{'id': 2, 'name': 'User C', 'city': 'Singapore'}",False,"[6, 13, 7, 14, 8, 0, 5, 13, 10]","[Product G, Product N, Product H, Product O, Product I, Product A, Product F, Product N, Product K]","[100, 30, 90, 20, 80, 160, 110, 30, 60]"
4,4,2018-11-30 10:45:30+01:00,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",True,"[10, 3, 2, 12]","[Product K, Product D, Product C, Product M]","[60, 130, 140, 40]"
...,...,...,...,...,...,...,...,...
99995,99995,2018-09-18 11:43:44+02:00,"[{'id': 2, 'name': 'Product C', 'price': 140}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[2, 2, 10, 8]","[Product C, Product C, Product K, Product I]","[140, 140, 60, 80]"
99996,99996,2018-10-13 08:20:17+02:00,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 6, 'name': 'Product G', 'price': 100}]","{'id': 4, 'name': 'User E', 'city': 'Melbourne'}",False,"[10, 8, 6]","[Product K, Product I, Product G]","[60, 80, 100]"
99997,99997,2018-10-18 21:11:22+02:00,"[{'id': 15, 'name': 'Product P', 'price': 10}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 1, 'name': 'Product B', 'price': 150}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 4, 'name': 'Product E', 'price': 120}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",False,"[15, 10, 4, 1, 2, 4, 8, 4]","[Product P, Product K, Product E, Product B, Product C, Product E, Product I, Product E]","[10, 60, 120, 150, 140, 120, 80, 120]"
99998,99998,2018-10-11 15:49:06+02:00,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",False,"[0, 12]","[Product A, Product M]","[160, 40]"


In [17]:
# Repeat same for user
df['user_id'] = df['user'].apply(lambda x: x['id'])
df['user_name'] = df['user'].apply(lambda x: x['name'])
df['user_city'] = df['user'].apply(lambda x: x['city'])

In [18]:
df

Unnamed: 0,id,created,products,user,is_friday,product_id,product_name,product_price,user_id,user_name,user_city
0,0,2018-11-16 01:29:04+01:00,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,"[0, 8]","[Product A, Product I]","[160, 80]",3,User D,Sydney
1,1,2018-12-06 18:03:53+01:00,"[{'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 13, 'name': 'Product N', 'price': 30}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[3, 2, 13]","[Product D, Product C, Product N]","[130, 140, 30]",0,User A,Prague
2,2,2018-10-21 19:33:38+02:00,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 7, 'name': 'Product H', 'price': 90}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[6, 7]","[Product G, Product H]","[100, 90]",0,User A,Prague
3,3,2018-11-12 20:38:12+01:00,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 7, 'name': 'Product H', 'price': 90}, {'id': 14, 'name': 'Product O', 'price': 20}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 0, 'name': 'Product A', 'price': 160}, {'id': 5, 'name': 'Product F', 'price': 110}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 10, 'name': 'Product K', 'price': 60}]","{'id': 2, 'name': 'User C', 'city': 'Singapore'}",False,"[6, 13, 7, 14, 8, 0, 5, 13, 10]","[Product G, Product N, Product H, Product O, Product I, Product A, Product F, Product N, Product K]","[100, 30, 90, 20, 80, 160, 110, 30, 60]",2,User C,Singapore
4,4,2018-11-30 10:45:30+01:00,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",True,"[10, 3, 2, 12]","[Product K, Product D, Product C, Product M]","[60, 130, 140, 40]",5,User F,Jakarta
...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,2018-09-18 11:43:44+02:00,"[{'id': 2, 'name': 'Product C', 'price': 140}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[2, 2, 10, 8]","[Product C, Product C, Product K, Product I]","[140, 140, 60, 80]",0,User A,Prague
99996,99996,2018-10-13 08:20:17+02:00,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 6, 'name': 'Product G', 'price': 100}]","{'id': 4, 'name': 'User E', 'city': 'Melbourne'}",False,"[10, 8, 6]","[Product K, Product I, Product G]","[60, 80, 100]",4,User E,Melbourne
99997,99997,2018-10-18 21:11:22+02:00,"[{'id': 15, 'name': 'Product P', 'price': 10}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 1, 'name': 'Product B', 'price': 150}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 4, 'name': 'Product E', 'price': 120}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",False,"[15, 10, 4, 1, 2, 4, 8, 4]","[Product P, Product K, Product E, Product B, Product C, Product E, Product I, Product E]","[10, 60, 120, 150, 140, 120, 80, 120]",5,User F,Jakarta
99998,99998,2018-10-11 15:49:06+02:00,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",False,"[0, 12]","[Product A, Product M]","[160, 40]",5,User F,Jakarta


In [19]:
# Creating new column and assigning the row_id to handle analysis. Side note: I already have id but I like to create new range to have the original intact data in case there was an issue during loading the data.
df['row_id'] = range(len(df))

In [20]:
df

Unnamed: 0,id,created,products,user,is_friday,product_id,product_name,product_price,user_id,user_name,user_city,row_id
0,0,2018-11-16 01:29:04+01:00,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,"[0, 8]","[Product A, Product I]","[160, 80]",3,User D,Sydney,0
1,1,2018-12-06 18:03:53+01:00,"[{'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 13, 'name': 'Product N', 'price': 30}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[3, 2, 13]","[Product D, Product C, Product N]","[130, 140, 30]",0,User A,Prague,1
2,2,2018-10-21 19:33:38+02:00,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 7, 'name': 'Product H', 'price': 90}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[6, 7]","[Product G, Product H]","[100, 90]",0,User A,Prague,2
3,3,2018-11-12 20:38:12+01:00,"[{'id': 6, 'name': 'Product G', 'price': 100}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 7, 'name': 'Product H', 'price': 90}, {'id': 14, 'name': 'Product O', 'price': 20}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 0, 'name': 'Product A', 'price': 160}, {'id': 5, 'name': 'Product F', 'price': 110}, {'id': 13, 'name': 'Product N', 'price': 30}, {'id': 10, 'name': 'Product K', 'price': 60}]","{'id': 2, 'name': 'User C', 'city': 'Singapore'}",False,"[6, 13, 7, 14, 8, 0, 5, 13, 10]","[Product G, Product N, Product H, Product O, Product I, Product A, Product F, Product N, Product K]","[100, 30, 90, 20, 80, 160, 110, 30, 60]",2,User C,Singapore,3
4,4,2018-11-30 10:45:30+01:00,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 3, 'name': 'Product D', 'price': 130}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",True,"[10, 3, 2, 12]","[Product K, Product D, Product C, Product M]","[60, 130, 140, 40]",5,User F,Jakarta,4
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,2018-09-18 11:43:44+02:00,"[{'id': 2, 'name': 'Product C', 'price': 140}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}]","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[2, 2, 10, 8]","[Product C, Product C, Product K, Product I]","[140, 140, 60, 80]",0,User A,Prague,99995
99996,99996,2018-10-13 08:20:17+02:00,"[{'id': 10, 'name': 'Product K', 'price': 60}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 6, 'name': 'Product G', 'price': 100}]","{'id': 4, 'name': 'User E', 'city': 'Melbourne'}",False,"[10, 8, 6]","[Product K, Product I, Product G]","[60, 80, 100]",4,User E,Melbourne,99996
99997,99997,2018-10-18 21:11:22+02:00,"[{'id': 15, 'name': 'Product P', 'price': 10}, {'id': 10, 'name': 'Product K', 'price': 60}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 1, 'name': 'Product B', 'price': 150}, {'id': 2, 'name': 'Product C', 'price': 140}, {'id': 4, 'name': 'Product E', 'price': 120}, {'id': 8, 'name': 'Product I', 'price': 80}, {'id': 4, 'name': 'Product E', 'price': 120}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",False,"[15, 10, 4, 1, 2, 4, 8, 4]","[Product P, Product K, Product E, Product B, Product C, Product E, Product I, Product E]","[10, 60, 120, 150, 140, 120, 80, 120]",5,User F,Jakarta,99997
99998,99998,2018-10-11 15:49:06+02:00,"[{'id': 0, 'name': 'Product A', 'price': 160}, {'id': 12, 'name': 'Product M', 'price': 40}]","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",False,"[0, 12]","[Product A, Product M]","[160, 40]",5,User F,Jakarta,99998


## Creating new df_expanded and exploding multiple values into new rows

In [21]:
df['product_counts'] = df['products'].apply(lambda x: pd.Series([item['id'] for item in x]).value_counts().to_dict())

# Create a list to hold the exploded rows
exploded_rows = []

# Iterate over each row and explode based on product counts
for _, row in df.iterrows():
    order_id = _  # Use the index as order_id
    product_counts = row['product_counts']

    for product_id, quantity in product_counts.items():
        exploded_rows.append({'order_id': order_id, 'product_id': product_id, 'quantity': quantity})

exploded_df = pd.DataFrame(exploded_rows)

# Merge with the original DataFrame
df_expanded = exploded_df.merge(df.explode('products'), left_on='order_id', right_index=True)

# Drop unnecessary columns
df_expanded = df_expanded.drop(columns=['products', 'product_counts'])

# Print the result
print(df_expanded)

        order_id  product_id_x  quantity     id                   created  \
0       0         0             1         0     2018-11-16 01:29:04+01:00   
0       0         0             1         0     2018-11-16 01:29:04+01:00   
1       0         8             1         0     2018-11-16 01:29:04+01:00   
1       0         8             1         0     2018-11-16 01:29:04+01:00   
2       1         3             1         1     2018-12-06 18:03:53+01:00   
...    ..        ..            ..        ..                           ...   
458075  99999     2             1         99999 2018-10-18 23:03:30+02:00   
458075  99999     2             1         99999 2018-10-18 23:03:30+02:00   
458075  99999     2             1         99999 2018-10-18 23:03:30+02:00   
458075  99999     2             1         99999 2018-10-18 23:03:30+02:00   
458075  99999     2             1         99999 2018-10-18 23:03:30+02:00   

                                                       user  is_friday  \
0

In [22]:
df_expanded

Unnamed: 0,order_id,product_id_x,quantity,id,created,user,is_friday,product_id_y,product_name,product_price,user_id,user_name,user_city,row_id
0,0,0,1,0,2018-11-16 01:29:04+01:00,"{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,"[0, 8]","[Product A, Product I]","[160, 80]",3,User D,Sydney,0
0,0,0,1,0,2018-11-16 01:29:04+01:00,"{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,"[0, 8]","[Product A, Product I]","[160, 80]",3,User D,Sydney,0
1,0,8,1,0,2018-11-16 01:29:04+01:00,"{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,"[0, 8]","[Product A, Product I]","[160, 80]",3,User D,Sydney,0
1,0,8,1,0,2018-11-16 01:29:04+01:00,"{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,"[0, 8]","[Product A, Product I]","[160, 80]",3,User D,Sydney,0
2,1,3,1,1,2018-12-06 18:03:53+01:00,"{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,"[3, 2, 13]","[Product D, Product C, Product N]","[130, 140, 30]",0,User A,Prague,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458075,99999,2,1,99999,2018-10-18 23:03:30+02:00,"{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,"[7, 5, 3, 12, 4, 0, 2]","[Product H, Product F, Product D, Product M, Product E, Product A, Product C]","[90, 110, 130, 40, 120, 160, 140]",7,User H,Kuala Lumpur,99999
458075,99999,2,1,99999,2018-10-18 23:03:30+02:00,"{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,"[7, 5, 3, 12, 4, 0, 2]","[Product H, Product F, Product D, Product M, Product E, Product A, Product C]","[90, 110, 130, 40, 120, 160, 140]",7,User H,Kuala Lumpur,99999
458075,99999,2,1,99999,2018-10-18 23:03:30+02:00,"{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,"[7, 5, 3, 12, 4, 0, 2]","[Product H, Product F, Product D, Product M, Product E, Product A, Product C]","[90, 110, 130, 40, 120, 160, 140]",7,User H,Kuala Lumpur,99999
458075,99999,2,1,99999,2018-10-18 23:03:30+02:00,"{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,"[7, 5, 3, 12, 4, 0, 2]","[Product H, Product F, Product D, Product M, Product E, Product A, Product C]","[90, 110, 130, 40, 120, 160, 140]",7,User H,Kuala Lumpur,99999


In [23]:
df_expanded = df.explode('products')

In [24]:
df_expanded['product_id'] = df_expanded['products'].apply(lambda x: x['id'])
df_expanded['product_name'] = df_expanded['products'].apply(lambda x: x['name'])
df_expanded['product_price'] = df_expanded['products'].apply(lambda x: x['price'])
df_expanded['quantity'] = df_expanded.apply(lambda row: row['product_id'], axis=1).map(df['product_counts'])

In [25]:
df_expanded

Unnamed: 0,id,created,products,user,is_friday,product_id,product_name,product_price,user_id,user_name,user_city,row_id,product_counts,quantity
0,0,2018-11-16 01:29:04+01:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,0,Product A,160,3,User D,Sydney,0,"{0: 1, 8: 1}","{0: 1, 8: 1}"
0,0,2018-11-16 01:29:04+01:00,"{'id': 8, 'name': 'Product I', 'price': 80}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,8,Product I,80,3,User D,Sydney,0,"{0: 1, 8: 1}","{7: 3, 5: 1, 14: 1, 12: 1, 8: 1, 10: 1, 1: 1}"
1,1,2018-12-06 18:03:53+01:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,3,Product D,130,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}","{13: 2, 6: 1, 7: 1, 14: 1, 8: 1, 0: 1, 5: 1, 10: 1}"
1,1,2018-12-06 18:03:53+01:00,"{'id': 2, 'name': 'Product C', 'price': 140}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,2,Product C,140,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}","{6: 1, 7: 1}"
1,1,2018-12-06 18:03:53+01:00,"{'id': 13, 'name': 'Product N', 'price': 30}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,13,Product N,30,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}","{3: 1, 9: 1, 13: 1, 5: 1, 14: 1, 0: 1}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,3,Product D,130,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}","{13: 2, 6: 1, 7: 1, 14: 1, 8: 1, 0: 1, 5: 1, 10: 1}"
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 12, 'name': 'Product M', 'price': 40}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,12,Product M,40,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}","{12: 2, 2: 1, 14: 1, 8: 1, 0: 1, 3: 1}"
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 4, 'name': 'Product E', 'price': 120}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,4,Product E,120,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}","{10: 1, 3: 1, 2: 1, 12: 1}"
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,0,Product A,160,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}","{0: 1, 8: 1}"


In [26]:
df_expanded['quantity'] = df_expanded.groupby('id')['id'].transform('count')


In [27]:
df_expanded.head(20)

Unnamed: 0,id,created,products,user,is_friday,product_id,product_name,product_price,user_id,user_name,user_city,row_id,product_counts,quantity
0,0,2018-11-16 01:29:04+01:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,0,Product A,160,3,User D,Sydney,0,"{0: 1, 8: 1}",2
0,0,2018-11-16 01:29:04+01:00,"{'id': 8, 'name': 'Product I', 'price': 80}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,8,Product I,80,3,User D,Sydney,0,"{0: 1, 8: 1}",2
1,1,2018-12-06 18:03:53+01:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,3,Product D,130,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
1,1,2018-12-06 18:03:53+01:00,"{'id': 2, 'name': 'Product C', 'price': 140}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,2,Product C,140,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
1,1,2018-12-06 18:03:53+01:00,"{'id': 13, 'name': 'Product N', 'price': 30}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,13,Product N,30,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
2,2,2018-10-21 19:33:38+02:00,"{'id': 6, 'name': 'Product G', 'price': 100}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,6,Product G,100,0,User A,Prague,2,"{6: 1, 7: 1}",2
2,2,2018-10-21 19:33:38+02:00,"{'id': 7, 'name': 'Product H', 'price': 90}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,7,Product H,90,0,User A,Prague,2,"{6: 1, 7: 1}",2
3,3,2018-11-12 20:38:12+01:00,"{'id': 6, 'name': 'Product G', 'price': 100}","{'id': 2, 'name': 'User C', 'city': 'Singapore'}",False,6,Product G,100,2,User C,Singapore,3,"{13: 2, 6: 1, 7: 1, 14: 1, 8: 1, 0: 1, 5: 1, 10: 1}",9
3,3,2018-11-12 20:38:12+01:00,"{'id': 13, 'name': 'Product N', 'price': 30}","{'id': 2, 'name': 'User C', 'city': 'Singapore'}",False,13,Product N,30,2,User C,Singapore,3,"{13: 2, 6: 1, 7: 1, 14: 1, 8: 1, 0: 1, 5: 1, 10: 1}",9
3,3,2018-11-12 20:38:12+01:00,"{'id': 7, 'name': 'Product H', 'price': 90}","{'id': 2, 'name': 'User C', 'city': 'Singapore'}",False,7,Product H,90,2,User C,Singapore,3,"{13: 2, 6: 1, 7: 1, 14: 1, 8: 1, 0: 1, 5: 1, 10: 1}",9


## Check exploded data

In [28]:
# Check missing data
(df_expanded.isnull().sum())


id                0
created           0
products          0
user              0
is_friday         0
product_id        0
product_name      0
product_price     0
user_id           0
user_name         0
user_city         0
row_id            0
product_counts    0
quantity          0
dtype: int64

In [29]:
# Unique values in a column product_name
print(df_expanded['product_name'].unique())

# Value counts product_name
print(df_expanded['product_name'].value_counts())

['Product A' 'Product I' 'Product D' 'Product C' 'Product N' 'Product G'
 'Product H' 'Product O' 'Product F' 'Product K' 'Product M' 'Product B'
 'Product L' 'Product J' 'Product E' 'Product P']
product_name
Product J    34576
Product L    34567
Product I    34455
Product M    34402
Product B    34392
Product C    34372
Product D    34366
Product O    34346
Product K    34345
Product E    34316
Product P    34288
Product A    34265
Product N    34222
Product G    34152
Product F    34054
Product H    34027
Name: count, dtype: int64


In [30]:
total_rows = df_expanded.shape[0]
print("Total Rows:", total_rows)

Total Rows: 549145


In [31]:
# Unique values in a column product_name
print(df_expanded['user_city'].unique())
print(df_expanded['user_name'].unique())


['Sydney' 'Prague' 'Singapore' 'Jakarta' 'Melbourne' 'Kuala Lumpur'
 'Hong Kong' 'Brno']
['User D' 'User A' 'User C' 'User F' 'User E' 'User H' 'User G' 'User B']


## Question 1: Spending users on all fridays

Answering the question "Which user spent the most money on products on all Fridays?"

In [32]:
# Selecting only fridays from adjusted df
only_friday = df_expanded[df_expanded['is_friday']]

In [33]:
only_friday.head(20)

Unnamed: 0,id,created,products,user,is_friday,product_id,product_name,product_price,user_id,user_name,user_city,row_id,product_counts,quantity
0,0,2018-11-16 01:29:04+01:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,0,Product A,160,3,User D,Sydney,0,"{0: 1, 8: 1}",2
0,0,2018-11-16 01:29:04+01:00,"{'id': 8, 'name': 'Product I', 'price': 80}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,8,Product I,80,3,User D,Sydney,0,"{0: 1, 8: 1}",2
4,4,2018-11-30 10:45:30+01:00,"{'id': 10, 'name': 'Product K', 'price': 60}","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",True,10,Product K,60,5,User F,Jakarta,4,"{10: 1, 3: 1, 2: 1, 12: 1}",4
4,4,2018-11-30 10:45:30+01:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",True,3,Product D,130,5,User F,Jakarta,4,"{10: 1, 3: 1, 2: 1, 12: 1}",4
4,4,2018-11-30 10:45:30+01:00,"{'id': 2, 'name': 'Product C', 'price': 140}","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",True,2,Product C,140,5,User F,Jakarta,4,"{10: 1, 3: 1, 2: 1, 12: 1}",4
4,4,2018-11-30 10:45:30+01:00,"{'id': 12, 'name': 'Product M', 'price': 40}","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}",True,12,Product M,40,5,User F,Jakarta,4,"{10: 1, 3: 1, 2: 1, 12: 1}",4
8,8,2018-09-21 02:11:06+02:00,"{'id': 7, 'name': 'Product H', 'price': 90}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",True,7,Product H,90,0,User A,Prague,8,"{7: 3, 5: 1, 14: 1, 12: 1, 8: 1, 10: 1, 1: 1}",9
8,8,2018-09-21 02:11:06+02:00,"{'id': 5, 'name': 'Product F', 'price': 110}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",True,5,Product F,110,0,User A,Prague,8,"{7: 3, 5: 1, 14: 1, 12: 1, 8: 1, 10: 1, 1: 1}",9
8,8,2018-09-21 02:11:06+02:00,"{'id': 14, 'name': 'Product O', 'price': 20}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",True,14,Product O,20,0,User A,Prague,8,"{7: 3, 5: 1, 14: 1, 12: 1, 8: 1, 10: 1, 1: 1}",9
8,8,2018-09-21 02:11:06+02:00,"{'id': 12, 'name': 'Product M', 'price': 40}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",True,12,Product M,40,0,User A,Prague,8,"{7: 3, 5: 1, 14: 1, 12: 1, 8: 1, 10: 1, 1: 1}",9


In [34]:
spending_row_id = df_expanded[df_expanded['is_friday'] & (df_expanded['row_id'] == 15 )]
row_id = spending_row_id.groupby('user_id')['product_price'].sum()
row_id

user_id
0    120
Name: product_price, dtype: int64

In [35]:
# Grouping by row_id and summing product_prices
total_spending_row = only_friday.groupby('user_id')['product_price'].sum()

In [36]:
total_spending_row.sort_values(ascending=False)

user_id
4    869810
1    862980
5    855470
2    835880
3    831690
7    825720
6    825350
0    814700
Name: product_price, dtype: int64

## Most money spent in all Fridays was by user E with id = 4, out of city Melbourne when purchasing multiple products in total cumulative price of 869810 (in corresponding currency)

## Question 2: best 3 products in each location of a user based on quantity

In [38]:
df_expanded

Unnamed: 0,id,created,products,user,is_friday,product_id,product_name,product_price,user_id,user_name,user_city,row_id,product_counts,quantity
0,0,2018-11-16 01:29:04+01:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,0,Product A,160,3,User D,Sydney,0,"{0: 1, 8: 1}",2
0,0,2018-11-16 01:29:04+01:00,"{'id': 8, 'name': 'Product I', 'price': 80}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,8,Product I,80,3,User D,Sydney,0,"{0: 1, 8: 1}",2
1,1,2018-12-06 18:03:53+01:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,3,Product D,130,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
1,1,2018-12-06 18:03:53+01:00,"{'id': 2, 'name': 'Product C', 'price': 140}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,2,Product C,140,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
1,1,2018-12-06 18:03:53+01:00,"{'id': 13, 'name': 'Product N', 'price': 30}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,13,Product N,30,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,3,Product D,130,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}",7
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 12, 'name': 'Product M', 'price': 40}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,12,Product M,40,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}",7
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 4, 'name': 'Product E', 'price': 120}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,4,Product E,120,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}",7
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,0,Product A,160,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}",7


In [39]:
# Expand for user
df_expanded['user_id'] = df_expanded['user'].apply(lambda x: x['id'])
df_expanded['user_name'] = df_expanded['user'].apply(lambda x: x['name'])
df_expanded['user_city'] = df_expanded['user'].apply(lambda x: x['city'])

In [40]:
df_expanded

Unnamed: 0,id,created,products,user,is_friday,product_id,product_name,product_price,user_id,user_name,user_city,row_id,product_counts,quantity
0,0,2018-11-16 01:29:04+01:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,0,Product A,160,3,User D,Sydney,0,"{0: 1, 8: 1}",2
0,0,2018-11-16 01:29:04+01:00,"{'id': 8, 'name': 'Product I', 'price': 80}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",True,8,Product I,80,3,User D,Sydney,0,"{0: 1, 8: 1}",2
1,1,2018-12-06 18:03:53+01:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,3,Product D,130,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
1,1,2018-12-06 18:03:53+01:00,"{'id': 2, 'name': 'Product C', 'price': 140}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,2,Product C,140,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
1,1,2018-12-06 18:03:53+01:00,"{'id': 13, 'name': 'Product N', 'price': 30}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",False,13,Product N,30,0,User A,Prague,1,"{3: 1, 2: 1, 13: 1}",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,3,Product D,130,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}",7
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 12, 'name': 'Product M', 'price': 40}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,12,Product M,40,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}",7
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 4, 'name': 'Product E', 'price': 120}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,4,Product E,120,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}",7
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",False,0,Product A,160,7,User H,Kuala Lumpur,99999,"{7: 1, 5: 1, 3: 1, 12: 1, 4: 1, 0: 1, 2: 1}",7


In [236]:
# Drop not important columns and adjust order if neccessary
columns_to_drop = ['is_friday', 'product_price_total']
df_expanded = df_expanded.drop(columns=columns_to_drop)

KeyError: "['is_friday', 'product_price_total'] not found in axis"

In [41]:
column_order = ['id', 'created', 'products', 'user', 'product_id', 'product_name', 'product_price', 'user_id', 'user_name', 'user_city', 'row_id' ]
df_expanded = df_expanded[column_order]

In [42]:
df_expanded

Unnamed: 0,id,created,products,user,product_id,product_name,product_price,user_id,user_name,user_city,row_id
0,0,2018-11-16 01:29:04+01:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",0,Product A,160,3,User D,Sydney,0
0,0,2018-11-16 01:29:04+01:00,"{'id': 8, 'name': 'Product I', 'price': 80}","{'id': 3, 'name': 'User D', 'city': 'Sydney'}",8,Product I,80,3,User D,Sydney,0
1,1,2018-12-06 18:03:53+01:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",3,Product D,130,0,User A,Prague,1
1,1,2018-12-06 18:03:53+01:00,"{'id': 2, 'name': 'Product C', 'price': 140}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",2,Product C,140,0,User A,Prague,1
1,1,2018-12-06 18:03:53+01:00,"{'id': 13, 'name': 'Product N', 'price': 30}","{'id': 0, 'name': 'User A', 'city': 'Prague'}",13,Product N,30,0,User A,Prague,1
...,...,...,...,...,...,...,...,...,...,...,...
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 3, 'name': 'Product D', 'price': 130}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",3,Product D,130,7,User H,Kuala Lumpur,99999
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 12, 'name': 'Product M', 'price': 40}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",12,Product M,40,7,User H,Kuala Lumpur,99999
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 4, 'name': 'Product E', 'price': 120}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",4,Product E,120,7,User H,Kuala Lumpur,99999
99999,99999,2018-10-18 23:03:30+02:00,"{'id': 0, 'name': 'Product A', 'price': 160}","{'id': 7, 'name': 'User H', 'city': 'Kuala Lumpur'}",0,Product A,160,7,User H,Kuala Lumpur,99999


In [43]:
# Group data
grouped_data = df_expanded.groupby(['user_id', 'user_city', 'product_id'])


In [44]:
# Determine size and add column quantity
df_grouped = grouped_data.size().reset_index(name='quantity')

In [45]:
sorted_data = df_grouped.sort_values(['user_city', 'quantity'], ascending=[True, False])


In [46]:
sorted_data.groupby('user_city').head(3)

Unnamed: 0,user_id,user_city,product_id,quantity
27,1,Brno,11,4438
30,1,Brno,14,4413
29,1,Brno,13,4386
104,6,Hong Kong,8,4415
111,6,Hong Kong,15,4397
110,6,Hong Kong,14,4369
83,5,Jakarta,3,4328
89,5,Jakarta,9,4309
82,5,Jakarta,2,4305
113,7,Kuala Lumpur,1,4446


In [47]:
# Determine top product per city
top_products_per_city = sorted_data.groupby('user_city').head(3)

In [48]:
top_products_per_city.reset_index(drop=True, inplace=True)
top_products_per_city

Unnamed: 0,user_id,user_city,product_id,quantity
0,1,Brno,11,4438
1,1,Brno,14,4413
2,1,Brno,13,4386
3,6,Hong Kong,8,4415
4,6,Hong Kong,15,4397
5,6,Hong Kong,14,4369
6,5,Jakarta,3,4328
7,5,Jakarta,9,4309
8,5,Jakarta,2,4305
9,7,Kuala Lumpur,1,4446


In [49]:
# Merge data from original df to exctract data information about product
result_data = pd.merge(top_products_per_city[['user_id', 'user_city', 'product_id', 'quantity']],
                       df_expanded[['user_id', 'product_id', 'product_name', 'product_price']],
                       on=['user_id', 'product_id'], how='left')

# Drop duplicate rows to keep only the 25 rows from top_products_per_city
result_data = result_data.drop_duplicates(subset=['user_id', 'product_id'])

# Display the result
result_data.reset_index(drop=True, inplace=True)


# Best 3 products for each city based on quantity with specification of product_name and product_price

In [50]:
print(result_data)

    user_id     user_city  product_id  quantity product_name  product_price
0   1        Brno          11          4438      Product L    50           
1   1        Brno          14          4413      Product O    20           
2   1        Brno          13          4386      Product N    30           
3   6        Hong Kong     8           4415      Product I    80           
4   6        Hong Kong     15          4397      Product P    10           
5   6        Hong Kong     14          4369      Product O    20           
6   5        Jakarta       3           4328      Product D    130          
7   5        Jakarta       9           4309      Product J    70           
8   5        Jakarta       2           4305      Product C    140          
9   7        Kuala Lumpur  1           4446      Product B    150          
10  7        Kuala Lumpur  11          4338      Product L    50           
11  7        Kuala Lumpur  9           4336      Product J    70           
12  4       