In [1]:
import pandas as pd

In [2]:
# Load the item catalogue (item code/name and category code/name) and preview
# the first three rows.
item_csv_path = '../../datasets/annex/item.csv'
df1 = pd.read_csv(item_csv_path)
df1.head(3)

Unnamed: 0,Item Code,Item Name,Category Code,Category Name
0,102900005115168,Niushou Shengcai,1011010101,Flower/Leaf Vegetables
1,102900005115199,Sichuan Red Cedar,1011010101,Flower/Leaf Vegetables
2,102900005115625,Local Xiaomao Cabbage,1011010101,Flower/Leaf Vegetables


In [3]:
# Summarise the item catalogue: row count, per-column dtype and non-null
# counts (a non-null count below the row count would reveal missing values).
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Item Code      251 non-null    int64 
 1   Item Name      251 non-null    object
 2   Category Code  251 non-null    int64 
 3   Category Name  251 non-null    object
dtypes: int64(2), object(2)
memory usage: 8.0+ KB


In [4]:
# Flag every item code that repeats an earlier row, then tally the flags;
# an all-False tally means the codes are unique.
item_code_is_repeat = df1['Item Code'].duplicated()
item_code_is_repeat.value_counts()

Item Code
False    251
Name: count, dtype: int64

In [5]:
# Collapse item variants onto a base name: keep only the text before the first
# '(' (a parenthesised suffix) and trim surrounding whitespace.
# The trim matters: splitting "Name (x)" on '(' yields "Name " with a trailing
# space, which would not compare equal to a plain "Name" when duplicate names
# are detected/dropped afterwards.
# The vectorised .str accessor also passes missing values through as NaN
# instead of raising like a per-element str.split would.
df1['Item Name'] = (
    df1['Item Name']
    .str.split('(', n=1)  # at most one split: everything after the first '(' is discarded
    .str[0]
    .str.strip()
)
df1.sample(3)

Unnamed: 0,Item Code,Item Name,Category Code,Category Name
71,102900011032589,Yuxingcao,1011010101,Flower/Leaf Vegetables
140,102900005117209,7 Colour Pepper,1011010504,Capsicum
55,102900011029688,Shuanggou Cabbage,1011010101,Flower/Leaf Vegetables


In [6]:
# Flag every item name that repeats an earlier row, then tally the flags;
# the True count is the number of rows a keep-first de-duplication removes.
item_name_is_repeat = df1['Item Name'].duplicated()
item_name_is_repeat.value_counts()

Item Name
False    194
True      57
Name: count, dtype: int64

In [7]:
# Keep one row per item name: the first occurrence wins, later ones are dropped.
# NOTE(review): the dropped rows carry their own item codes, so any sales
# recorded under those codes can no longer be matched to an item afterwards --
# confirm that discarding them (rather than remapping the codes) is intended.
df1 = df1.drop_duplicates(subset='Item Name', keep='first')
df1.shape

(194, 4)

In [8]:
# Count how many items fall into each category, most frequent first
# (value_counts sorts by count in descending order by default).
df1['Category Name'].value_counts()

Category Name
Flower/Leaf Vegetables         94
Edible Mushroom                41
Capsicum                       35
Aquatic Tuberous Vegetables    14
Solanum                         7
Cabbage                         3
Name: count, dtype: int64

In [9]:
# Long category labels and the short label each one is replaced with.
_CATEGORY_SHORT_NAMES = {
    'Flower/Leaf Vegetables': 'Flower',
    'Edible Mushroom': 'Mushroom',
    'Aquatic Tuberous Vegetables': 'Aquatic',
}


def rename_category(data):
    """Return the short label for a category name.

    Parameters
    ----------
    data : str
        A category name.

    Returns
    -------
    str
        'Flower', 'Mushroom' or 'Aquatic' for the three long category names
        listed in ``_CATEGORY_SHORT_NAMES``; any other value is returned
        unchanged.
    """
    return _CATEGORY_SHORT_NAMES.get(data, data)


# Shorten the long category labels element-wise, then re-tally to confirm the
# per-category counts are unchanged by the renaming.
df1['Category Name'] = df1['Category Name'].map(rename_category)
df1['Category Name'].value_counts()

Category Name
Flower      94
Mushroom    41
Capsicum    35
Aquatic     14
Solanum      7
Cabbage      3
Name: count, dtype: int64

In [10]:
# Normalise both text columns (item name and category name) to lower case.
for text_column in ('Item Name', 'Category Name'):
    df1[text_column] = df1[text_column].str.lower()
df1.sample(3)

Unnamed: 0,Item Code,Item Name,Category Code,Category Name
66,102900011030905,yuxingcao,1011010101,flower
137,102900005116905,green hangjiao,1011010504,capsicum
220,102900011032619,xixia xianggu mushroom,1011010801,mushroom


In [11]:
# Load the sales transactions (one row per sale/return line) and preview the
# first three rows.
sale_csv_path = '../../datasets/annex/sale.csv'
df2 = pd.read_csv(sale_csv_path)
df2.head(3)

Unnamed: 0,Date,Time,Item Code,Quantity Sold (kilo),Unit Selling Price (RMB/kg),Sale or Return,Discount (Yes/No)
0,2020-07-01,09:15:07.924,102900005117056,0.396,7.6,sale,No
1,2020-07-01,09:17:27.295,102900005115960,0.849,3.2,sale,No
2,2020-07-01,09:17:33.905,102900005117056,0.409,7.6,sale,No


In [12]:
# Summarise the sales table: row count, per-column dtype and non-null counts
# (a non-null count below the row count would reveal missing values).
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878503 entries, 0 to 878502
Data columns (total 7 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Date                         878503 non-null  object 
 1   Time                         878503 non-null  object 
 2   Item Code                    878503 non-null  int64  
 3   Quantity Sold (kilo)         878503 non-null  float64
 4   Unit Selling Price (RMB/kg)  878503 non-null  float64
 5   Sale or Return               878503 non-null  object 
 6   Discount (Yes/No)            878503 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 46.9+ MB


In [13]:
# Keep only the catalogue items (df1) whose item code appears at least once in
# the sales table (df2).
sold_item_codes = df2['Item Code']
filt = df1['Item Code'].isin(sold_item_codes)
df1 = df1[filt]
df1.shape

(191, 4)

In [14]:
# Keep only the sales rows (df2) whose item code is still present in the item
# catalogue (df1); sales of items missing from df1 are discarded here.
catalogue_item_codes = df1['Item Code']
filt = df2['Item Code'].isin(catalogue_item_codes)
df2 = df2[filt]
df2.shape

(737337, 7)

In [15]:
# Work on an independent copy of the filtered sales table (df2), renumbered
# 0..n-1 so the gaps left by the filtering disappear from the index.
dff = df2.copy()
dff = dff.reset_index(drop=True)
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737337 entries, 0 to 737336
Data columns (total 7 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Date                         737337 non-null  object 
 1   Time                         737337 non-null  object 
 2   Item Code                    737337 non-null  int64  
 3   Quantity Sold (kilo)         737337 non-null  float64
 4   Unit Selling Price (RMB/kg)  737337 non-null  float64
 5   Sale or Return               737337 non-null  object 
 6   Discount (Yes/No)            737337 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 39.4+ MB


In [16]:
# Attach the item name and category name from df1 to every sales row in dff,
# matching on 'Item Code'.
# This is a vectorised lookup (Series.map against a code-indexed table) rather
# than a per-row Python loop over dff x df1, which needs up to
# len(dff) * len(df1) comparisons and is impractically slow at this size.
# - drop_duplicates(keep='first') makes the first df1 row win if a code were
#   ever repeated, and guarantees the unique index that Series.map requires.
# - Rows whose code is absent from df1 receive NaN in both new columns.
# - Assigning columns by name keeps dff's row order and index untouched,
#   appends 'Item Name' then 'Category Name' as the last two columns, and is
#   safe to re-run (the columns are simply overwritten).
item_lookup = (
    df1.drop_duplicates(subset='Item Code', keep='first')
       .set_index('Item Code')
)
dff['Item Name'] = dff['Item Code'].map(item_lookup['Item Name'])
dff['Category Name'] = dff['Item Code'].map(item_lookup['Category Name'])

# check data types, shape and null values
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737337 entries, 0 to 737336
Data columns (total 9 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Date                         737337 non-null  object 
 1   Time                         737337 non-null  object 
 2   Item Code                    737337 non-null  int64  
 3   Quantity Sold (kilo)         737337 non-null  float64
 4   Unit Selling Price (RMB/kg)  737337 non-null  float64
 5   Sale or Return               737337 non-null  object 
 6   Discount (Yes/No)            737337 non-null  object 
 7   Item Name                    737337 non-null  object 
 8   Category Name                737337 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 50.6+ MB


In [17]:
# Number of distinct item names present in the merged sales table.
distinct_item_names = dff['Item Name'].unique()
len(distinct_item_names)

191

In [18]:
# Move 'Item Name' and 'Category Name' to the front of the table.
# The two columns are selected by name rather than by position (last two
# columns): a positional slice would rotate a different pair of columns to the
# front every time this cell is re-run, whereas this version yields the same
# order on the first run and leaves it unchanged on any later run.

last_two_columns = ['Item Name', 'Category Name']
first_columns = [column for column in dff.columns if column not in last_two_columns]

# Rearrange the columns

order = last_two_columns + first_columns
dff = dff[order]

# check all the operations successful
dff.sample(3)

Unnamed: 0,Item Name,Category Name,Date,Time,Item Code,Quantity Sold (kilo),Unit Selling Price (RMB/kg),Sale or Return,Discount (Yes/No)
50887,needle mushroom,mushroom,2020-08-19,18:24:53.858,102900005116547,0.481,10.0,sale,No
508013,green line pepper,capsicum,2022-06-16,09:27:59.175,102900051004294,0.214,8.0,sale,No
594580,the local yellow youcai,flower,2022-10-22,13:58:47.592,102900011022849,0.682,4.0,sale,No


In [19]:
# Earliest and latest sale date. 'Date' holds strings, so min/max compare them
# lexicographically.
sale_dates = dff['Date']
(sale_dates.min(), sale_dates.max())

('2020-07-01', '2023-06-30')

In [20]:
# Restrict the data to the two calendar years 2021-2022, both bounds inclusive.
# 'Date' is compared as a string; this relies on the YYYY-MM-DD layout, for
# which lexicographic order matches chronological order.
filt = dff['Date'].between('2021-01-01', '2022-12-31')
dff = dff[filt]
dff.shape

(475428, 9)

In [21]:
# Persist the merged, date-filtered sales table as CSV. index=False leaves the
# DataFrame index out of the file, so only the data columns are written.
dff.to_csv('../../datasets/built_dataset.csv', index=False)