# DataFrame Processes

**Objective:** Provide sample code for dataframe operations.

**Author:** Phil Busko

In [1]:
import os, sys
import pandas as PD

In [2]:
MODULE_PATH = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(MODULE_PATH, 'data')
LOGIC_PATH = os.path.join(MODULE_PATH, 'logic')
sys.path.append(LOGIC_PATH)
print(MODULE_PATH)

C:\Users\pbusko\Projects\QualityCuration\base_module


In [3]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import IPython.display as DS 
from ipywidgets import IntProgress

In [4]:
def DataframeToDicts(myDf):
    myLs = myDf.to_dict('records')
    for nd, d in enumerate(myLs):
        for k, v in d.items():
            if str(v) in ['nan', 'NaT', '<NA>'] : d[k] = None
    return myLs

### Prepare the Data

In [5]:
# use a sample csv file

brickFile = os.path.join(DATA_PATH, 'brickset_set_filter.csv')
brickDf = PD.read_csv(brickFile)
brickDf.head(3)

Unnamed: 0,set_no,name,price_store,price_new,price_used,rating_value,rating_votes,theme_group,theme,subtheme,main_tag,year,volume,weight,piece_cnt,minifig_cnt
0,858-1,Auto Engines,,,51.0,4.5,2.0,Technical,Technic,,,1980.0,,,242.0,0.0
1,1591-1,Danone Delivery Truck,,129.0,28.0,,,Modern day,Town,Special,,1980.0,,,40.0,0.0
2,1592-1,Town Square - Castle Scene,,1168.0,191.0,4.8,4.0,Modern day,Town,Classic,,1980.0,,,471.0,11.0


In [6]:
brickDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6181 entries, 0 to 6180
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   set_no        6181 non-null   object 
 1   name          6181 non-null   object 
 2   price_store   4767 non-null   float64
 3   price_new     5780 non-null   float64
 4   price_used    5489 non-null   float64
 5   rating_value  5005 non-null   float64
 6   rating_votes  5005 non-null   float64
 7   theme_group   6181 non-null   object 
 8   theme         6181 non-null   object 
 9   subtheme      4494 non-null   object 
 10  main_tag      0 non-null      float64
 11  year          6181 non-null   float64
 12  volume        2418 non-null   float64
 13  weight        2453 non-null   float64
 14  piece_cnt     6137 non-null   float64
 15  minifig_cnt   6181 non-null   float64
dtypes: float64(11), object(5)
memory usage: 772.8+ KB


### Sample Operations

In [7]:
# delete rows without data

fullDf = brickDf[brickDf['price_new'].notna() & brickDf['price_used'].notna()]
fullDf.shape

(5262, 16)

In [8]:
# drop columns

dropCols = ['rating_value', 'rating_votes', 'main_tag', 'subtheme', 'volume', 'weight']
trimDf = brickDf.drop(dropCols, axis=1, errors='ignore')
trimDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6181 entries, 0 to 6180
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   set_no       6181 non-null   object 
 1   name         6181 non-null   object 
 2   price_store  4767 non-null   float64
 3   price_new    5780 non-null   float64
 4   price_used   5489 non-null   float64
 5   theme_group  6181 non-null   object 
 6   theme        6181 non-null   object 
 7   year         6181 non-null   float64
 8   piece_cnt    6137 non-null   float64
 9   minifig_cnt  6181 non-null   float64
dtypes: float64(6), object(4)
memory usage: 483.0+ KB


In [9]:
# rename

renameDf = brickDf.rename({'set_no': 'SetNo'}, axis='columns')
renameDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6181 entries, 0 to 6180
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SetNo         6181 non-null   object 
 1   name          6181 non-null   object 
 2   price_store   4767 non-null   float64
 3   price_new     5780 non-null   float64
 4   price_used    5489 non-null   float64
 5   rating_value  5005 non-null   float64
 6   rating_votes  5005 non-null   float64
 7   theme_group   6181 non-null   object 
 8   theme         6181 non-null   object 
 9   subtheme      4494 non-null   object 
 10  main_tag      0 non-null      float64
 11  year          6181 non-null   float64
 12  volume        2418 non-null   float64
 13  weight        2453 non-null   float64
 14  piece_cnt     6137 non-null   float64
 15  minifig_cnt   6181 non-null   float64
dtypes: float64(11), object(5)
memory usage: 772.8+ KB


In [10]:
# create new composite column

aftermarketDf = fullDf[['set_no', 'name', 'price_new', 'price_used']]
aftermarketDf['aftermarket'] = aftermarketDf.apply(
                                lambda x: x['price_used'] - x['price_new'], axis=1)
aftermarketDf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aftermarketDf['aftermarket'] = aftermarketDf.apply(


Unnamed: 0,set_no,name,price_new,price_used,aftermarket
1,1591-1,Danone Delivery Truck,129.0,28.0,-101.0
2,1592-1,Town Square - Castle Scene,1168.0,191.0,-977.0
5,3604-1,Mark Monkey with his Fruit Stall,18.0,7.0,-11.0
6,3605-1,Ricky Racoon and his Scooter,18.0,4.0,-14.0
7,3634-1,Charlie Crow's Carry-All,90.0,13.0,-77.0


In [11]:
# convert to list of dicts

dataLs = DataframeToDicts(fullDf) 
print(len(dataLs))

5262


In [12]:
# loop through rows

for n, r in aftermarketDf.iterrows():
    if n % 2000 == 0:
        print(r)

set_no              4453-1
name           Goal Keeper
price_new              9.0
price_used             5.0
aftermarket           -4.0
Name: 2000, dtype: object
set_no                 3315-1
name           Olivia's House
price_new               101.0
price_used               42.0
aftermarket             -59.0
Name: 4000, dtype: object


In [13]:
# aggregate operation
# use the set no as the 100% complete column

summaryDf = trimDf.groupby(['theme_group']).agg(['count'])
summaryDf.columns = [' '.join(str(c) for c in col) for col in summaryDf.columns]
summaryDf = summaryDf.reset_index()[['theme_group', 'set_no count']]
#summaryDf.sort_values('set_no count', ascending=False)
summaryDf

Unnamed: 0,theme_group,set_no count
0,Action/Adventure,870
1,Constraction,405
2,Girls,432
3,Historical,432
4,Junior,212
5,Licensed,1313
6,Model making,496
7,Modern day,1351
8,Racing,224
9,Technical,446
