In [1]:
# Introduction to DataFrames in Pandas
import pandas as pd

In [7]:
# creating a dataframe from a list of dictionaries
basket = [
        {"item": "mango", "quantity":4, "price": 2.99},
        {"item": "bread", "quantity":2, "price": 3.25},
        {"item": "juice", "quantity":1, "price": 5.90},
        {"item": "orange", "quantity":3, "price": 2.99},
        {"item": "lime", "quantity":3, "price": 0.3}
]
basket

[{'item': 'mango', 'quantity': 4, 'price': 2.99},
 {'item': 'bread', 'quantity': 2, 'price': 3.25},
 {'item': 'juice', 'quantity': 1, 'price': 5.9},
 {'item': 'orange', 'quantity': 3, 'price': 2.99},
 {'item': 'lime', 'quantity': 3, 'price': 0.3}]

In [8]:
# with dataframes our columns are our variables or features
# each row represents a unique observation
# they call rows 'observations'
df =  pd.DataFrame(basket)
df

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,bread,2,3.25
2,juice,1,5.9
3,orange,3,2.99
4,lime,3,0.3


In [9]:
# creating a DataFrame from a dictionary of lists
basket = {
    "item": ["mango", "pear", "bread", "juice", "orange", "lime"],
    "quantity": [4, 5, 2, 1, 3, 3],
    "price": [2.99, 5.67, 3.25, 5.90, 2.99, 0.30]
}
basket

{'item': ['mango', 'pear', 'bread', 'juice', 'orange', 'lime'],
 'quantity': [4, 5, 2, 1, 3, 3],
 'price': [2.99, 5.67, 3.25, 5.9, 2.99, 0.3]}

In [10]:
pd.DataFrame(basket)

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,pear,5,5.67
2,bread,2,3.25
3,juice,1,5.9
4,orange,3,2.99
5,lime,3,0.3


In [11]:
# these first two examples are showing the two different ways that data could be stored 
# and how they both can come to a similar conclusion using the DataFrame method in Pandas


In [15]:
# creating a DataFrame from a list of lists
# this demonstraties how we can customize/label our DataFrame so that 
# it is easier to read and is more palatable (they are optional arguments)
example = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]

column_names = ["variable_a", "variable_b", "variable_c"]
row_names = ["observation_1", "observation_2", "observation_3"]

pd.DataFrame(example, columns=column_names, index=row_names)

Unnamed: 0,variable_a,variable_b,variable_c
observation_1,1,2,3
observation_2,4,5,6
observation_3,7,8,9


In [6]:
# creating an empty DataFrame
# and then add onto it - interesting that it uses series to fill the columns of the DataFrame
# this is important to remember that the columns correspond to a Series Pandas 'Object'
df = pd.DataFrame()

# we are going to add columns to the DataFrame
# any list-like data-type can become a column
df["item"] = pd.Series(["mango", "pear", "bread", "juice", "orange", "lime"])
df["quantity"] = pd.Series([4, 5, 2, 1, 3, 3])
df["price"] = pd.Series([2.99, 5.67, 3.25, 5.90, 2.99, 0.30])
df

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,pear,5,5.67
2,bread,2,3.25
3,juice,1,5.9
4,orange,3,2.99
5,lime,3,0.3


In [26]:
# this is the count for the rows and columns
# the item that is returned is called a tuple
df.shape

(6, 4)

In [21]:
# this returns number of rows
df.shape[0]

6

In [22]:
# this returns the number of columns
df.shape[1]

3

In [23]:
# returns the number of rows
len(df)

6

In [24]:
# this returns the rows * columns
df.size

18

In [9]:
# adding new columns to the DataFrame
df["subtotal"] = df["quantity"] * df["price"]
df

Unnamed: 0,item,quantity,price,tax,subtotal
0,mango,4,2.99,0.05,11.96
1,pear,5,5.67,0.05,28.35
2,bread,2,3.25,0.05,6.5
3,juice,1,5.9,0.05,5.9
4,orange,3,2.99,0.05,8.97
5,lime,3,0.3,0.05,0.9


In [10]:
# set_index can overwrite the default index
df.set_index("item", inplace=True)
df

Unnamed: 0_level_0,quantity,price,tax,subtotal
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mango,4,2.99,0.05,11.96
pear,5,5.67,0.05,28.35
bread,2,3.25,0.05,6.5
juice,1,5.9,0.05,5.9
orange,3,2.99,0.05,8.97
lime,3,0.3,0.05,0.9


In [29]:
# accessing the index values 
df.index

Index(['mango', 'pear', 'bread', 'juice', 'orange', 'lime'], dtype='object', name='item')

In [11]:
# we can also overwrite the index with a series of equal length
df.index = df.index.str.lower()
df

Unnamed: 0_level_0,quantity,price,tax,subtotal
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mango,4,2.99,0.05,11.96
pear,5,5.67,0.05,28.35
bread,2,3.25,0.05,6.5
juice,1,5.9,0.05,5.9
orange,3,2.99,0.05,8.97
lime,3,0.3,0.05,0.9


In [31]:
# getting all of the columns of the DataFrame
df.columns

Index(['quantity', 'price', 'subtotal'], dtype='object')

In [12]:
df["tax"] = 0.05
df

Unnamed: 0_level_0,quantity,price,tax,subtotal
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mango,4,2.99,0.05,11.96
pear,5,5.67,0.05,28.35
bread,2,3.25,0.05,6.5
juice,1,5.9,0.05,5.9
orange,3,2.99,0.05,8.97
lime,3,0.3,0.05,0.9


In [13]:
# creating the 'total' column
df["total"] = df["subtotal"] + df["subtotal"] * df["tax"]
df

Unnamed: 0_level_0,quantity,price,tax,subtotal,total
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mango,4,2.99,0.05,11.96,12.558
pear,5,5.67,0.05,28.35,29.7675
bread,2,3.25,0.05,6.5,6.825
juice,1,5.9,0.05,5.9,6.195
orange,3,2.99,0.05,8.97,9.4185
lime,3,0.3,0.05,0.9,0.945


In [35]:
# dot syntax also allows for the calling of an existing column
df.price

item
mango     2.99
pear      5.67
bread     3.25
juice     5.90
orange    2.99
lime      0.30
Name: price, dtype: float64

In [36]:
# .dtypes outputs the datatypes fopr all the columns in the DataFrame
df.dtypes

quantity      int64
price       float64
subtotal    float64
tax         float64
total       float64
dtype: object

In [38]:
# .info returns the outputs the datatypes of all the columns in the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, mango to lime
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   quantity  6 non-null      int64  
 1   price     6 non-null      float64
 2   subtotal  6 non-null      float64
 3   tax       6 non-null      float64
 4   total     6 non-null      float64
dtypes: float64(4), int64(1)
memory usage: 460.0+ bytes


In [39]:
df.price.describe()

count    6.000000
mean     3.516667
std      2.063489
min      0.300000
25%      2.990000
50%      3.120000
75%      5.065000
max      5.900000
Name: price, dtype: float64

In [40]:
# show dsecriptive stats for numeric columns
df.describe()

Unnamed: 0,quantity,price,subtotal,tax,total
count,6.0,6.0,6.0,6.0,6.0
mean,3.0,3.516667,10.43,0.05,10.9515
std,1.414214,2.063489,9.511946,7.601177e-18,9.987543
min,1.0,0.3,0.9,0.05,0.945
25%,2.25,2.99,6.05,0.05,6.3525
50%,3.0,3.12,7.735,0.05,8.12175
75%,3.75,5.065,11.2125,0.05,11.773125
max,5.0,5.9,28.35,0.05,29.7675


In [3]:
# renaming columns - part 2

In [16]:
# avoid using column names with spaces and remove spaces when encountering them
# avoid naming columns after a DataFrame method
df["shape"] = ["round", "pear", "loaf", "jug", "round", "round"]
df

Unnamed: 0_level_0,quantity,price,tax,subtotal,total,shape
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mango,4,2.99,0.05,11.96,12.558,round
pear,5,5.67,0.05,28.35,29.7675,pear
bread,2,3.25,0.05,6.5,6.825,loaf
juice,1,5.9,0.05,5.9,6.195,jug
orange,3,2.99,0.05,8.97,9.4185,round
lime,3,0.3,0.05,0.9,0.945,round


In [17]:
df.shape

(6, 6)

In [18]:
df["shape"]

item
mango     round
pear       pear
bread      loaf
juice       jug
orange    round
lime      round
Name: shape, dtype: object

In [19]:
# naming columns the same as built-in DataFrame methods and spces in column names are not helpful
# .rename allows for renaming columns in a DataFrrame using a dictionary
df.rename(columns={"shape": "item_shape"}, inplace=True)
df

Unnamed: 0_level_0,quantity,price,tax,subtotal,total,item_shape
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mango,4,2.99,0.05,11.96,12.558,round
pear,5,5.67,0.05,28.35,29.7675,pear
bread,2,3.25,0.05,6.5,6.825,loaf
juice,1,5.9,0.05,5.9,6.195,jug
orange,3,2.99,0.05,8.97,9.4185,round
lime,3,0.3,0.05,0.9,0.945,round


In [21]:
# now we don't just need to use bracket notation to get access to the items in this column
df.item_shape

item
mango     round
pear       pear
bread      loaf
juice       jug
orange    round
lime      round
Name: item_shape, dtype: object

In [28]:
# Challenge
items = [
    {"item name": "USB cable", "price": "$10.99", "type": "USB C to USB C"},
    {"item name": "USB cable", "price": "$10.99", "type": "USB A to USB C"},
    {"item name": "Batteries", "price": "$9.99", "type": "AA"},
    {"item name": "Batteries", "price": "$8.99", "type": "AAA"},
    {"item name": "Mouse", "price": "$12.99", "type": "Wireless USB"}
]

# turing it into a DataFrame
items = pd.DataFrame(items)
items

Unnamed: 0,item name,price,type
0,USB cable,$10.99,USB C to USB C
1,USB cable,$10.99,USB A to USB C
2,Batteries,$9.99,AA
3,Batteries,$8.99,AAA
4,Mouse,$12.99,Wireless USB


In [29]:
# renaming the comlumn name to remove the spaces
items.rename(columns={"item name": "item_name"}, inplace=True)
items

Unnamed: 0,item_name,price,type
0,USB cable,$10.99,USB C to USB C
1,USB cable,$10.99,USB A to USB C
2,Batteries,$9.99,AA
3,Batteries,$8.99,AAA
4,Mouse,$12.99,Wireless USB


In [30]:
# adding column name 'units_sold'
items["units_sold"] = ["41", "113", "54", "35", "22"]
items

Unnamed: 0,item_name,price,type,units_sold
0,USB cable,$10.99,USB C to USB C,41
1,USB cable,$10.99,USB A to USB C,113
2,Batteries,$9.99,AA,54
3,Batteries,$8.99,AAA,35
4,Mouse,$12.99,Wireless USB,22


In [55]:
# turning the price column into type 'float'
# getting rid of unwated characters that might prevent the conversion

# this is here just to handle the error that it throws when this code-block is ran
try:
    items["price"] = items["price"].str.replace("$", "")
except AttributeError:
    return None
return items["price"]

SyntaxError: 'return' outside function (3641789898.py, line 8)

In [41]:
# here we can confirm that the transition was a success 
# because we are seeing that error above and because we are getting 
# dtype: float64
items["price"] = items["price"].astype(float)
items["price"].describe()

count     5.00000
mean     10.79000
std       1.48324
min       8.99000
25%       9.99000
50%      10.99000
75%      10.99000
max      12.99000
Name: price, dtype: float64

In [64]:
# can't multiply the float by an int for some reason
# so we are going to change one of the datatypes
temp_data = items["units_sold"].astype(float)
temp =  (items["price"]) * (temp_data)
round(temp, 1)

0     450.6
1    1241.9
2     539.5
3     314.7
4     285.8
dtype: float64

In [65]:
items["total_revenue"] = round(temp, 1)
items

Unnamed: 0,item_name,price,type,units_sold,total_revenue
0,USB cable,10.99,USB C to USB C,41,450.6
1,USB cable,10.99,USB A to USB C,113,1241.9
2,Batteries,9.99,AA,54,539.5
3,Batteries,8.99,AAA,35,314.7
4,Mouse,12.99,Wireless USB,22,285.8
