# Intro to Pandas
by Ryan Orsinger

## Introducing DataFrames

### Pandas DataFrames Basics - Part 1
- How to make pandas DataFrames from other Python collections
- Learning about your dataframe's properties and information
- Selecting a single column
- Adding new columns to a DataFrame
- Renaming columns
- Descriptive stats on the columns

In [78]:
import pandas as pd

In [42]:
# Creating a dataframe from a list of dictionaries
basket = [
    {"item": "mango", "quantity": 4, "price": 2.99},
    {"item": "bread", "quantity": 2, "price": 3.25},
    {"item": "juice", "quantity": 1, "price": 5.90},
    {"item": "orange", "quantity": 3, "price": 2.99},
    {"item": "lime", "quantity": 3, "price": 0.3},
]
basket

[{'item': 'mango', 'quantity': 4, 'price': 2.99},
 {'item': 'bread', 'quantity': 2, 'price': 3.25},
 {'item': 'juice', 'quantity': 1, 'price': 5.9},
 {'item': 'orange', 'quantity': 3, 'price': 2.99},
 {'item': 'lime', 'quantity': 3, 'price': 0.3}]

In [79]:
# With dataframes, our columns are our variables or features
# Each row represents a unique observation (usually)
df = pd.DataFrame(basket)
df

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,bread,2,3.25
2,juice,1,5.9
3,orange,3,2.99
4,lime,3,0.3


In [80]:
# Creating a dataframe from a dictionary of lists
basket = {
    "item": ["mango", "bread", "juice", "orange", "lime"],
    "quantity": [4, 2, 1, 3, 3],
    "price": [2.99, 3.25, 5.90, 2.99, 0.30]
}
basket

{'item': ['mango', 'bread', 'juice', 'orange', 'lime'],
 'quantity': [4, 2, 1, 3, 3],
 'price': [2.99, 3.25, 5.9, 2.99, 0.3]}

In [81]:
pd.DataFrame(basket)

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,bread,2,3.25
2,juice,1,5.9
3,orange,3,2.99
4,lime,3,0.3


In [82]:
# Creating a dataframe from a list of lists
example = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]

column_names = ["variable_a", "variable_b", "variable_c"]
row_names = ["observation_1", "observation_2", "observation_3"]

pd.DataFrame(example, columns=column_names, index=row_names)

Unnamed: 0,variable_a,variable_b,variable_c
observation_1,1,2,3
observation_2,4,5,6
observation_3,7,8,9


In [17]:
# Creating an empty dataframe
df = pd.DataFrame()

# Adding columns to a dataframe
# Any list-like data type can become a column
df["item"] = pd.Series(["Mango", "Bread", "Juice", "Orange", "Lime"]) #pandas series
df["quantity"] = [2, 2, 1, 3, 3] # list
df["price"] = (2.99, 3.25, 5.90, 2.99, 0.30) # tuple
df

Unnamed: 0,item,quantity,price
0,Mango,2,2.99
1,Bread,2,3.25
2,Juice,1,5.9
3,Orange,3,2.99
4,Lime,3,0.3


In [18]:
# .shape returns rows, columns
df.shape

(5, 3)

In [22]:
# Returns the rows
df.shape[0]

5

In [26]:
# Returns the number of columns
df.shape[1]

3

In [24]:
# len returns number of rows
len(df)

5

In [27]:
# size returns rows * columns
df.size
# 5 * 3 = 15

15

In [51]:
# Adding new columns to the dataframe
df["subtotal"] = df["quantity"] * df["price"]
df

Unnamed: 0_level_0,quantity,price,tax,subtotal
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mango,4,2.99,0.07,11.96
bread,2,3.25,0.07,6.5
juice,1,5.9,0.07,5.9
orange,3,2.99,0.07,8.97
lime,3,0.3,0.07,0.9


In [52]:
# set_index can overwrite the default index
df.set_index("item", inplace=True)
df

KeyError: "None of ['item'] are in the columns"

In [44]:
# Accessing the index values
df.index

Index(['mango', 'bread', 'juice', 'orange', 'lime'], dtype='object', name='item')

In [45]:
# We can also overwrite the index with a Series of equal length
df.index = df.index.str.lower()
df

Unnamed: 0_level_0,quantity,price
item,Unnamed: 1_level_1,Unnamed: 2_level_1
mango,4,2.99
bread,2,3.25
juice,1,5.9
orange,3,2.99
lime,3,0.3


In [46]:
# Accessing all the columns
df.columns

Index(['quantity', 'price'], dtype='object')

In [47]:
# Another example of creating a new column
df["tax"] = 0.07
df

Unnamed: 0_level_0,quantity,price,tax
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mango,4,2.99,0.07
bread,2,3.25,0.07
juice,1,5.9,0.07
orange,3,2.99,0.07
lime,3,0.3,0.07


In [53]:
# The "total cost" column does not exist, but this bracket syntax creates it.
df["total cost"] = df["subtotal"] + (df["subtotal"] * df["tax"])
df

Unnamed: 0_level_0,quantity,price,tax,subtotal,total cost
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mango,4,2.99,0.07,11.96,12.7972
bread,2,3.25,0.07,6.5,6.955
juice,1,5.9,0.07,5.9,6.313
orange,3,2.99,0.07,8.97,9.5979
lime,3,0.3,0.07,0.9,0.963


In [54]:
# Dot syntax also allows for calling an existing column
# Using .column syntax only works if the column exists already, has no spaces, and is not a dataframe method already
df.price

Unnamed: 0_level_0,price
item,Unnamed: 1_level_1
mango,2.99
bread,3.25
juice,5.9
orange,2.99
lime,0.3


In [55]:
df.subtotal

Unnamed: 0_level_0,subtotal
item,Unnamed: 1_level_1
mango,11.96
bread,6.5
juice,5.9
orange,8.97
lime,0.9


In [56]:
# .dtypes outputs the datatypes of all columns in the dataframe
df.dtypes

Unnamed: 0,0
quantity,int64
price,float64
tax,float64
subtotal,float64
total cost,float64


In [57]:
# .info returns datatype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, mango to lime
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   quantity    5 non-null      int64  
 1   price       5 non-null      float64
 2   tax         5 non-null      float64
 3   subtotal    5 non-null      float64
 4   total cost  5 non-null      float64
dtypes: float64(4), int64(1)
memory usage: 412.0+ bytes


In [58]:
df.price.describe()
# 25% 1st quarter
# 75% 3rd querter
# 50% median
# std = standart deviation

Unnamed: 0,price
count,5.0
mean,3.086
std,1.982783
min,0.3
25%,2.99
50%,2.99
75%,3.25
max,5.9


In [59]:
df.price

Unnamed: 0_level_0,price
item,Unnamed: 1_level_1
mango,2.99
bread,3.25
juice,5.9
orange,2.99
lime,0.3


In [60]:
# Show descriptive stats for numeric columns
df.describe()

Unnamed: 0,quantity,price,tax,subtotal,total cost
count,5.0,5.0,5.0,5.0,5.0
mean,2.6,3.086,0.07,6.846,7.32522
std,1.140175,1.982783,0.0,4.093712,4.380272
min,1.0,0.3,0.07,0.9,0.963
25%,2.0,2.99,0.07,5.9,6.313
50%,3.0,2.99,0.07,6.5,6.955
75%,3.0,3.25,0.07,8.97,9.5979
max,4.0,5.9,0.07,11.96,12.7972


In [61]:
# A column in a dataframe is a series
type(df.quantity)

In [62]:
# .value_counts returns a series
df.quantity.value_counts()

Unnamed: 0_level_0,count
quantity,Unnamed: 1_level_1
3,2
4,1
2,1
1,1


In [63]:
# Aggregate functions can run on all the numeric values in the dataframe
df.mean()

Unnamed: 0,0
quantity,2.6
price,3.086
tax,0.07
subtotal,6.846
total cost,7.32522


In [64]:
# Obtain the median of all numeric columns
df.median()

Unnamed: 0,0
quantity,3.0
price,2.99
tax,0.07
subtotal,6.5
total cost,6.955


In [65]:
# Standard deviation of all the numeric columns
df.std()

Unnamed: 0,0
quantity,1.140175
price,1.982783
tax,0.0
subtotal,4.093712
total cost,4.380272


In [66]:
# The square bracket syntax can be used to access column names with spaces
# Same with column names that are methods on the dataframe object
# Recommend avoiding column names with spaces, removing spaces when you encounter them
# Recommend avoiding naming columns after dataframe method
df["shape"] = ["round", "loaf", "jug", "round", "round"]
df

Unnamed: 0_level_0,quantity,price,tax,subtotal,total cost,shape
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mango,4,2.99,0.07,11.96,12.7972,round
bread,2,3.25,0.07,6.5,6.955,loaf
juice,1,5.9,0.07,5.9,6.313,jug
orange,3,2.99,0.07,8.97,9.5979,round
lime,3,0.3,0.07,0.9,0.963,round


In [67]:
# Remember that the .shape attribute shows # rows, #columns
df.shape

(5, 6)

In [68]:
# The bracket quote syntax returns the column
df["shape"]

Unnamed: 0_level_0,shape
item,Unnamed: 1_level_1
mango,round
bread,loaf
juice,jug
orange,round
lime,round


In [69]:
# Naming columns the same as built-in dataframe methods and spaces in column names are not helpful
# .rename allows for renaming columns in a dataframe using a dictionary
df.rename(columns={"shape": "item_shape", "total cost": "total"}, inplace=True)
df

Unnamed: 0_level_0,quantity,price,tax,subtotal,total,item_shape
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mango,4,2.99,0.07,11.96,12.7972,round
bread,2,3.25,0.07,6.5,6.955,loaf
juice,1,5.9,0.07,5.9,6.313,jug
orange,3,2.99,0.07,8.97,9.5979,round
lime,3,0.3,0.07,0.9,0.963,round


## Exercises
- Assign dataframe named `df` out of the following list of dictionaries.
```
items = [
    {"item name": "USB cable", "price": "$10.99", "type": "USB C to USB C"},
    {"item name": "USB cable", "price": "$10.99", "type": "USB A to USB C"},
    {"item name": "Batteries", "price": "$9.99", "type": "AA"},
    {"item name": "Batteries", "price": "$8.99", "type": "AAA"},
    {"item name": "Mouse", "price": "$12.99", "type": "Wireless USB"},
]
```
- Rename the `item name` column to `item_name`
- Add a column named `units_sold` to this dataframe and assign the values `[41, 113, 54, 35, 22]`
- Convert the price column into a float (hint: remove non-numeric characters before attempting to convert the data type to a float)
- Create a new column named `total_revenue` that holds the `price` column times the `units_sold` column
- Round the `total_revenue` column values to the nearest whole number. (hint: search or consult the pandas documentation

In [70]:
import pandas as pd

In [71]:
# Assign dataframe named `df` out of the following list of dictionaries
items = [
  {"item name": "USB cable", "price": "$10.99", "type": "USB C to USB C"},
  {"item name": "USB cable", "price": "$10.99", "type": "USB A to USB C"},
  {"item name": "Batteries", "price": "$9.99", "type": "AA"},
  {"item name": "Batteries", "price": "$8.99", "type": "AAA"},
  {"item name": "Mouse", "price": "$12.99", "type": "Wireless USB"},
]
df = pd.DataFrame(items)
df

Unnamed: 0,item name,price,type
0,USB cable,$10.99,USB C to USB C
1,USB cable,$10.99,USB A to USB C
2,Batteries,$9.99,AA
3,Batteries,$8.99,AAA
4,Mouse,$12.99,Wireless USB


In [72]:
# Rename the item name column to item_name
df.rename(columns={"item name": "item_name"}, inplace=True)
df

Unnamed: 0,item_name,price,type
0,USB cable,$10.99,USB C to USB C
1,USB cable,$10.99,USB A to USB C
2,Batteries,$9.99,AA
3,Batteries,$8.99,AAA
4,Mouse,$12.99,Wireless USB


In [73]:
# Add a column named units_sold to this dataframe and assign the values [41, 113, 54, 35, 22]
df["units_sold"] = [41, 113, 54, 35, 22]
df

Unnamed: 0,item_name,price,type,units_sold
0,USB cable,$10.99,USB C to USB C,41
1,USB cable,$10.99,USB A to USB C,113
2,Batteries,$9.99,AA,54
3,Batteries,$8.99,AAA,35
4,Mouse,$12.99,Wireless USB,22


In [74]:
# Convert the price column into a float data type. First replace $ and then convert to folat.
df["price"] = df["price"].str.replace("$", "")
df["price"] = df["price"].astype(float)
df

Unnamed: 0,item_name,price,type,units_sold
0,USB cable,10.99,USB C to USB C,41
1,USB cable,10.99,USB A to USB C,113
2,Batteries,9.99,AA,54
3,Batteries,8.99,AAA,35
4,Mouse,12.99,Wireless USB,22


In [75]:
# Create a new column named "total_revenue" that holds the `price` column times the `units_sold` column
df["total_revenue"] = df["price"] * df["units_sold"]
df

Unnamed: 0,item_name,price,type,units_sold,total_revenue
0,USB cable,10.99,USB C to USB C,41,450.59
1,USB cable,10.99,USB A to USB C,113,1241.87
2,Batteries,9.99,AA,54,539.46
3,Batteries,8.99,AAA,35,314.65
4,Mouse,12.99,Wireless USB,22,285.78


In [76]:
# Round the total_revenue column values to the nearest whole number.
df["total_revenue"] = df["total_revenue"].round()
df

Unnamed: 0,item_name,price,type,units_sold,total_revenue
0,USB cable,10.99,USB C to USB C,41,451.0
1,USB cable,10.99,USB A to USB C,113,1242.0
2,Batteries,9.99,AA,54,539.0
3,Batteries,8.99,AAA,35,315.0
4,Mouse,12.99,Wireless USB,22,286.0


In [93]:
df.to_csv("output.csv", index=False)
df

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,bread,2,3.25
2,juice,1,5.9
3,orange,3,2.99
4,lime,3,0.3


In [92]:
# export file in excel
df.to_excel("output.xlsx", index=False)
df

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,bread,2,3.25
2,juice,1,5.9
3,orange,3,2.99
4,lime,3,0.3
