# Catch Up Section 1 ~ Pandas I & II 

### Author: Sammie Smith, Summer 2025

In [68]:
import pandas as pd
import numpy as np

## Making a DataFrame

In [69]:

# Build the DataFrame from a dict: each key is a column name,
# each value is the list of entries for that column (one entry per row).
mascots = pd.DataFrame(
    {
        "Course": ["CS70", "CS61B", "D100"],
        "Mascot": ["penguin", "bee", "panda"],
    }
)

display(mascots)



Unnamed: 0,Course,Mascot
0,CS70,penguin
1,CS61B,bee
2,D100,panda


## Changing the Index

In [70]:
# Overwrite the default 0, 1, 2 row labels with custom string labels (one label per row).
mascots.index = ["zero", "one", "two"]
mascots

Unnamed: 0,Course,Mascot
zero,CS70,penguin
one,CS61B,bee
two,D100,panda


## Label-Based Extraction (`.loc`) & Position-Based Extraction (`.iloc`)

In [71]:
# .loc extracts by label: grab the row whose index label is "two".
d100 = mascots.loc["two"]
d100

Course     D100
Mascot    panda
Name: two, dtype: object

In [72]:
# .iloc extracts by integer position: grab the row at position 2 (counting from 0).
d100 = mascots.iloc[2]
d100

Course     D100
Mascot    panda
Name: two, dtype: object

In [73]:
# d100 holds a single row; look up its 'Mascot' entry by name.
cutest_mascot = d100['Mascot']
cutest_mascot

'panda'

In [74]:
# Same lookup in one line: pick the row by position, then index that row by 'Mascot'.
cutest_mascot = mascots.iloc[2]['Mascot']
cutest_mascot

'panda'

In [75]:
cutest_mascot = mascots.loc["two", 'Mascot'] # using labels: row index label, column name
cutest_mascot

'panda'

In [76]:
cutest_mascot = mascots.iloc[2, 1] # using integer positions: row position, column position
cutest_mascot

'panda'

In [77]:
# This lookup raises a ValueError!! Ask yourself why.... can you fix it?
# The error is caught and printed (instead of being left uncaught) so that
# "Restart Kernel -> Run All" can keep going past this cell.
try:
    mascots.iloc[2, 'Mascot']
except ValueError as err:
    # Show the exception type and message, just like the last line of a traceback.
    print(f"{type(err).__name__}: {err}")

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [78]:
# Slicing
# We want the first two values of the Course column
computer_science_classes = mascots.iloc[0:2, 0]  # rows at positions 0 (inclusive) up to 2 (exclusive), column at position 0
computer_science_classes

zero     CS70
one     CS61B
Name: Course, dtype: object

## Extracting & Adding Columns

In [79]:
# get the Course column: a single column name in square brackets gives back just that column
mascots['Course']

zero     CS70
one     CS61B
two      D100
Name: Course, dtype: object

In [80]:
# Add a column rating cuteness.
# The Series is keyed by the same row labels that mascots uses ("zero", "one", "two"),
# so each rating lines up with the row carrying the matching label.
rating_series = pd.Series({"zero": 7, "one": 3, "two": 10})
mascots['Rating'] = rating_series
mascots

Unnamed: 0,Course,Mascot,Rating
zero,CS70,penguin,7
one,CS61B,bee,3
two,D100,panda,10


In [81]:
# let's get rid of the Course column now.
# --> First by selecting only the columns we want to keep into a new variable (mascots itself keeps all of its columns)
without_course = mascots[['Mascot', 'Rating']] # challenge... can you find a way to do this with iloc?
without_course

Unnamed: 0,Mascot,Rating
zero,penguin,7
one,bee,3
two,panda,10


In [82]:
# Second by replacing mascots: drop gives back a DataFrame without the column, and we assign that result to the name mascots
mascots = mascots.drop('Course', axis=1) # axis = 0 drops by row label, axis = 1 drops by column name
mascots

Unnamed: 0,Mascot,Rating
zero,penguin,7
one,bee,3
two,panda,10


In [83]:
# Restore the Course column by rebuilding the DataFrame from scratch.
# No index is supplied, so the rows get the default 0, 1, 2 labels again.
mascots = pd.DataFrame(
    {
        "Course": ["CS70", "CS61B", "D100"],
        "Mascot": ["penguin", "bee", "panda"],
        "Rating": [7, 3, 10],
    }
)
mascots

Unnamed: 0,Course,Mascot,Rating
0,CS70,penguin,7
1,CS61B,bee,3
2,D100,panda,10


In [84]:
# Let's grab the computer science course names again, but this time without iloc or loc
# (first pull out the 'Course' column, then index into that column)
computer_science_courses = mascots['Course'][0]
computer_science_courses 

'CS70'

In [85]:
# aww man, that only gave us one course... how can I use slicing here to fix that?
computer_science_courses = mascots['Course'][0:2] # slice of the 'Course' column: start 0, stop 2
computer_science_courses # TAADAAAA!!!!

0     CS70
1    CS61B
Name: Course, dtype: object

#### Conceptual Challenge: In your own words, describe step by step what the above code does. How is it functionally different from how we got `computer_science_classes` using `iloc`? Hint: what objects are you working with at each step?

## Boolean Filtering

In [86]:
# I want the mascots that are SUPER cute. A mascot is SUPER cute if its Rating is >= 5.
# I need a way of checking if each row's Rating >= 5
#... First let's try manually labeling: one True/False per row, in row order
is_super_cute = [True, False, True]

# rows marked True are kept, rows marked False are left out
mascots[is_super_cute]

Unnamed: 0,Course,Mascot,Rating
0,CS70,penguin,7
2,D100,panda,10


In [87]:
# What if I don't have the time to manually label? I can just write a boolean condition!
mascots['Rating'] >= 5 # the comparison is checked for every row, which gives me a boolean Series...

0     True
1    False
2     True
Name: Rating, dtype: bool

In [88]:
# now I can put that boolean Series inside the square brackets to keep only the super cute rows
mascots[mascots['Rating'] >= 5]

Unnamed: 0,Course,Mascot,Rating
0,CS70,penguin,7
2,D100,panda,10


#### Congrats! You just unlocked an INCREDIBLY useful feature!

In [89]:
# Now let's get courses that have a SUPER cute mascot AND are computer science.
# Build each condition as its own boolean Series, then combine the two with &.
has_super_cute_mascot = mascots['Rating'] >= 5
is_computer_science = mascots['Course'].str[0:2] == 'CS'
mascots[has_super_cute_mascot & is_computer_science]

Unnamed: 0,Course,Mascot,Rating
0,CS70,penguin,7


#### Challenge Question: Explain, step by step, how we filtered for computer science courses above.

In [90]:
# In Boolean Filtering, use & for and, | for or, ~ for negation, ^ for xor
# Wrap each comparison in its own parentheses when combining, e.g. (a >= 5) | (b == 'CS'),
# because these operators bind more tightly than comparisons like >= and ==.

## Utility Functions

In [91]:
# What's the average cuteness of the mascots? (the Rating column is handed straight to NumPy)
np.mean(mascots['Rating'])

6.666666666666667

In [92]:
# What about min, max?
# Pull the column out once, then show (smallest, largest) together as a tuple.
ratings = mascots['Rating']
np.min(ratings), np.max(ratings)

(3, 10)

## Math with Columns

In [94]:
# All this confusing pandas syntax is getting me down. The world looks gray and ugly. I'm going to decrease all of my cuteness ratings by 2. 
# Lucky for me, there is a very fast (computationally inexpensive for you comp sci nerds!) way to do that:
# subtracting a number from a column subtracts it from every entry at once.
# NOTE: this cell reads and overwrites the same column, so every re-run subtracts another 2.
mascots['Rating'] = mascots['Rating'] - 2
mascots

Unnamed: 0,Course,Mascot,Rating
0,CS70,penguin,5
1,CS61B,bee,1
2,D100,panda,8


In [95]:
# I'm going crazy!! I know I can apply arithmetic operations to series.. let's try it on the Mascot column.
# This raises a TypeError. It is caught and printed (instead of being left uncaught)
# so that "Restart Kernel -> Run All" can keep going past this cell.
try:
    mascots['Mascot'] ** 2 # squaring the Mascot column
except TypeError as err:
    # Show the exception type and message, just like the last line of a traceback.
    print(f"{type(err).__name__}: {err}")

TypeError: unsupported operand type(s) for ** or pow(): 'str' and 'int'

#### Challenge Question: Why did this error? Explain the error message.

## Useful Built-In Pandas Methods

In [102]:
# size - gets the total number of entries: rows * columns for a DataFrame, number of values for a Series
mascots.size # 3 rows * 3 columns = 9

9

In [103]:
# shape - gets the tuple (number of rows, number of columns)
mascots.shape

(3, 3)

In [104]:
# describe - returns some basic statistics of the data
# (here only Rating is summarized; the text columns Course and Mascot are left out)
mascots.describe()

Unnamed: 0,Rating
count,3.0
mean,4.666667
std,3.511885
min,1.0
25%,3.0
50%,5.0
75%,6.5
max,8.0


In [105]:
# describe looks different on a Series: the same statistics come back as a Series instead of a table
mascots['Rating'].describe()

count    3.000000
mean     4.666667
std      3.511885
min      1.000000
25%      3.000000
50%      5.000000
75%      6.500000
max      8.000000
Name: Rating, dtype: float64

In [106]:
# sample - collect a random sample of rows and return it as a DataFrame. Default is WITHOUT replacement
# (no random_state is passed, so the rows you get can differ every time this cell runs)
mascots.sample(2) # select two random rows without replacement

Unnamed: 0,Course,Mascot,Rating
2,D100,panda,8
0,CS70,penguin,5


In [111]:
# select two random rows with replacement (the same row can be picked more than once)
mascots.sample(2, replace=True)

Unnamed: 0,Course,Mascot,Rating
2,D100,panda,8
2,D100,panda,8


In [112]:
# value_counts - returns a Series telling you how many times each value appears in a Series
# (every course appears exactly once here, so every count is 1)
mascots['Course'].value_counts()

Course
CS70     1
CS61B    1
D100     1
Name: count, dtype: int64

In [113]:
# What do you think this will return when it is called on the whole DataFrame instead of a single column? Take a guess!
mascots.value_counts()

Course  Mascot   Rating
CS61B   bee      1         1
CS70    penguin  5         1
D100    panda    8         1
Name: count, dtype: int64