# Catch Up Section 1 ~ Pandas I & II 

### Author: Sammie Smith, Summer 2025

In [None]:
import pandas as pd
import numpy as np

## Making a DataFrame

In [None]:

# Build a DataFrame from a dict: each key becomes a column name and each list holds that column's values.
mascots = pd.DataFrame({"Course":["CS70", "CS61B", "D100"], "Mascot": ["penguin", "bee", "panda"]})

# display() is provided by IPython/Jupyter and renders the DataFrame as a table in the cell output
display(mascots)



## Changing the Index

In [None]:
# Replace the default integer row labels (0, 1, 2) with custom string labels.
mascots.index = ["zero", "one", "two"]
mascots

## Label-Based Extraction (`loc`) & Position-Based Extraction (`iloc`)

In [None]:
# loc selects by label: grab the row whose index label is "two" (a single row comes back as a Series)
d100 = mascots.loc["two"]
d100

In [None]:
# iloc selects by integer position: grab the row at position 2 (the third row),
# which is the same row that the label "two" refers to
d100 = mascots.iloc[2]
d100

In [None]:
# d100 is a Series holding one row; look up a column name in it to get a single value
cutest_mascot = d100['Mascot']
cutest_mascot

In [None]:
# Same result in one line: take the row at position 2, then look up 'Mascot' in that row
cutest_mascot = mascots.iloc[2]['Mascot']
cutest_mascot

In [None]:
cutest_mascot = mascots.loc["two", 'Mascot'] # loc uses labels for both axes: row label, column name
cutest_mascot

In [None]:
cutest_mascot = mascots.iloc[2, 1] # iloc uses integer positions for both axes: row position, column position
cutest_mascot

In [None]:
# This should raise an error! Ask yourself why... can you fix it?
# Hint: compare it with the two cells above.
mascots.iloc[2, 'Mascot']

In [None]:
# Slicing
# We want the first two values of the Course column
computer_science_classes = mascots.iloc[0:2, 0]  # rows from position 0 (inclusive) to position 2 (exclusive); column at position 0
computer_science_classes

## Extracting & Adding Columns

In [None]:
# Get the Course column (a single column name in brackets returns a Series)
mascots['Course']

In [None]:
# Add a column rating cuteness.
# The Series is given the same index labels as the DataFrame's rows, so each rating lines up with its row.
rating_series = pd.Series([7, 3, 10], index = ["zero", "one", "two"])
mascots['Rating'] = rating_series
mascots

In [None]:
# Let's get rid of the Course column now.
# --> First by selecting the columns we want to keep into a new DataFrame (mascots itself is left unchanged)
without_course = mascots[['Mascot', 'Rating']] # challenge... can you find a way to do this with iloc?
without_course

In [None]:
# Second, by replacing mascots: drop returns a new DataFrame without the column,
# and assigning the result back to the name mascots is what makes the change stick.
mascots = mascots.drop('Course', axis=1) # axis=0 refers to row labels, axis=1 refers to column labels
mascots

In [None]:
# Bring the Course column back by rebuilding the whole DataFrame from scratch.
# NOTE: no index is passed here, so the row labels go back to the default integers 0, 1, 2
# (not "zero", "one", "two").

mascots = pd.DataFrame({"Course":["CS70", "CS61B", "D100"], "Mascot": ["penguin", "bee", "panda"], "Rating":[7,3,10]})
mascots

In [None]:
# Let's grab the computer science course names again, but this time without iloc or loc.
# Two bracket operations are chained here: first ['Course'], then [0].
computer_science_courses = mascots['Course'][0]
computer_science_courses 

In [None]:
# Aww man, that only gave us one course... how can I use slicing here to fix that?
# [0:2] keeps the first two entries (the end of the slice is excluded).
computer_science_courses = mascots['Course'][0:2] 
computer_science_courses # TAADAAAA!!!!

#### Conceptual Challenge: In your own words, describe step by step what the above code does. How is it functionally different from how we got the computer science courses using iloc (`computer_science_classes`)? Hint: what objects are you working with at each step?

## Boolean Filtering

In [None]:
# I want the mascots that are SUPER cute. A mascot is SUPER cute if its Rating is >= 5.
# I need a way of checking whether each row's Rating is >= 5.
# ... First, let's try labeling by hand: one boolean per row, in row order.
is_super_cute = [True, False, True]

# Indexing with a list of booleans keeps only the rows marked True
mascots[is_super_cute]

In [None]:
# What if I don't have the time to label by hand? I can just write a boolean condition!
mascots['Rating'] >= 5 # this gives me a boolean Series: one True/False per row

In [None]:
# Now I can use that boolean Series to keep only the super cute rows
mascots[mascots['Rating'] >= 5]

#### Congrats! You just unlocked an INCREDIBLY useful feature!

In [None]:
# Now let's get courses that have a SUPER cute mascot AND are computer science.
# Each condition is wrapped in parentheses because & binds more tightly than >= and ==.
mascots[(mascots['Rating'] >= 5) & (mascots['Course'].str[0:2] == 'CS')] 

#### Challenge Question: Explain, step by step, how we filtered for computer science courses above.

In [None]:
# In Boolean Filtering, use | for or, ~ for negation, ^ for xor

## Utility Functions

In [None]:
# What's the average cuteness of the mascots? (NumPy functions accept a Series directly)
np.mean(mascots['Rating'])

In [None]:
# What about min and max? (the comma makes this cell evaluate to a (min, max) tuple)
np.min(mascots['Rating']), np.max(mascots['Rating'])

## Math with Columns

In [None]:
# All this confusing pandas syntax is getting me down. The world looks gray and ugly.
# I'm going to decrease all of my cuteness ratings by 2.
# Lucky for me, there is a very fast (computationally inexpensive, for you comp sci nerds!) way to do that:
# subtracting a number from a Series applies the subtraction to every element at once.
mascots['Rating'] = mascots['Rating'] - 2
mascots

In [None]:
# I'm going crazy!! I know I can apply arithmetic operations to a Series... let's try it on the Mascot column.
mascots['Mascot'] ** 2 # squaring the Mascot column

#### Challenge Question: Why did this error? Explain the error message.

## Useful Built-In Pandas Methods

In [None]:
# size - the total number of elements: rows * columns for a DataFrame, or the number of entries for a Series
mascots.size # 3 rows * 3 columns = 9

In [None]:
# shape - gets a tuple of (number of rows, number of columns)
mascots.shape

In [None]:
# describe - returns some basic summary statistics of the data
# (by default only numeric columns are summarized - here, that's Rating)
mascots.describe()

In [None]:
# describe looks different on a Series: the summary comes back as a Series instead of a DataFrame
mascots['Rating'].describe()

In [None]:
# sample - draw random rows from the data and return them as a DataFrame. The default is WITHOUT replacement
mascots.sample(2) # select two random rows without replacement

In [None]:
# Select two random rows with replacement (the same row can be picked more than once)
mascots.sample(2, replace=True)

In [None]:
# value_counts - returns a Series telling you how many times each value appears in a Series
mascots['Course'].value_counts()

In [None]:
# What do you think this will return? Take a guess!
# (value_counts is called on the whole DataFrame this time, not on a single column)
mascots.value_counts()