# Lecture pandas basics

- Two big data structures
- pandas.Series
- pandas.DataFrame
- read_csv
- indexing
- plotting

## pandas series

- can create from dictionary 
- can create from list 
- can create from np.array

In [1]:
import pandas as pd

# Number of students per program, keyed by program name.
programs_dict = {"AI": 25, "NET": 38, "Java": 30, "UX": 28}

# A dict becomes a Series whose index labels are the dict keys,
# kept in insertion order.
programs_series = pd.Series(data=programs_dict)

# A bare last expression is rendered as the cell's output.
programs_series

AI      25
NET     38
Java    30
UX      28
dtype: int64

In [2]:
# Extract values by position. `.iloc` is explicit about positional access;
# plain `series[0]` on a string-labelled Series relies on a deprecated
# fallback that treats integer keys as positions (FutureWarning in pandas 2.1+).
print(f"{programs_series.iloc[0] = }")
print(f"{programs_series.iloc[-1] = }")

# Get the index labels; for a Series, `.keys()` is an alias for `.index`.
print(f"{programs_series.keys() = }")
print(f"{programs_series.keys()[0] = }")

programs_series[0] = 25
programs_series[-1] = 28
programs_series.keys() = Index(['AI', 'NET', 'Java', 'UX'], dtype='object')
programs_series.keys()[0] = 'AI'


In [3]:
import random as rnd

# Seed the generator so the same rolls come out on every run.
rnd.seed(1337)

# Simulate ten throws of a six-sided die; randint includes both endpoints.
rolls = [rnd.randint(1, 6) for _ in range(10)]
dice_series = pd.Series(rolls)

# head() displays the first rows of the series (five by default).
dice_series.head()


0    5
1    5
2    6
3    3
4    5
dtype: int64

In [4]:
# Summary statistics for the dice rolls, printed one per line.
print(dice_series.min())     # smallest value in the series
print(dice_series.argmin())  # integer position of the first occurrence of the smallest value
print(dice_series.max())     # largest value in the series
print(dice_series.mean())    # arithmetic mean of the series
print(dice_series.median())  # middle of the sorted values (mean of the two middle ones for an even count)

2
7
6
4.4
5.0


## Dataframe

- tabular data with rows and columns
- analogous to 2D numpy arrays, but with flexible row indices and column names
- can be seen as a specialized dictionary in which each column name maps to a Series object

In [5]:
# Create a one-column DataFrame from a Series object.
# `to_frame(name=...)` names the column directly. Passing `columns=` to the
# DataFrame constructor only gives this result while the Series is unnamed.
df_programs = programs_series.to_frame(name="Number of students")
df_programs

Unnamed: 0,Number of students
AI,25
NET,38
Java,30
UX,28


In [6]:
# Two Series describing the same programs; note that their labels are
# listed in different orders.
students = pd.Series([25, 38, 30, 28], index=["AI", "NET", "Java", "UX"])
skills = pd.Series(["Python", "C#", "Figma", "Java"], index=["AI", "NET", "UX", "Java"])

# Each dict key becomes a column name; rows are matched on the index labels,
# not on position.
program_columns = {"Students": students, "Skills": skills}
df_program = pd.DataFrame(program_columns)
df_program

Unnamed: 0,Students,Skills
AI,25,Python
Java,30,Java
NET,38,C#
UX,28,Figma


In [11]:
# Selecting a single column by its name returns that column as a Series.
df_program["Skills"]


AI      Python
Java      Java
NET         C#
UX       Figma
Name: Skills, dtype: object

In [13]:
# Read the same cell two ways: by position within the column, and by row label.
# `.iloc` / `.loc` make the intent explicit; `series[0]` on a string-labelled
# Series relies on a deprecated positional fallback for integer keys.
df_program["Skills"].iloc[0], df_program.loc["AI", "Skills"]

('Python', 'Python')

## Indexers

- loc - slicing and indexing using the explicit index (the labels); slices include the end label
- iloc - slicing and indexing using implicit integer positions (Python-style, end-exclusive)

In [15]:
# .loc looks rows up by index label.
df_program.loc["AI"] # access the entire "AI" row; a single row comes back as a Series

Students        25
Skills      Python
Name: AI, dtype: object

In [17]:
# Same label-based lookup for another row.
df_program.loc["Java"] # access the entire "Java" row - returned as a Series object

Students      30
Skills      Java
Name: Java, dtype: object

In [18]:
# .iloc selects rows by integer position; the stop position is exclusive.
df_program.iloc[1:4] # rows at positions 1, 2 and 3 (4 is excluded) - returned as a DataFrame object

Unnamed: 0,Students,Skills
Java,30,Java
NET,38,C#
UX,28,Figma


## Masking

- uses a boolean Series (a mask) to select the rows where a condition is True

In [20]:
# Display the full DataFrame for reference before filtering it.
df_program

Unnamed: 0,Students,Skills
AI,25,Python
Java,30,Java
NET,38,C#
UX,28,Figma


In [25]:
# Comparing a column with a scalar yields a boolean Series (the mask): one True/False per row.
df_program["Students"] >= 30 #example of masking

AI      False
Java     True
NET      True
UX      False
Name: Students, dtype: bool

In [26]:
# Build the boolean mask first, then index the DataFrame with it to keep
# only the rows where the mask is True.
has_30_or_more = df_program["Students"] >= 30
df_program[has_30_or_more]

Unnamed: 0,Students,Skills
Java,30,Java
NET,38,C#


## Excel data




In [28]:
import matplotlib.pyplot as plt
import seaborn as sns 

# Load the calorie table from the Excel workbook; the path is relative,
# so it is resolved against the current working directory.
calories_path = "../Data/calories.xlsx"
df = pd.read_excel(calories_path)

# Preview the first rows (five by default).
df.head()

Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ
