# Lecture - Pandas basics

- pandas.Series
- pandas.DataFrame
- read_csv
- indexing
- plotting

## Pandas Series

- can create from dictionary
- can create from list
- can create from np.array

In [62]:
import pandas as pd

# Number of students per program, keyed by program name
programs_dict = {"AI": 26, "Net": 38, "Java": 30, "UX": 28}

# A dict becomes a Series: the keys form the index, the dict values form the data
programs_series = pd.Series(data=programs_dict)

programs_series

AI      26
Net     38
Java    30
UX      28
dtype: int64

In [63]:
# extract values by position - use .iloc for this: on a Series with string labels,
# plain [] with an integer key is deprecated positional access in recent pandas
print(f"{programs_series.iloc[0] = }")
print(f"{programs_series.iloc[1] = }")

# extract a value by its label
print(f"{programs_series['UX'] = }")

# get keys - keys() returns the index of the Series
print(f"{programs_series.keys() =}")
print(f"{programs_series.keys()[0] =}")


programs_series[0] = 26
programs_series[1] = 38
programs_series['UX'] = 28
programs_series.keys() =Index(['AI', 'Net', 'Java', 'UX'], dtype='object')
programs_series.keys()[0] ='AI'


In [64]:
import random as rnd

# for reproducibility - seeding gives the same set of values on every run
rnd.seed(1337)

# simulate ten rolls of a six-sided die (randint includes both end points)
dice_series = pd.Series([rnd.randint(1, 6) for _ in range(10)])

# only the last expression of a cell is displayed; head() shows the first five rows
dice_series.head()

0    5
1    5
2    6
3    3
4    5
dtype: int64

In [65]:
# Display all ten rolls (head() in the previous cell only showed the first five)
dice_series

0    5
1    5
2    6
3    3
4    5
5    5
6    6
7    2
8    3
9    4
dtype: int64

In [66]:
# Summary statistics of the ten simulated dice rolls
print(f"{dice_series.min() = }")
print(f"{dice_series.argmin() = }") # integer position of the smallest value (equals the label here because the index is 0, 1, 2, ...)
print(f"{dice_series.max() = }")
print(f"{dice_series.mean() = }") # arithmetic average
print(f"{dice_series.median() = }") # middle value of the sorted data; with an even count, the average of the two middle values

dice_series.min() = 2
dice_series.argmin() = 7
dice_series.max() = 6
dice_series.mean() = 4.4
dice_series.median() = 5.0


---
## DataFrame

- tabular data with rows and columns
- analogous to a 2D NumPy array, but with flexible row indices and column names
- can be seen as a "specialized" dictionary that maps each column name to a Series object

In [67]:
# Turn the Series into a one-column DataFrame; the Series index becomes the row index
df_programs = programs_series.to_frame(name="Number_of_students")
df_programs

Unnamed: 0,Number_of_students
AI,26
Net,38
Java,30
UX,28


In [68]:
# Both Series share the same program labels, which become the DataFrame's row index
program_names = ["AI", "NET", "UX", "Java"]
students = pd.Series([26, 38, 28, 30], index=program_names)
skills = pd.Series(["Python", "C#", "Figma", "Java"], index=program_names)

# Each dict key becomes a column name; the rows are aligned on the shared index
df_programs = pd.DataFrame({"Students": students, "Skills": skills})
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java


In [69]:
# Selecting a single column with [] returns it as a Series named after the column
df_programs["Students"]

AI      26
NET     38
UX      28
Java    30
Name: Students, dtype: int64

In [70]:
# Sanity check: the mean computed by pandas next to the same mean worked out by hand
df_programs["Students"].mean(), (26+30+38+28)/4

(30.5, 30.5)

In [71]:
# Median of the "Students" column; with four programs it is the average of the two middle values
student_counts = df_programs["Students"]
median_student_number = student_counts.median()

# The row labels of the DataFrame are the program names
program_labels = df_programs.index.to_list()
print(f"Median students in the programs {program_labels} is {median_student_number:.0f}")

Median students in the programs ['AI', 'NET', 'UX', 'Java'] is 29


In [72]:
# The "Skills" column holds strings, so its dtype is reported as object in the output
df_programs["Skills"]

AI      Python
NET         C#
UX       Figma
Java      Java
Name: Skills, dtype: object

In [73]:
# Reading single cells of the "Skills" column:
# - by position with .iloc (a plain [0] on a string-labelled Series is deprecated positional access)
# - by row label and column label in one .loc call, which avoids chained indexing
df_programs["Skills"].iloc[0], df_programs.loc["AI", "Skills"], df_programs.loc["UX", "Skills"]

('Python', 'Python', 'Figma')

## Indexers

- loc - slicing and indexing using the explicit index (the row and column labels)
- iloc - slicing and indexing using the implicit, Python-style integer position (0-based, end-exclusive)

In [74]:
# Show the full table again as a reference for the indexing examples that follow
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java


In [75]:
df_programs.loc["AI"] # a single row label returns that row as a Series, indexed by the column names

Students        26
Skills      Python
Name: AI, dtype: object

In [76]:
# Same label-based lookup for another row; the Series is named after the row label
df_programs.loc["Java"]

Students      30
Skills      Java
Name: Java, dtype: object

In [78]:
# .loc takes [row labels, column labels]. "UX" is a row label, not a column name,
# so requesting it as a column raises KeyError. The error is caught and printed so
# this demonstration does not stop a "Run All" of the notebook.
# To select the rows AI and UX instead, use df_programs.loc[["AI", "UX"]].
try:
    df_programs.loc[["AI"], ["UX"]]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "None of [Index(['UX'], dtype='object')] are in the [columns]"

In [None]:
df_programs.iloc[1:3] # positions 1 and 2 (the end of the slice is exclusive); a slice returns a DataFrame object

Unnamed: 0,Students,Skills
NET,38,C#
UX,28,Figma


## Masking

In [None]:
# Show the full table as a reference before filtering it with a mask
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java


In [None]:
# Comparing a column with a value gives a boolean Series (the mask): True where Students >= 30
df_programs["Students"] >= 30

AI      False
NET      True
UX      False
Java     True
Name: Students, dtype: bool

In [None]:
# Build the boolean mask first, then keep only the rows where it is True
at_least_30_students = df_programs["Students"] >= 30
df_programs_over_29 = df_programs.loc[at_least_30_students]
df_programs_over_29

Unnamed: 0,Students,Skills
NET,38,C#
Java,30,Java


In [None]:
# The original DataFrame still has all four rows - the filtered result was stored under a separate name
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java


---
## Excel data

In [81]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the calorie table from the Excel file (path is relative to this notebook)
df = pd.read_excel(io="../Data/calories.xlsx")

# Preview the first five rows; note that the per-100-gram values come in as text with units, e.g. "62 cal"
df.head(n=5)


Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ
