# Lecture - Pandas basics

- pandas.Series
- pandas.DataFrame
- read_csv
- indexing
- plotting

## Pandas Series

- can create from dictionary
- can create from list
- can create from np.array

In [20]:
import pandas as pd

# Number of students per programme, keyed by programme name.
programs_dict = {"AI": 26, "NET": 38, "Java": 30, "UX": 28}

# Building a Series from a dict: the keys become the index labels,
# the values become the data.
programs_series = pd.Series(data=programs_dict)

programs_series


AI      26
NET     38
Java    30
UX      28
dtype: int64

In [21]:
# Positional access: go through .iloc, because the index holds string labels.
# A plain programs_series[0] only works through a fallback that treats the
# integer key as a position; pandas has deprecated that fallback for
# label-indexed Series, so it warns (and is slated to stop working).
print(f"{programs_series.iloc[0] = }")
print(f"{programs_series.iloc[-1] = }")

# Label-based access: plain [] looks the key up in the index.
print(f"{programs_series['UX'] = }")

# keys() returns the index; an Index supports positional [] directly.
print(f"{programs_series.keys() = }")
print(f"{programs_series.keys()[0] = }")


programs_series[0] = 26
programs_series[-1] = 28
programs_series['UX'] = 28
programs_series.keys() = Index(['AI', 'NET', 'Java', 'UX'], dtype='object')
programs_series.keys()[0] = 'AI'


In [22]:
import random as rnd

# Fix the seed so the same sequence of rolls is produced on every run.
rnd.seed(1337)

# Simulate ten rolls of a six-sided die, then build a Series from the list.
# A list has no labels, so the Series gets the default 0..n-1 integer index.
dice_rolls = [rnd.randint(1, 6) for _ in range(10)]
dice_series = pd.Series(dice_rolls)

# head() inspects only the first rows (five by default).
dice_series.head()


0    5
1    5
2    6
3    3
4    5
dtype: int64

In [23]:
# Display the whole Series (all ten rolls), not just the first five from head().
dice_series

0    5
1    5
2    6
3    3
4    5
5    5
6    6
7    2
8    3
9    4
dtype: int64

In [24]:
# Smallest roll; argmin() gives the integer position where it occurs.
smallest_roll = dice_series.min()
smallest_position = dice_series.argmin()
print(f"smallest value: {smallest_roll} in spot: {smallest_position}")

# Largest roll; argmax() gives the integer position where it occurs.
largest_roll = dice_series.max()
largest_position = dice_series.argmax()
print(f"largest value: {largest_roll} in spot: {largest_position}")

# Arithmetic mean of all rolls.
average_roll = dice_series.mean()
print(f"average value: {average_roll}")

# Median: sort the values and take the middle one; with an even number of
# values it is the average of the two middle values.
median_roll = dice_series.median()
print(f"median value: {median_roll}")


smallest value: 2 in spot: 7
largest value: 6 in spot: 2
average value: 4.4
median value: 5.0


---
## DataFrame

- tabular data with rows and columns
- analogous to a 2D NumPy array, but with flexible row indices and column names
- "specialized" dictionary with column name mapped to a Series object

In [25]:
# Turn the Series into a one-column DataFrame: the Series index becomes the
# row index and the given name becomes the column label.
df_programs_example = programs_series.to_frame(name="Number_of_students")
df_programs_example

Unnamed: 0,Number_of_students
AI,26
NET,38
Java,30
UX,28


In [26]:
# Two Series that use the same programme names as index labels.
students = pd.Series(
    {"AI": 26, "NET": 38, "UX": 28, "Java": 30},
)
skills = pd.Series(
    {"AI": "Python", "NET": "C#", "UX": "Figma", "Java": "Java"},
)

# Each dict key becomes a column name; each Series supplies that column's values.
program_columns = {"Students": students, "Skills": skills}
df_programs = pd.DataFrame(program_columns)

df_programs


Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java


In [27]:
# Dictionary-style "indexing" with a column name returns that column as a Series.
df_programs["Students"]

AI      26
NET     38
UX      28
Java    30
Name: Students, dtype: int64

In [28]:
# Sanity check: pandas' mean next to the same mean computed by hand.
pandas_mean = df_programs["Students"].mean()
manual_mean = (26 + 38 + 30 + 28) / 4
pandas_mean, manual_mean

(30.5, 30.5)

In [29]:
# Median of the "Students" column and the programme names it was computed over.
median_student_number = df_programs["Students"].median()
program_names = df_programs.index.to_list()

# :.0f prints the median without decimals.
print(
    f"Median students in the programs {program_names} "
    f"is {median_student_number:.0f}"
)

Median students in the programs ['AI', 'NET', 'UX', 'Java'] is 29


In [30]:
# Left commented out on purpose: `df_programs[]` (empty brackets) is not valid
# Python and produces the SyntaxError shown in the saved output below.

SyntaxError: invalid syntax (207922468.py, line 1)

## Indexers

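* loc - slicing and indexing using the explicit index (the row/column labels); label slices include the end label
* iloc - slicing and indexing using implicit integer positions (Python-style: 0-based, end-exclusive)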


In [None]:
# Show the full frame before demonstrating loc / iloc.
# NOTE(review): the saved output below is stale - it contains a lowercase "net"
# row and NaN values that df_programs as built above (AI, NET, UX, Java) does
# not have. Re-run the notebook from a fresh kernel to refresh it.
df_programs

Unnamed: 0,Students,Skills
AI,26.0,Python
Java,30.0,Java
NET,38.0,
UX,28.0,Figma
net,,C#


In [33]:
# .loc with a single index label returns that row as a Series
# (the column names become the Series index).
df_programs.loc["AI"]

Students        26
Skills      Python
Name: AI, dtype: object

In [45]:
# .loc with a list of labels returns a DataFrame containing just those rows.
df_programs.loc[["AI", "UX"]]

Unnamed: 0,Students,Skills
AI,26,Python
UX,28,Figma


In [34]:
# .iloc uses Python-style integer positions: 0-based and end-exclusive, so 1:3
# selects the rows at positions 1 and 2.
# NOTE(review): the saved output below lists three rows (NET, UX, Java), which
# does not match an end-exclusive 1:3 slice - it looks stale; re-run to refresh.
df_programs.iloc[1:3]

Unnamed: 0,Students,Skills
NET,38,C#
UX,28,Figma
Java,30,Java


## Masking

In [35]:
# Show the full frame again as the starting point for masking.
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java


In [46]:
# Common mistake, kept as a demonstration: here the comparison is applied to
# the *string* "Students" instead of to the column df_programs["Students"],
# so Python raises a TypeError before pandas is involved at all.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    df_programs["Students" >= 30]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: '>=' not supported between instances of 'str' and 'int'

In [39]:
# Boolean mask: one True/False per row, True where the programme has 30+ students.
has_30_or_more = df_programs["Students"] >= 30

# Indexing the frame with the mask keeps only the rows where it is True.
df_programs_over_29 = df_programs[has_30_or_more]
df_programs_over_29

Unnamed: 0,Students,Skills
NET,38,C#
Java,30,Java


In [40]:
# The original frame still has all four rows: masking produced a new, filtered
# frame (df_programs_over_29) and left df_programs untouched.
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java
