# pandas basics

Pandas basics, nothing fancy, plus a little hedgehog data.

## 1. import pandas library

In [244]:
# pip install pandas

# imports
from pathlib import Path

import pandas as pd

## 2. create simple dataframe

In [246]:
# hedgehog records, one tuple per animal, in the column order listed below
hedgehog_columns = ['name', 'age', 'spikes', 'color', 'weight', 'friendly']
hedgehog_rows = [
    ('Peanut', 1, 3000, 'Brown', 350, True),
    ('Brownie', 4, 3500, 'Brown', 420, True),
    ('Tofu', 8, 4000, 'Albino', 390, False),
    ('Fifi', 10, 6000, 'Black-White', 360, True),
    ('Spike', 2, 3000, 'Grey', 410, True),
]

# turn the rows into a {column name: list of values} dict and build the dataframe from it
data = {
    column: list(values)
    for column, values in zip(hedgehog_columns, zip(*hedgehog_rows))
}
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,spikes,color,weight,friendly
0,Peanut,1,3000,Brown,350,True
1,Brownie,4,3500,Brown,420,True
2,Tofu,8,4000,Albino,390,False
3,Fifi,10,6000,Black-White,360,True
4,Spike,2,3000,Grey,410,True


## 3. save data

In [248]:
# make sure the output folder exists first: pandas does not create missing
# directories and would raise an OSError on a fresh checkout
Path('data').mkdir(parents=True, exist_ok=True)

# save data to csv (index=False leaves the row index out of the file)
df.to_csv('data/hedgehogs.csv', index=False)

# save data to excel
df.to_excel('data/hedgehogs.xlsx', index=False)

# save data to JSON (orient='records' writes a list with one object per row)
df.to_json('data/hedgehogs.json', orient='records')

print('dataframe saved in 3 ways')

dataframe saved in 3 ways


## 4. read data

In [250]:
# load the csv file back into a new dataframe and print it as plain text
csv_path = 'data/hedgehogs.csv'
df_csv = pd.read_csv(csv_path)
print(df_csv, '\n...and others looks the same')

# the excel and JSON files can be read the same way (commented out here):
# df_excel = pd.read_excel('data/hedgehogs.xlsx')
# print(df_excel)
#
# df_json = pd.read_json('data/hedgehogs.json', orient='records')
# print(df_json)

      name  age  spikes        color  weight  friendly
0   Peanut    1    3000        Brown     350      True
1  Brownie    4    3500        Brown     420      True
2     Tofu    8    4000       Albino     390     False
3     Fifi   10    6000  Black-White     360      True
4    Spike    2    3000         Grey     410      True 
...and others looks the same


## 4. basic information about dataframe

In [252]:
# dataframe basic info: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      5 non-null      object
 1   age       5 non-null      int64 
 2   spikes    5 non-null      int64 
 3   color     5 non-null      object
 4   weight    5 non-null      int64 
 5   friendly  5 non-null      bool  
dtypes: bool(1), int64(3), object(2)
memory usage: 337.0+ bytes


In [253]:
# dataframe basic statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,age,spikes,weight
count,5.0,5.0,5.0
mean,5.0,3900.0,386.0
std,3.872983,1244.98996,30.495901
min,1.0,3000.0,350.0
25%,2.0,3000.0,360.0
50%,4.0,3500.0,390.0
75%,8.0,4000.0,410.0
max,10.0,6000.0,420.0


In [254]:
# column names (the column labels of the dataframe)
df.columns

Index(['name', 'age', 'spikes', 'color', 'weight', 'friendly'], dtype='object')

In [255]:
# specific column, selected by its label (the result is a Series)
df['name']

0     Peanut
1    Brownie
2       Tofu
3       Fifi
4      Spike
Name: name, dtype: object

In [256]:
# first row, selected by integer position (0 = first)
df.iloc[0]  

name        Peanut
age              1
spikes        3000
color        Brown
weight         350
friendly      True
Name: 0, dtype: object

In [257]:
# rows with specific data: keep only the rows where the condition is True
# NOTE: the oldest hedgehog is exactly 10, so "> 10" matches nothing and the result is empty
df[df['age'] > 10]

Unnamed: 0,name,age,spikes,color,weight,friendly


In [258]:
# size: total number of cells (rows * columns)
df.size

30

In [259]:
# shape: (number of rows, number of columns)
df.shape

(5, 6)

## 5. adding and deleting data

In [261]:
# adding a new column: assign a list with one value per row (same order as the rows)
df['ears'] = ['big', 'small', 'small', 'big', 'small']
df

Unnamed: 0,name,age,spikes,color,weight,friendly,ears
0,Peanut,1,3000,Brown,350,True,big
1,Brownie,4,3500,Brown,420,True,small
2,Tofu,8,4000,Albino,390,False,small
3,Fifi,10,6000,Black-White,360,True,big
4,Spike,2,3000,Grey,410,True,small


In [262]:
# flag the older hedgehogs: True when age is at least 5, False otherwise
df['old'] = df['age'].ge(5)

In [263]:
# dropping the ears column: drop() returns a new dataframe without the column;
# the result is only displayed, not assigned, so df itself still has 'ears'
df.drop(columns = 'ears')

Unnamed: 0,name,age,spikes,color,weight,friendly,old
0,Peanut,1,3000,Brown,350,True,False
1,Brownie,4,3500,Brown,420,True,False
2,Tofu,8,4000,Albino,390,False,True
3,Fifi,10,6000,Black-White,360,True,True
4,Spike,2,3000,Grey,410,True,False


## 6. grouping and aggregation

In [265]:
# grouping by color: summary statistics of the numeric columns for each color group
df.groupby('color').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,spikes,spikes,spikes,spikes,spikes,weight,weight,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Albino,1.0,8.0,,8.0,8.0,8.0,8.0,8.0,1.0,4000.0,...,4000.0,4000.0,1.0,390.0,,390.0,390.0,390.0,390.0,390.0
Black-White,1.0,10.0,,10.0,10.0,10.0,10.0,10.0,1.0,6000.0,...,6000.0,6000.0,1.0,360.0,,360.0,360.0,360.0,360.0,360.0
Brown,2.0,2.5,2.12132,1.0,1.75,2.5,3.25,4.0,2.0,3250.0,...,3375.0,3500.0,2.0,385.0,49.497475,350.0,367.5,385.0,402.5,420.0
Grey,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,1.0,3000.0,...,3000.0,3000.0,1.0,410.0,,410.0,410.0,410.0,410.0,410.0


In [266]:
# one aggregation per column, computed inside each color group
color_summary_spec = {
    'age': 'mean',      # mean age of the group
    'spikes': 'sum',    # spikes summed over the group
    'weight': 'mean',   # mean weight of the group
    'name': 'count',    # number of names in the group
}
df.groupby('color').agg(color_summary_spec)

Unnamed: 0_level_0,age,spikes,weight,name
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Albino,8.0,4000,390.0,1
Black-White,10.0,6000,360.0,1
Brown,2.5,6500,385.0,2
Grey,2.0,3000,410.0,1


## 7. conditions

In [268]:
# conditions are boolean masks; combine them with & to keep rows where both are True
is_older_than_one = df['age'] > 1
is_brown = df['color'] == 'Brown'
df.loc[is_older_than_one & is_brown]

Unnamed: 0,name,age,spikes,color,weight,friendly,ears,old
1,Brownie,4,3500,Brown,420,True,small,False


## 8. handling of missing data

In [270]:
# info shows the non-null count of every column, so missing values stand out
# as a count lower than the number of rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      5 non-null      object
 1   age       5 non-null      int64 
 2   spikes    5 non-null      int64 
 3   color     5 non-null      object
 4   weight    5 non-null      int64 
 5   friendly  5 non-null      bool  
 6   ears      5 non-null      object
 7   old       5 non-null      bool  
dtypes: bool(2), int64(3), object(3)
memory usage: 382.0+ bytes


In [271]:
# isna marks every cell: True where the value is missing (NaN), False otherwise
df.isna()

Unnamed: 0,name,age,spikes,color,weight,friendly,ears,old
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False


In [272]:
# drop all rows with NaN values: dropna() returns a new dataframe and the result
# is not assigned, so df is left as it is; nothing is missing here, so all rows are kept
df.dropna()

Unnamed: 0,name,age,spikes,color,weight,friendly,ears,old
0,Peanut,1,3000,Brown,350,True,big,False
1,Brownie,4,3500,Brown,420,True,small,False
2,Tofu,8,4000,Albino,390,False,small,True
3,Fifi,10,6000,Black-White,360,True,big,True
4,Spike,2,3000,Grey,410,True,small,False


In [273]:
# if we don't want to drop rows, missing values can be replaced with the column's mean;
# this dataframe has no missing ages, so the column is left unchanged here
agemean = df['age'].mean()

df['age'] = df['age'].fillna(agemean)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      5 non-null      object
 1   age       5 non-null      int64 
 2   spikes    5 non-null      int64 
 3   color     5 non-null      object
 4   weight    5 non-null      int64 
 5   friendly  5 non-null      bool  
 6   ears      5 non-null      object
 7   old       5 non-null      bool  
dtypes: bool(2), int64(3), object(3)
memory usage: 382.0+ bytes


In [300]:
# finally, swap rows and columns of the first rows (.T is shorthand for transpose())
df.head().T

Unnamed: 0,0,1,2,3,4
name,Peanut,Brownie,Tofu,Fifi,Spike
age,1,4,8,10,2
spikes,3000,3500,4000,6000,3000
color,Brown,Brown,Albino,Black-White,Grey
weight,350,420,390,360,410
friendly,True,True,False,True,True
ears,big,small,small,big,small
old,False,False,True,True,False


## 9. summary

In this notebook, we have covered the basics of the pandas library:  
- creating DataFrame objects 
- reading and saving data 
- basic operations 
- data manipulation 
- grouping and aggregation 
- filtering with conditions 
- handling missing data 