Pandas Tutorials

In [94]:
import pandas as pd

In [95]:
print(pd.__version__)

2.2.3


Series Create, Manipulate, Querry, Delete

In [96]:
#creating a series from a list
arr = [0, 1, 2, 3, 4]
s1 = pd.Series(arr)
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [97]:
order = [1, 2, 3, 4, 5]
s2 = pd.Series(arr, index=order)
s2

1    0
2    1
3    2
4    3
5    4
dtype: int64

In [98]:
import numpy as np
n=np.random.randn(5)
index = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(n, index=index)
s2

a    1.245672
b    1.006898
c    0.041202
d   -1.454457
e    0.564311
dtype: float64

In [99]:
#creating a series from dictionary
d = {'a':1,'b':2,'c':3,'d':4,'e':5}
s3 = pd.Series(d)
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [100]:
#modify the index
print(s1)
s1.index = ['A', 'B', 'C', 'D', 'E']
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64


A    0
B    1
C    2
D    3
E    4
dtype: int64

In [101]:
#slicing
s1[:3]

A    0
B    1
C    2
dtype: int64

In [102]:
#no append

In [103]:
#drop

Operations

In [104]:
arr1 = [0, 1, 2, 3, 4, 5, 7]
arr2 = [6, 7, 8, 9, 5]

In [105]:
s4 = pd.Series(arr2)
s4

0    6
1    7
2    8
3    9
4    5
dtype: int64

In [106]:
s5 = pd.Series(arr1)
s5

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

In [107]:
s4.add(s5)

0     6.0
1     8.0
2    10.0
3    12.0
4     9.0
5     NaN
6     NaN
dtype: float64

In [108]:
s4.sub(s5)

0    6.0
1    6.0
2    6.0
3    6.0
4    1.0
5    NaN
6    NaN
dtype: float64

In [109]:
s4.mul(s5)

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

In [110]:
s4.div(s5)

0     inf
1    7.00
2    4.00
3    3.00
4    1.25
5     NaN
6     NaN
dtype: float64

In [111]:
print('median', s5.median())
print('max', s5.max())
print('min', s5.min())

median 3.0
max 7
min 0


Create Dataframe

In [112]:
dates = pd.date_range('today', periods=6) #define time sequence as index
num_arr = np.random.randn(6,4) #import numpy random array (rows,columns)
columns = ['A', 'B', 'C', 'D'] #use the table as the column name

df1 = pd.DataFrame(num_arr, index = dates, columns = columns)
df1

Unnamed: 0,A,B,C,D
2024-12-06 16:03:15.003083,-0.349924,0.763562,-0.866598,0.879967
2024-12-07 16:03:15.003083,-0.201652,-0.708583,0.899596,0.629232
2024-12-08 16:03:15.003083,0.42372,-0.995813,0.913386,0.418397
2024-12-09 16:03:15.003083,0.183523,0.029774,2.285969,1.550082
2024-12-10 16:03:15.003083,2.040157,0.075114,0.586959,-0.077486
2024-12-11 16:03:15.003083,-0.897894,-0.177872,-1.459364,0.605844


In [113]:
#create dataframe with dictionary array

data = {'animals': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7,3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

df2 = pd.DataFrame(data, index = labels)
df2

Unnamed: 0,animals,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [114]:
#see datatypes of array
df2.dtypes

animals      object
age         float64
visits        int64
priority     object
dtype: object

In [115]:
df3 = df2.head(6)
df3

Unnamed: 0,animals,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no


In [116]:
df2.tail(3)

Unnamed: 0,animals,age,visits,priority
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [117]:
print(df2.index)
df2.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')


Index(['animals', 'age', 'visits', 'priority'], dtype='object')

In [118]:
df2.values

array([['cat', 2.5, 1, 'yes'],
       ['cat', 3.0, 3, 'yes'],
       ['snake', 0.5, 2, 'no'],
       ['dog', nan, 3, 'yes'],
       ['dog', 5.0, 2, 'no'],
       ['cat', 2.0, 3, 'no'],
       ['snake', 4.5, 1, 'no'],
       ['cat', nan, 1, 'yes'],
       ['dog', 7.0, 2, 'no'],
       ['dog', 3.0, 1, 'no']], dtype=object)

In [119]:
df2.describe()   #see statistical data of dataframe

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [120]:
df2.T

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
animals,cat,cat,snake,dog,dog,cat,snake,cat,dog,dog
age,2.5,3.0,0.5,,5.0,2.0,4.5,,7.0,3.0
visits,1,3,2,3,2,3,1,1,2,1
priority,yes,yes,no,yes,no,no,no,yes,no,no


In [121]:
df2.sort_values(by = 'age')

Unnamed: 0,animals,age,visits,priority
c,snake,0.5,2,no
f,cat,2.0,3,no
a,cat,2.5,1,yes
b,cat,3.0,3,yes
j,dog,3.0,1,no
g,snake,4.5,1,no
e,dog,5.0,2,no
i,dog,7.0,2,no
d,dog,,3,yes
h,cat,,1,yes


In [125]:
#slicing dataframe
df2[1:3]

Unnamed: 0,animals,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [127]:
#query dataframe by tag
df2[['age', 'visits']]

Unnamed: 0,age,visits
a,2.5,1
b,3.0,3
c,0.5,2
d,,3
e,5.0,2
f,2.0,3
g,4.5,1
h,,1
i,7.0,2
j,3.0,1


In [129]:
df2.iloc[1:3]   #query rows 2:3

Unnamed: 0,animals,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [131]:
df3 = df2.copy()
df3

Unnamed: 0,animals,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [132]:
df3.isnull()

Unnamed: 0,animals,age,visits,priority
a,False,False,False,False
b,False,False,False,False
c,False,False,False,False
d,False,True,False,False
e,False,False,False,False
f,False,False,False,False
g,False,False,False,False
h,False,True,False,False
i,False,False,False,False
j,False,False,False,False


In [133]:
df3.loc['f', 'age'] = 1.5
df3

Unnamed: 0,animals,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [135]:
df3[['age']].mean()

age    3.375
dtype: float64

In [137]:
df3.sum()

animals     catcatsnakedogdogcatsnakecatdogdog
age                                       27.0
visits                                      19
priority              yesyesnoyesnononoyesnono
dtype: object

In [141]:
string = pd.Series(['A', 'C', 'D', 'Aaa', 'BaCa', np.nan, 'CBA', 'cow', 'owl'])
string.str.upper()

0       A
1       C
2       D
3     AAA
4    BACA
5     NaN
6     CBA
7     COW
8     OWL
dtype: object

Operation for DataFrame missing values

In [145]:
df4 = df3.copy()
df4.fillna(4)

Unnamed: 0,animals,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,4.0,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,4.0,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [149]:
df5 = df3.copy()
df5.dropna(how = 'any')

Unnamed: 0,animals,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
i,dog,7.0,2,no
j,dog,3.0,1,no


Dataframe File operations

In [150]:
df3.to_csv('animal.csv')

In [152]:
df_animal = pd.read_csv('animal.csv')
df_animal.head(3)

Unnamed: 0.1,Unnamed: 0,animals,age,visits,priority
0,a,cat,2.5,1,yes
1,b,cat,3.0,3,yes
2,c,snake,0.5,2,no


Visualizaiton

In [160]:
#series and dataframe kine chart
import numpy as np

ts = pd.Series(np.random.randn(50), index = pd.date_range('today', periods = 50))
ts = ts.cumsum()
ts.plot()              #error

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [163]:
df = pd.DataFrame(np.random.randn(50,4), index = ts.index,
                  columns = ['A', 'B', 'X', 'Y'])
df = df.cumsum()
df.plot()             #error

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [165]:
#to remove repeated data, " .shift() != "