# Pandas Tutorial

In [1]:
import pandas as pd

In [3]:
# check pandas version
print(pd.__version__)

1.4.2


# Series create, manipulate, query, delete

# creating series from list


In [7]:
arr= [0,1,2,3,4]
s1= pd.Series(arr)
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [8]:
order = [1,2,3,4,5]
s2= pd.Series(arr, index=order)
s2

1    0
2    1
3    2
4    3
5    4
dtype: int64

In [10]:
import numpy as np
n = np.random.randn(5) # create a random Ndarray
index = ['a', 'b', 'c', 'd', 'e']
s2= pd.Series(n, index=index)
s2

a   -1.614596
b    1.457675
c   -0.736108
d   -0.645941
e   -0.874805
dtype: float64

# create a series from dictionary

In [11]:
d= {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
s3= pd.Series(d)
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

# modify index of series

In [15]:
print(s1)
s1.index= ['A', 'B', 'C', 'D', 'E']
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64


A    0
B    1
C    2
D    3
E    4
dtype: int64

# Slicing

In [16]:
a= s1[:3]
a

A    0
B    1
C    2
dtype: int64

In [17]:
s1[:3]

A    0
B    1
C    2
dtype: int64

In [18]:
s1[-1:]

E    4
dtype: int64

In [19]:
s1[-2:]

D    3
E    4
dtype: int64

In [20]:
s1[:-2]

A    0
B    1
C    2
dtype: int64

# append

In [24]:
s4= s1.append(s3)
s4

  s4= s1.append(s3)


A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64

In [27]:
conc = s1 + s3
s4= conc
print(s4)

A   NaN
B   NaN
C   NaN
D   NaN
E   NaN
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
dtype: float64


In [25]:
s4.drop('e')

A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
dtype: int64

# Series Operations

In [31]:
arr1= [0,1,2,3,4,5,7]
arr2= [6,7,8,9,5]

In [32]:
s5= pd.Series(arr2)
s5

0    6
1    7
2    8
3    9
4    5
dtype: int64

In [33]:
s6= pd.Series(arr1)
s6

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

In [34]:
s5.add(s6)

0     6.0
1     8.0
2    10.0
3    12.0
4     9.0
5     NaN
6     NaN
dtype: float64

In [35]:
s5.sub(s6)

0    6.0
1    6.0
2    6.0
3    6.0
4    1.0
5    NaN
6    NaN
dtype: float64

In [36]:
s5.mul(s6)

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

In [42]:
s7= s5.mul(s6)
s7

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

In [37]:
s5.div(s6)

0     inf
1    7.00
2    4.00
3    3.00
4    1.25
5     NaN
6     NaN
dtype: float64

In [38]:
s6.median()

3.0

In [43]:
print('median', s6.median())
print('max', s6.max())
print('min', s6.min())
print('mean', s6.mean())

median 3.0
max 7
min 0
mean 3.142857142857143


In [44]:
print('median', s7.median())
print('max', s7.max())
print('min', s7.min())
print('mean', s7.mean())

median 16.0
max 27.0
min 0.0
mean 14.0


# Create a DataFrame

In [45]:
dates= pd.date_range('today', periods=6) # Define time sequence as index
dates

DatetimeIndex(['2022-07-18 17:05:29.052280', '2022-07-19 17:05:29.052280',
               '2022-07-20 17:05:29.052280', '2022-07-21 17:05:29.052280',
               '2022-07-22 17:05:29.052280', '2022-07-23 17:05:29.052280'],
              dtype='datetime64[ns]', freq='D')

In [46]:
dates= pd.date_range('today', periods=6) # Define time sequence as index
num_arr= np.random.randn(6,4) # import numpy random array
num_arr

array([[ 1.97318255, -2.22864864, -0.37174908, -0.22795732],
       [-0.79503005, -0.77980433, -1.27081562,  0.08318943],
       [ 0.02137591, -1.93481991, -2.14285742, -0.33803443],
       [-0.07750342, -0.75619944, -0.96907712, -1.1199742 ],
       [ 1.39649526,  0.3576856 , -0.83819276,  0.57292513],
       [-1.77949371,  1.56059217, -0.17535965, -0.24757401]])

In [47]:
dates= pd.date_range('today', periods=6) # Define time sequence as index
num_arr= np.random.randn(6,4) # import numpy random array
columns= ['A', 'B', 'C', 'D'] # Use the table as the column name

df1= pd.DataFrame(num_arr, index=dates, columns= columns)
df1

Unnamed: 0,A,B,C,D
2022-07-18 17:10:17.204565,-2.506841,-0.570637,-0.550342,-0.033707
2022-07-19 17:10:17.204565,-2.227835,0.365916,-0.235825,0.188487
2022-07-20 17:10:17.204565,1.089369,-0.422498,-1.214198,1.41233
2022-07-21 17:10:17.204565,-1.716967,-0.730973,0.374758,-0.449474
2022-07-22 17:10:17.204565,2.334039,0.266783,0.727128,-0.48926
2022-07-23 17:10:17.204565,-0.674848,-1.247275,-0.506664,0.435208


In [53]:
# create a dataframe with dictionary array

data= {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
      'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7,3],
      'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
      'priority':['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no','no']}
labels= ['a', 'b', 'c', 'd', 'e', 'f', 'g',  'h', 'i', 'j']

df2= pd.DataFrame(data, index=labels)
df2

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [54]:
# see datatypes
df2.dtypes

animal       object
age         float64
visits        int64
priority     object
dtype: object

In [55]:
df2.head()

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no


In [56]:
df2.head(6)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no


In [58]:
df2.tail()

Unnamed: 0,animal,age,visits,priority
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [59]:
df2.tail(3)

Unnamed: 0,animal,age,visits,priority
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [61]:
print(df2.index)
df2.columns

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')


Index(['animal', 'age', 'visits', 'priority'], dtype='object')

In [62]:
df2.values

array([['cat', 2.5, 1, 'yes'],
       ['cat', 3.0, 3, 'yes'],
       ['snake', 0.5, 2, 'no'],
       ['dog', nan, 3, 'yes'],
       ['dog', 5.0, 2, 'no'],
       ['cat', 2.0, 3, 'no'],
       ['snake', 4.5, 1, 'no'],
       ['cat', nan, 1, 'yes'],
       ['dog', 7.0, 2, 'no'],
       ['dog', 3.0, 1, 'no']], dtype=object)

# see statistical data of dataframe

In [69]:
df2= pd.DataFrame(data, index=labels)
df2

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [63]:
df2.describe()

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [64]:
# Transpose
df2.T

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
animal,cat,cat,snake,dog,dog,cat,snake,cat,dog,dog
age,2.5,3.0,0.5,,5.0,2.0,4.5,,7.0,3.0
visits,1,3,2,3,2,3,1,1,2,1
priority,yes,yes,no,yes,no,no,no,yes,no,no


In [65]:
df2.sort_values(by='age')

Unnamed: 0,animal,age,visits,priority
c,snake,0.5,2,no
f,cat,2.0,3,no
a,cat,2.5,1,yes
b,cat,3.0,3,yes
j,dog,3.0,1,no
g,snake,4.5,1,no
e,dog,5.0,2,no
i,dog,7.0,2,no
d,dog,,3,yes
h,cat,,1,yes


In [67]:
# sclice dataframe
df2[1:3]

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [70]:
df2.sort_values(by='age')[1:3]

Unnamed: 0,animal,age,visits,priority
f,cat,2.0,3,no
a,cat,2.5,1,yes


In [71]:
# query dataframe by tag
df2[['age', 'visits']]

Unnamed: 0,age,visits
a,2.5,1
b,3.0,3
c,0.5,2
d,,3
e,5.0,2
f,2.0,3
g,4.5,1
h,,1
i,7.0,2
j,3.0,1


In [72]:
# Query rows 2,3
df2.iloc[1:3]

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [75]:
df3= df2.copy()
df3

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [76]:
df3.isnull()

Unnamed: 0,animal,age,visits,priority
a,False,False,False,False
b,False,False,False,False
c,False,False,False,False
d,False,True,False,False
e,False,False,False,False
f,False,False,False,False
g,False,False,False,False
h,False,True,False,False
i,False,False,False,False
j,False,False,False,False


In [77]:
# modify location
df3.loc['f', 'age']=1.5
df3

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [78]:
df3.mean()

  df3.mean()


age       3.375
visits    1.900
dtype: float64

In [79]:
df3[['age', 'visits']].mean()

age       3.375
visits    1.900
dtype: float64

In [80]:
df3['visits'].sum()

19

In [81]:
df3['visits'].max()

3

In [83]:
df3.sum()

animal      catcatsnakedogdogcatsnakecatdogdog
age                                       27.0
visits                                      19
priority              yesyesnoyesnononoyesnono
dtype: object

In [84]:
string= pd.Series(['A', 'C', 'D', 'Aaa', 'BaCa', np.nan, 'CBA', 'cow', 'owl'])
string

0       A
1       C
2       D
3     Aaa
4    BaCa
5     NaN
6     CBA
7     cow
8     owl
dtype: object

In [85]:
string= pd.Series(['A', 'C', 'D', 'Aaa', 'BaCa', np.nan, 'CBA', 'cow', 'owl'])
string.str.lower()

0       a
1       c
2       d
3     aaa
4    baca
5     NaN
6     cba
7     cow
8     owl
dtype: object

In [86]:
string= pd.Series(['A', 'C', 'D', 'Aaa', 'BaCa', np.nan, 'CBA', 'cow', 'owl'])
string.str.upper()

0       A
1       C
2       D
3     AAA
4    BACA
5     NaN
6     CBA
7     COW
8     OWL
dtype: object

# Operations for DataFrame missing values

In [88]:
df4= df3.copy()
df4

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [90]:
df4.fillna(4)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,4.0,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,4.0,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [91]:
meanAge= df4['age'].mean()
df4['age'].fillna(meanAge)

a    2.500
b    3.000
c    0.500
d    3.375
e    5.000
f    1.500
g    4.500
h    3.375
i    7.000
j    3.000
Name: age, dtype: float64

In [93]:
df5= df3.copy()
df5

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [94]:
df5.dropna(how='any')

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
e,dog,5.0,2,no
f,cat,1.5,3,no
g,snake,4.5,1,no
i,dog,7.0,2,no
j,dog,3.0,1,no


# DataFrame file operations

In [95]:
df3.to_csv('animal.csv')

In [96]:
df_animal= pd.read_csv('animal.csv')
df_animal.head(3)

Unnamed: 0.1,Unnamed: 0,animal,age,visits,priority
0,a,cat,2.5,1,yes
1,b,cat,3.0,3,yes
2,c,snake,0.5,2,no


In [97]:
df3.to_excel('animal.xlsx', sheet_name='Sheet1')
df_animal2=pd.read_excel('animal.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
df_animal2

Unnamed: 0.1,Unnamed: 0,animal,age,visits,priority
0,a,cat,2.5,1,yes
1,b,cat,3.0,3,yes
2,c,snake,0.5,2,no
3,d,dog,,3,yes
4,e,dog,5.0,2,no
5,f,cat,1.5,3,no
6,g,snake,4.5,1,no
7,h,cat,,1,yes
8,i,dog,7.0,2,no
9,j,dog,3.0,1,no
