In [1]:
# %pip install pandas   # uncomment if pandas is missing; %pip installs into the running kernel's environment

In [2]:
import pandas as pd
import numpy as np
import warnings
# Silences every warning category for the rest of the session. NOTE(review): this
# also hides pandas FutureWarning/DeprecationWarning messages that later cells may raise.
warnings.filterwarnings('ignore')

###### series

In [3]:
# Empty Series. The dtype is given explicitly: leaving it out is deprecated in
# pandas 1.x and produces `object` instead of float64 from pandas 2.0 onwards.
print(pd.Series(dtype='float64'))
print(type(pd.Series(dtype='float64')))

Series([], dtype: float64)
<class 'pandas.core.series.Series'>


In [4]:
# A list of integers becomes an integer Series; the index defaults to 0..n-1.
s = pd.Series(data=[1, 2, 3, 4])
s

0    1
1    2
2    3
3    4
dtype: int64

In [5]:
# Float literals give a float64 Series.
s = pd.Series(data=[1.0, 2.0, 3.0, 4.0])
s

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [6]:
# One string among the floats forces the generic `object` dtype.
s = pd.Series(data=['1.', 2.0, 3.0, 4.0])
s

0    1.
1     2
2     3
3     4
dtype: object

In [7]:
## creating a Series from a list: first build a list of single characters
l = [char for char in "pune"]
l

['p', 'u', 'n', 'e']

In [8]:
# Only the values are supplied, so the index defaults to 0..n-1.
s = pd.Series(l)
s

0    p
1    u
2    n
3    e
dtype: object

In [9]:
# Custom integer labels replace the default 0-based index.
s = pd.Series(l, index=[1, 2, 3, 4])
s

1    p
2    u
3    n
4    e
dtype: object

In [10]:
# Index labels can be strings as well.
s = pd.Series(l, index=['a', 'b', 'c', 'd'])
s

a    p
b    u
c    n
d    e
dtype: object

In [11]:
### Series from an iterable: a range is consumed element by element
s = pd.Series(range(1, 6))
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [12]:
### Series from a NumPy array
# The Series keeps the array's integer dtype (int32 in the recorded output;
# NumPy's default integer width depends on the platform and NumPy version).
s = pd.Series(np.arange(1, 6))
s

0    1
1    2
2    3
3    4
4    5
dtype: int32

In [13]:
### Series from a dictionary: keys become index labels, values become the data
s = pd.Series({'a': 5, 'b': 7, 'c': 90})
s

a     5
b     7
c    90
dtype: int64

In [14]:
# A single float value (90.5) upcasts the whole Series to float64.
s = pd.Series({'a': 5, 'b': 7, 'c': 90.5})
s

a     5.0
b     7.0
c    90.5
dtype: float64

In [15]:
### Series from a dictionary whose values are lists and scalars
# Each list is stored as a single element, so the dtype is `object`.
s = pd.Series({'a': [5, 55], 'b': [7, 77], 'c': 90})
s

a    [5, 55]
b    [7, 77]
c         90
dtype: object

In [16]:
### Series from a dictionary mixing lists of different lengths and a string
# Values are stored as-is (one Python object per label), so lengths need not match.
s = pd.Series({'a': [5], 'b': [7, 77], 'c': '90'})
s

a        [5]
b    [7, 77]
c         90
dtype: object

In [17]:
### Series from a dictionary with a repeated key
# The key 'a' appears twice in the literal. Python keeps only the last value ([55])
# at the key's first position, so the resulting Series has three entries, not four.
s = pd.Series(data = {'a':[5], 'b':[7,77], 'c':'90', 'a':[55]})
s

a       [55]
b    [7, 77]
c         90
dtype: object

In [18]:
### Series from a scalar: without an index the result has a single element
s = pd.Series(5)
s

0    5
dtype: int64

In [19]:
### Series from a scalar plus an index: the scalar is repeated for every label
# Note: the labels here are the strings '0'..'4', not integers.
s = pd.Series(5, index=['0', '1', '2', '3', '4'])
s

0    5
1    5
2    5
3    5
4    5
dtype: int64

###### Access elements of a series

In [20]:
### Series holding 1..10 with the default 0..9 index
s = pd.Series(range(1, 11))
s

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

In [21]:
## fetch 1st element (by position)
print(s.iloc[0])

# fetch last element
# On this integer index, s[-1] would be looked up as a label and fail, and s[9]
# hard-codes the length; .iloc[-1] always returns the final element.
print(s.iloc[-1])

### fetch 4,5,6 (positional slices: the stop position is excluded)
print(s.iloc[3:6])
print(s.iloc[-7:-4])

1
10
3    4
4    5
5    6
dtype: int64
3    4
4    5
5    6
dtype: int64


In [22]:
### Series holding 1..10, labelled 'a'..'j'
s = pd.Series(range(1, 11), index=list('abcdefghij'))
s

a     1
b     2
c     3
d     4
e     5
f     6
g     7
h     8
i     9
j    10
dtype: int64

In [23]:
## fetch 1st element
# On a label-indexed Series, s[0] only works through a deprecated positional
# fallback (FutureWarning in recent pandas); .iloc states the intent explicitly.
print(s.iloc[0])   # by position
print(s['a'])      # by label

1
1


In [24]:
# fetch last element
# .iloc[-1] replaces s[9]: no hard-coded length, and no reliance on the deprecated
# treatment of integer keys as positions on a label-indexed Series.
print(s.iloc[-1])  # by position
print(s['j'])      # by label

10
10


In [25]:
### fetch 4,5,6
print(s.iloc[3:6])      # positional slice: the stop position is exclusive
print(s.iloc[-7:-4])    # negative positions: the stop is exclusive here too
print(s.loc['d':'f'])   # label slice: the stop label is inclusive

d    4
e    5
f    6
dtype: int64
d    4
e    5
f    6
dtype: int64
d    4
e    5
f    6
dtype: int64


###### operations on series

In [26]:
# Two equally long Series sharing the default 0..9 index: 1..10 and 11..20.
s1 = pd.Series(range(1, 11))
s2 = pd.Series(range(11, 21))

In [27]:
# Element-wise product: values that share an index label are multiplied together.
s1*s2

0     11
1     24
2     39
3     56
4     75
5     96
6    119
7    144
8    171
9    200
dtype: int64

In [28]:
# Element-wise sum: values that share an index label are added together.
s1+s2

0    12
1    14
2    16
3    18
4    20
5    22
6    24
7    26
8    28
9    30
dtype: int64

In [29]:
# Broadcasting: the scalar 2 is added to every element of s1.
s1+2

0     3
1     4
2     5
3     6
4     7
5     8
6     9
7    10
8    11
9    12
dtype: int64

In [30]:
# other Series methods to try: unique(), nunique(), sort_values(), mean(), max(), std()
# s1.kurtosis() 

###### DataFrame

In [31]:
# An empty DataFrame has neither columns nor index entries; show its repr and its type.
print(pd.DataFrame(), type(pd.DataFrame()), sep='\n')

Empty DataFrame
Columns: []
Index: []
<class 'pandas.core.frame.DataFrame'>


In [32]:
# A flat list yields one column; without `columns` the column label defaults to 0.
df = pd.DataFrame([1, 2, 3, 4, 5])
df

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [33]:
# Same single-column frame, this time with an explicit column name.
df = pd.DataFrame([1, 2, 3, 4, 5], columns=['col1'])
df

Unnamed: 0,col1
0,1
1,2
2,3
3,4
4,5


In [34]:
# Nested list: each inner list is one row; columns default to 0, 1, ...
df = pd.DataFrame([
    [1, 2],
    [2, 3],
    [4, 3],
    [4, 5],
])
df

Unnamed: 0,0,1
0,1,2
1,2,3
2,4,3
3,4,5


In [35]:
# Rows as a nested list with named columns
# (alternative kept for reference: np.array([[1,2],[2,3],[4,3],[4,5]])).
df = pd.DataFrame(
    [[1, 2], [2, 3], [4, 3], [4, 5]],
    columns=['col1', 'col2'],
)
df

Unnamed: 0,col1,col2
0,1,2
1,2,3
2,4,3
3,4,5


In [36]:
# Named columns plus custom row labels 'a'..'d'.
df = pd.DataFrame(
    [[1, 2], [2, 3], [4, 3], [4, 5]],
    columns=['col1', 'col2'],
    index=['a', 'b', 'c', 'd'],
)
df

Unnamed: 0,col1,col2
a,1,2
b,2,3
c,4,3
d,4,5


In [37]:
## baseline: nested list whose rows all have the same length (three values each)
df = pd.DataFrame(
    [[2, 4, 6], [4, 2, 3], [4, 3, 5], [4, 5, 7]],
    columns=['col1', 'col2', 'col3'],
)
df

Unnamed: 0,col1,col2,col3
0,2,4,6
1,4,2,3
2,4,3,5
3,4,5,7


In [38]:
# Rows of different length: the last row has only two values, so its col3 is missing.
# NaN  => float marker for a missing value
# None => object marker for a missing value
df = pd.DataFrame(
    [[2, 4.9, '6'], [4, 2, 3], [4, 3, 5], [4, 5]],
    columns=['col1', 'col2', 'col3'],
)
df

Unnamed: 0,col1,col2,col3
0,2,4.9,6.0
1,4,2.0,3.0
2,4,3.0,5.0
3,4,5.0,


In [39]:
# Mixed types inside columns (a string in col1, a float in col2) plus a short last row.
df = pd.DataFrame(
    [[2, 4, 6], ['4', 2.9, 3], [4, 3, 5], [4, 5]],
    columns=['col1', 'col2', 'col3'],
)
df

Unnamed: 0,col1,col2,col3
0,2,4.0,6.0
1,4,2.9,3.0
2,4,3.0,5.0
3,4,5.0,


In [40]:
### DataFrame from a dictionary of equal-length lists (one key per column)
# np.nan marks the missing entries.
df = pd.DataFrame({
    'name': ['A', 'B', 'C', 'D'],
    "age": [50, 34, 23, np.nan],
    "salary": [233, 45, 566, np.nan],
    "extra_info": [True, False, np.nan, True],
})
df

Unnamed: 0,name,age,salary,extra_info
0,A,50.0,233.0,True
1,B,34.0,45.0,False
2,C,23.0,566.0,
3,D,,,True


In [41]:
### DataFrame from a 1-D NumPy array: a single column labelled 0
df = pd.DataFrame(np.array([1, 2, 3, 4]))
df

Unnamed: 0,0
0,1
1,2
2,3
3,4


In [42]:
### DataFrame from a Series, with the column name supplied via `columns`
df = pd.DataFrame(pd.Series([1, 2, 3, 4]), columns=['col1'])
df

Unnamed: 0,col1
0,1
1,2
2,3
3,4


In [43]:
## DataFrame from a list of dictionaries (one dictionary per row)
# Keys become columns; a key missing from a row leaves NaN in that cell.
df = pd.DataFrame([
    {'a': 4, "b": 40, "c": 50},
    {'a': 4, "c": 50},
    {"b": 40, "c": 50},
])
df

Unnamed: 0,a,b,c
0,4.0,40.0,50
1,4.0,,50
2,,40.0,50


In [44]:
## list with a single dictionary whose 'a' value is itself a list
# The list [4, 2] is stored as one cell value rather than being expanded into rows.
df = pd.DataFrame([{'a': [4, 2], "b": 40, "c": 50}])
df

Unnamed: 0,a,b,c
0,"[4, 2]",40,50


In [45]:
## list of dictionaries (same construction as the preceding cell)
# A list-valued entry stays a single cell: the frame has one row and three columns.
df = pd.DataFrame([{'a': [4, 2], "b": 40, "c": 50}])
df

Unnamed: 0,a,b,c
0,"[4, 2]",40,50


In [46]:
## reading data from external sources
# pd.read_*  (e.g. pd.read_csv, pd.read_excel, pd.read_json)

### writing data back to external sources
# df.to_*  (e.g. df.to_csv, df.to_excel, df.to_json)

In [47]:
#### working frame for the column examples below: rows given as dictionaries
# with differing keys, so every column ends up with at least one NaN
df = pd.DataFrame([
    {'a': 4, "b": 40, "c": 50},
    {'a': 4, "c": 50},
    {"b": 40, "c": 50},
    {'a': 50, "d": 50},
])
df

Unnamed: 0,a,b,c,d
0,4.0,40.0,50.0,
1,4.0,,50.0,
2,,40.0,50.0,
3,50.0,,,50.0


In [48]:
#### extract a single column with attribute access
# df.a returns the column as a Series. Attribute access only works when the column
# name is a valid Python identifier that does not collide with a DataFrame attribute.
print(type(df.a))
df.a

<class 'pandas.core.series.Series'>


0     4.0
1     4.0
2     NaN
3    50.0
Name: a, dtype: float64

In [49]:
df['a'] # a single label inside [] returns the column as a Series

0     4.0
1     4.0
2     NaN
3    50.0
Name: a, dtype: float64

In [50]:
df[['a']] # a list of labels inside [] returns a DataFrame, even for a single column

Unnamed: 0,a
0,4.0
1,4.0
2,
3,50.0


In [51]:
#### extract multiple columns: pass a list of column labels; the result is a DataFrame
df[['a','b']]

Unnamed: 0,a,b
0,4.0,40.0
1,4.0,
2,,40.0
3,50.0,


###### add a new column to existing dataframe

In [52]:
# syntax
# df['<col_name>'] = init

# assign the column 'd'
# NOTE(review): 'd' already exists in df (it came from the {'a':50,"d":50} row), so this
# assignment overwrites its previous values instead of adding a new column.
df['d'] = 0 # broadcasting: the scalar is written to every row
df

Unnamed: 0,a,b,c,d
0,4.0,40.0,50.0,0
1,4.0,,50.0,0
2,,40.0,50.0,0
3,50.0,,,0


In [53]:
# df/3
## store the difference between columns 'a' and 'b' in a new column 'diff'
# Rows where either operand is NaN give NaN. Because 'diff' is also the name of a
# DataFrame method, this column has to be read back as df['diff'], not df.diff.
df['diff'] = df.a - df.b
df

Unnamed: 0,a,b,c,d,diff
0,4.0,40.0,50.0,0,-36.0
1,4.0,,50.0,0,
2,,40.0,50.0,0,
3,50.0,,,0,


In [54]:
# Assigning to an existing column replaces it: every row of 'diff' becomes 6.
df['diff'] = 6
df

Unnamed: 0,a,b,c,d,diff
0,4.0,40.0,50.0,0,6
1,4.0,,50.0,0,6
2,,40.0,50.0,0,6
3,50.0,,,0,6


In [55]:
## adding a new row
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. The new row is wrapped in a one-row DataFrame, and
# ignore_index=True renumbers the result 0..n-1. The row has no 'diff' key, so that
# cell is NaN in the result.
df = pd.concat(
    [df, pd.DataFrame([{'a': 500, "b": 300, "c": 200, "d": 100}])],
    ignore_index=True,
)
df

Unnamed: 0,a,b,c,d,diff
0,4.0,40.0,50.0,0.0,6.0
1,4.0,,50.0,0.0,6.0
2,,40.0,50.0,0.0,6.0
3,50.0,,,0.0,6.0
4,500.0,300.0,200.0,100.0,


In [56]:
# also worth exploring: df.dtypes (an attribute, not a method), df['a'].unique(), df.nunique()