In [1]:
import pandas
pandas.__version__

'1.0.5'

## Pandas objects

In [2]:
import numpy as np
import pandas as pd

### series

In [3]:
count = pd.Series([100,200,300,400])
count

0    100
1    200
2    300
3    400
dtype: int64

In [4]:
count.values

array([100, 200, 300, 400], dtype=int64)

In [5]:
count.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
organs = pd.Series([100,200,300,400],index = ['Kidney','Lungs','Heart','Liver'])
organs

Kidney    100
Lungs     200
Heart     300
Liver     400
dtype: int64

In [7]:
organs['Liver']

400

In [8]:
organs[3]

400

In [9]:
organs.name = 'cost'
organs.index.name = 'Organs'
organs

Organs
Kidney    100
Lungs     200
Heart     300
Liver     400
Name: cost, dtype: int64

In [10]:
organs[organs>200]

Organs
Heart    300
Liver    400
Name: cost, dtype: int64

In [11]:
organs_dict = {'Kidney':100,'Lungs':200,'Heart':300,'Liver':400}
print(organs_dict)
pd.Series(organs_dict)

{'Kidney': 100, 'Lungs': 200, 'Heart': 300, 'Liver': 400}


Kidney    100
Lungs     200
Heart     300
Liver     400
dtype: int64

### Data Frame

In [12]:
Data = {"Name":["John","Mark","Jean","Bruce"],
        "Id.No":[112,242,356,156],
        "Percentage":[65.23,82.17,93,52.3]}
print(Data)
Data = pd.DataFrame(Data)
Data

{'Name': ['John', 'Mark', 'Jean', 'Bruce'], 'Id.No': [112, 242, 356, 156], 'Percentage': [65.23, 82.17, 93, 52.3]}


Unnamed: 0,Name,Id.No,Percentage
0,John,112,65.23
1,Mark,242,82.17
2,Jean,356,93.0
3,Bruce,156,52.3


In [62]:
df = pd.DataFrame(Data, columns=["Id.No","Name","Percentage"])
df

Unnamed: 0,Id.No,Name,Percentage
0,112,John,65.23
1,242,Mark,82.17
2,356,Jean,93.0
3,156,Bruce,52.3


In [63]:
df['Cgpa'] = (df.Percentage / 10) + 0.75
df

Unnamed: 0,Id.No,Name,Percentage,Cgpa
0,112,John,65.23,7.273
1,242,Mark,82.17,8.967
2,356,Jean,93.0,10.05
3,156,Bruce,52.3,5.98


In [64]:
df['someting'] = df.Percentage + df.Cgpa
df

Unnamed: 0,Id.No,Name,Percentage,Cgpa,someting
0,112,John,65.23,7.273,72.503
1,242,Mark,82.17,8.967,91.137
2,356,Jean,93.0,10.05,103.05
3,156,Bruce,52.3,5.98,58.28


In [65]:
df['Serial_align'] = pd.Series(range(6),index = [0,1,2,3,4,5])
df

Unnamed: 0,Id.No,Name,Percentage,Cgpa,someting,Serial_align
0,112,John,65.23,7.273,72.503,0
1,242,Mark,82.17,8.967,91.137,1
2,356,Jean,93.0,10.05,103.05,2
3,156,Bruce,52.3,5.98,58.28,3


In [66]:
df.to_dict()

{'Id.No': {0: 112, 1: 242, 2: 356, 3: 156},
 'Name': {0: 'John', 1: 'Mark', 2: 'Jean', 3: 'Bruce'},
 'Percentage': {0: 65.23, 1: 82.17, 2: 93.0, 3: 52.3},
 'Cgpa': {0: 7.273000000000001, 1: 8.967, 2: 10.05, 3: 5.9799999999999995},
 'someting': {0: 72.503, 1: 91.137, 2: 103.05, 3: 58.279999999999994},
 'Serial_align': {0: 0, 1: 1, 2: 2, 3: 3}}

In [67]:
pd.DataFrame(df.to_dict())

Unnamed: 0,Id.No,Name,Percentage,Cgpa,someting,Serial_align
0,112,John,65.23,7.273,72.503,0
1,242,Mark,82.17,8.967,91.137,1
2,356,Jean,93.0,10.05,103.05,2
3,156,Bruce,52.3,5.98,58.28,3


In [68]:
data = [{'a':i , 'b': 10 * i}for i in range(6)]
print(data)
pd.DataFrame(data)

[{'a': 0, 'b': 0}, {'a': 1, 'b': 10}, {'a': 2, 'b': 20}, {'a': 3, 'b': 30}, {'a': 4, 'b': 40}, {'a': 5, 'b': 50}]


Unnamed: 0,a,b
0,0,0
1,1,10
2,2,20
3,3,30
4,4,40
5,5,50


In [58]:
#pd.DataFrame([{'aa':1,'bb':2},{'bb':3,'cc':6}])
#to_dict()

### from a two-dimensional NumPy Array

In [21]:
pd.DataFrame(np.random.randint(2,12),
             columns = ['foo','bar'],
             index = ['a','b','c'])

Unnamed: 0,foo,bar
a,5,5
b,5,5
c,5,5


In [23]:
ind = pd.Index([20,35,56,6,2,7])
ind

Int64Index([20, 35, 56, 6, 2, 7], dtype='int64')

In [24]:
ind[1]

35

In [25]:
ind[::]

Int64Index([20, 35, 56, 6, 2, 7], dtype='int64')

In [26]:
print(ind.size,ind.shape,ind.ndim,ind.dtype)

6 (6,) 1 int64


In [28]:
##ind[1] = 0

## Operating on Data in Pandas

In [36]:
rang = np.random.RandomState(10)
ser = pd.Series(rang.randint(0,10,4))
ser

0    9
1    4
2    0
3    1
dtype: int32

In [31]:
pd.DataFrame(ser)

Unnamed: 0,0
0,8
1,5
2,5
3,7


In [35]:
dfr = pd.DataFrame(rang.randint(0,10,(5,4)),
                   columns = ['A','B','C','D'])
dfr

Unnamed: 0,A,B,C,D
0,5,3,1,0
1,8,7,5,8
2,1,2,8,3
3,1,3,7,2
4,7,2,0,0


In [37]:
np.exp(ser)

0    8103.083928
1      54.598150
2       1.000000
3       2.718282
dtype: float64

In [38]:
np.sin(dfr * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-0.7071068,0.707107,0.7071068,0.0
1,-2.449294e-16,-0.707107,-0.7071068,-2.449294e-16
2,0.7071068,1.0,-2.449294e-16,0.7071068
3,0.7071068,0.707107,-0.7071068,1.0
4,-0.7071068,1.0,0.0,0.0


## Universal  functions: index Alignment

In [39]:
area = pd.Series({'Alaska':1756981,'Texas':695865,'California':425691}, name='area')
population = pd.Series({'California': 38332521 , 'Texas':26448193,'Newfork': 19651127},name='population')
print(area)
population

Alaska        1756981
Texas          695865
California     425691
Name: area, dtype: int64


California    38332521
Texas         26448193
Newfork       19651127
Name: population, dtype: int64

In [40]:
population / area

Alaska              NaN
California    90.047760
Newfork             NaN
Texas         38.007649
dtype: float64

In [41]:
area.index | population.index 

Index(['Alaska', 'California', 'Newfork', 'Texas'], dtype='object')

In [44]:
A = pd.Series([2,4,6], index = [0,1,2])
B = pd.Series([1,3,5], index = [1,2,3])
print(A)
print(B)
A + B

0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [45]:
A.add(B ,fill_value=1)

0    3.0
1    5.0
2    9.0
3    6.0
dtype: float64

## Data wrangling

In [69]:
df

Unnamed: 0,Id.No,Name,Percentage,Cgpa,someting,Serial_align
0,112,John,65.23,7.273,72.503,0
1,242,Mark,82.17,8.967,91.137,1
2,356,Jean,93.0,10.05,103.05,2
3,156,Bruce,52.3,5.98,58.28,3


In [70]:
df=pd.DataFrame(df)
df

Unnamed: 0,Id.No,Name,Percentage,Cgpa,someting,Serial_align
0,112,John,65.23,7.273,72.503,0
1,242,Mark,82.17,8.967,91.137,1
2,356,Jean,93.0,10.05,103.05,2
3,156,Bruce,52.3,5.98,58.28,3


In [75]:
df1 = pd.DataFrame({"Name": ["Mark", "Jean", "John" , "Bruce"],
                    "Population": ["10000", "20000", "300000" , "200100"]})
df1

Unnamed: 0,Name,Population
0,Mark,10000
1,Jean,20000
2,John,300000
3,Bruce,200100


In [76]:
df.merge(df1)

Unnamed: 0,Id.No,Name,Percentage,Cgpa,someting,Serial_align,Population
0,112,John,65.23,7.273,72.503,0,300000
1,242,Mark,82.17,8.967,91.137,1,10000
2,356,Jean,93.0,10.05,103.05,2,20000
3,156,Bruce,52.3,5.98,58.28,3,200100


In [77]:
df2 = pd.DataFrame({"name": ["Mark", "Jean", "John" , "Bruce"],
                   "Population": ["10000", "20000", "300000" , "200100"]})
df2
df.merge(df2, right_on='name' , left_on='Name')

Unnamed: 0,Id.No,Name,Percentage,Cgpa,someting,Serial_align,name,Population
0,112,John,65.23,7.273,72.503,0,John,300000
1,242,Mark,82.17,8.967,91.137,1,Mark,10000
2,356,Jean,93.0,10.05,103.05,2,Jean,20000
3,156,Bruce,52.3,5.98,58.28,3,Bruce,200100


In [82]:
df4 = pd.DataFrame({"Name": ["Mark", "Jean", "John" , "Bruce" , "Mike"] , 
                   "Population": ["10000", "20000", "300000" , "200100","301300"]})
print(df)
df.merge(df4 , how='outer')

   Id.No   Name  Percentage    Cgpa  someting  Serial_align
0    112   John       65.23   7.273    72.503             0
1    242   Mark       82.17   8.967    91.137             1
2    356   Jean       93.00  10.050   103.050             2
3    156  Bruce       52.30   5.980    58.280             3


Unnamed: 0,Id.No,Name,Percentage,Cgpa,someting,Serial_align,Population
0,112.0,John,65.23,7.273,72.503,0.0,300000
1,242.0,Mark,82.17,8.967,91.137,1.0,10000
2,356.0,Jean,93.0,10.05,103.05,2.0,20000
3,156.0,Bruce,52.3,5.98,58.28,3.0,200100
4,,Mike,,,,,301300


### Combining data with data overlap

In [88]:
serie_c = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
                   index=['f','e','d','c','b','a'])
serie_e = pd.Series(np.arange(len(serie_a), dtype=np.float64),
                 index=['f', 'e', 'd', 'c', 'b', 'a'])

In [89]:
serie_c

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [90]:
serie_e

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [91]:
pd.Series(np.where(pd.isnull(serie_c), serie_e, serie_c), index=serie_c.index)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64

In [92]:
serie_c.combine_first(serie_e)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64