#  Pandas

Import pandas and check the version:

In [2]:
import pandas as pd
pd.__version__

'1.1.3'

# Introducing Pandas Objects

Import pandas and Numpy:

In [3]:
import numpy as np
import pandas as pd

### Series

A **Series** is a single vector of data with an *index* 

In [4]:
c= pd.Series([123,234,345,115])
c

0    123
1    234
2    345
3    115
dtype: int64

If an index is not specified, a default sequence of integers is assigned as the index. A NumPy array comprises the values of the `Series`, while the index is a pandas `Index` object.

In [5]:
c.values

array([123, 234, 345, 115], dtype=int64)

In [6]:
c.index

RangeIndex(start=0, stop=4, step=1)

We can assign meaningful labels to the index, if they are available:

In [13]:
fruitz=pd.Series([123,234,345,115],index=['Melon','Apple','Grapes','Watermelon'])
fruitz

Melon         123
Apple         234
Grapes        345
Watermelon    115
dtype: int64

We can use to refer to the values in the `Series` with these lables.

In [12]:
fruitz['Melon']

123

In [14]:
fruitz['Apple']

234

In [20]:
fruitz[3]

115

In [28]:
fruitz.name='All fruits'
fruitz.index.name='Fruit'
fruitz

Fruit
Melon         123
Apple         234
Grapes        345
Watermelon    115
Name: All fruits, dtype: int64

In [29]:
np.log(fruitz)

Fruit
Melon         4.812184
Apple         5.455321
Grapes        5.843544
Watermelon    4.744932
Name: All fruits, dtype: float64

In [31]:
fruitz[fruitz>200]

Fruit
Apple     234
Grapes    345
Name: All fruits, dtype: int64

A `Series` can be thought of as an ordered key-value store.

In [35]:
fruitz_dct={'Apple':12,'Melon':123,'Watermelon':2345,'Dragonfruit':234}
print(fruitz_dct)
pd.Series(fruitz_dct)

{'Apple': 12, 'Melon': 123, 'Watermelon': 2345, 'Dragonfruit': 234}


Apple            12
Melon           123
Watermelon     2345
Dragonfruit     234
dtype: int64

In [43]:
record={'Rahul':123,'Abhi':234,'Akash':345,'Avni':3456,'Tani':890,'Luca':678}
print(record)
pd.Series(record)

{'Rahul': 123, 'Abhi': 234, 'Akash': 345, 'Avni': 3456, 'Tani': 890, 'Luca': 678}


Rahul     123
Abhi      234
Akash     345
Avni     3456
Tani      890
Luca      678
dtype: int64

In [42]:
record={1001:{'Rahul':123,'Abhi':234,'Akash':345,'Avni':3456,'Tani':890,'Luca':678},
       'Avg_Marks':{'Maths':34,'Science':93,'Hindi':45,'English':46,'Social Science':89,'PE':78}}
print(record)
pd.Series(record)

{1001: {'Rahul': 123, 'Abhi': 234, 'Akash': 345, 'Avni': 3456, 'Tani': 890, 'Luca': 678}, 'Avg_Marks': {'Maths': 34, 'Science': 93, 'Hindi': 45, 'English': 46, 'Social Science': 89, 'PE': 78}}


1001         {'Rahul': 123, 'Abhi': 234, 'Akash': 345, 'Avn...
Avg_Marks    {'Maths': 34, 'Science': 93, 'Hindi': 45, 'Eng...
dtype: object

## DataFrame: bi-dimensional Series with two (or more) indices

A DataFrame represents a tabular, spreadsheet-like data structure containing an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.)

In [142]:
data={"Subjects":["Math","Science","Econ","English","Hindi"],
      "Avg_Marks":[56,89,45,90,86],
      "Year":[2004,2006,2003,2000,2001]}
print(data)
data=pd.DataFrame(data)
data

{'Subjects': ['Math', 'Science', 'Econ', 'English', 'Hindi'], 'Avg_Marks': [56, 89, 45, 90, 86], 'Year': [2004, 2006, 2003, 2000, 2001]}


Unnamed: 0,Subjects,Avg_Marks,Year
0,Math,56,2004
1,Science,89,2006
2,Econ,45,2003
3,English,90,2000
4,Hindi,86,2001


To change the order of the columns:

In [143]:
df=pd.DataFrame(data, columns=["Year","Subjects","Avg_Marks"])
df

Unnamed: 0,Year,Subjects,Avg_Marks
0,2004,Math,56
1,2006,Science,89
2,2003,Econ,45
3,2000,English,90
4,2001,Hindi,86


In [52]:
#df=pd.DataFrame(data, columns=["Year","Subjects","Avg_Marks","Total"])
#df

#	Year	Subjects	Avg_Marks	Total
#0	2004	Math	56	NaN
#1	2006	Science	89	NaN
#2	2003	Econ	45	NaN
#3	2000	English	90	NaN
#4	2001	Hindi	86	NaN

Assigning values to new columns

In [124]:
df['yr_marks']=df.Year//df.Avg_Marks
df

Unnamed: 0,Year,Subjects,Avg_Marks,yr_marks
0,2004,Math,56,35
1,2006,Science,89,22
2,2003,Econ,45,44
3,2000,English,90,22
4,2001,Hindi,86,23


In [125]:
df['Serie_align']=pd.Series(range(5),index=[0,1,2,3,4])
df

Unnamed: 0,Year,Subjects,Avg_Marks,yr_marks,Serie_align
0,2004,Math,56,35,0
1,2006,Science,89,22,1
2,2003,Econ,45,44,2
3,2000,English,90,22,3
4,2001,Hindi,86,23,4


converting dataframe to dictionary:

In [61]:
df.to_dict()

{'Year': {0: 2004, 1: 2006, 2: 2003, 3: 2000, 4: 2001},
 'Subjects': {0: 'Math', 1: 'Science', 2: 'Econ', 3: 'English', 4: 'Hindi'},
 'Avg_Marks': {0: 56, 1: 89, 2: 45, 3: 90, 4: 86},
 'yr_marks': {0: 35, 1: 22, 2: 44, 3: 22, 4: 23},
 'Serie_align': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}}

In [138]:
#getting back to dataframe from dictionary
pd.DataFrame(df.to_dict())

Unnamed: 0,Year,Subjects,Avg_Marks,yr_marks,Serie_align
0,2004,Math,56,35,0
1,2006,Science,89,22,1
2,2003,Econ,45,44,2
3,2000,English,90,22,3
4,2001,Hindi,86,23,4


### DataFrame as specialized dictionary

#### From a list of dicts

Any list of dictionaries can be made into a ``DataFrame``.

In [64]:
d=[{'e':w,'c':2*w}for w in range(6)]
print(d)
pd.DataFrame(d)

[{'e': 0, 'c': 0}, {'e': 1, 'c': 2}, {'e': 2, 'c': 4}, {'e': 3, 'c': 6}, {'e': 4, 'c': 8}, {'e': 5, 'c': 10}]


Unnamed: 0,e,c
0,0,0
1,1,2
2,2,4
3,3,6
4,4,8
5,5,10


In [65]:
pd.DataFrame([{'ww':3,'rr':2},{'rr':4,'tt':7}])

Unnamed: 0,ww,rr,tt
0,3.0,2,
1,,4,7.0


In [66]:
df=pd.DataFrame([{'ww':3,'rr':2},{'rr':4,'tt':7}])
df=df.to_dict()
df

{'ww': {0: 3.0, 1: nan}, 'rr': {0: 2, 1: 4}, 'tt': {0: nan, 1: 7.0}}

#### From a two-dimensional NumPy array

In [77]:
pd.DataFrame(np.random.randint(4,7),
            columns=['fur','bus'],
            index=['a','b','c'])

Unnamed: 0,fur,bus
a,5,5
b,5,5
c,5,5


## The Pandas Index Object

In [78]:
ind=pd.Index([12,23,34,45,1,2])
ind

Int64Index([12, 23, 34, 45, 1, 2], dtype='int64')

### Index as immutable array

In [79]:
ind[1]

23

In [80]:
ind[::]

Int64Index([12, 23, 34, 45, 1, 2], dtype='int64')

In [83]:
ind[1:7:2]

Int64Index([23, 45, 2], dtype='int64')

In [84]:
print(ind.size,ind.shape,ind.ndim,ind.dtype)

6 (6,) 1 int64


One difference between ``Index`` objects and NumPy arrays is that indices are immutable.

In [85]:
ind[1]=0 #as Index does not support mutable operations

TypeError: Index does not support mutable operations

# Operating on Data in Pandas

## Ufuncs: Index Preservation

In [88]:
r=np.random.RandomState(10)
ser=pd.Series(r.randint(0,11,2))
ser

0    9
1    4
dtype: int32

In [89]:
r=np.random.RandomState(10)
ser=pd.Series(r.randint(0,10,4))
ser

0    9
1    4
2    0
3    1
dtype: int32

In [93]:
dfr=pd.DataFrame(r.randint(0,10,(5,4)),columns=['a','b','c','d'])
dfr

Unnamed: 0,a,b,c,d
0,5,3,9,6
1,9,1,9,4
2,2,6,7,8
3,8,9,2,0
4,6,7,8,1


In [95]:
print(ser)
np.exp(ser)

0    9
1    4
2    0
3    1
dtype: int32


0    8103.083928
1      54.598150
2       1.000000
3       2.718282
dtype: float64

In [96]:
np.sin(dfr*np.pi/4)

Unnamed: 0,a,b,c,d
0,-0.7071068,0.707107,0.7071068,-1.0
1,0.7071068,0.707107,0.7071068,1.224647e-16
2,1.0,-1.0,-0.7071068,-2.449294e-16
3,-2.449294e-16,0.707107,1.0,0.0
4,-1.0,-0.707107,-2.449294e-16,0.7071068


## Universal Functions: Index Alignment

### Index alignment in Series

In [105]:
Car=pd.Series({'audi':123,'Tesla':234,'ford':345},name='Car')
Stock=pd.Series({'audi':2,'Tesla':10,'ford':3,'civic':3},name='Stock')

print(Car)
Stock

audi     123
Tesla    234
ford     345
Name: Car, dtype: int64


audi      2
Tesla    10
ford      3
civic     3
Name: Stock, dtype: int64

In [106]:
Stock/Car

Tesla    0.042735
audi     0.016260
civic         NaN
ford     0.008696
dtype: float64

In [107]:
Car/Stock

Tesla     23.4
audi      61.5
civic      NaN
ford     115.0
dtype: float64

In [108]:
Car.index|Stock.index

Index(['Tesla', 'audi', 'civic', 'ford'], dtype='object')

In [110]:
a=pd.Series([9,8,7],index=[0,1,2])
b=pd.Series([5,6,7], index=[3,2,1])
print(a)
print(b)
a+b

0    9
1    8
2    7
dtype: int64
3    5
2    6
1    7
dtype: int64


0     NaN
1    15.0
2    13.0
3     NaN
dtype: float64

In [111]:
a.add(b, fill_value=0)

0     9.0
1    15.0
2    13.0
3     5.0
dtype: float64

# Data wrangling
Getting the data in the shape that we want is the single most time consuming task in the life of the Data Scientist.

## Merge operations
By merging we mean combining different data sets by linking rows with one or more keys. The basic syntax is very simple.

In [151]:
df

Unnamed: 0,Year,Subjects,Avg_Marks
0,2004,Math,56
1,2006,Science,89
2,2003,Econ,45
3,2000,English,90
4,2001,Hindi,86


In [152]:
df2=pd.DataFrame({"Subjects":["Math","Science","English"],"Code":["12","34","3"]})
df2

Unnamed: 0,Subjects,Code
0,Math,12
1,Science,34
2,English,3


In [153]:
df.merge(df2)# merge is smart! If there are overlapping names, it uses those for the merge

Unnamed: 0,Year,Subjects,Avg_Marks,Code
0,2004,Math,56,12
1,2006,Science,89,34
2,2000,English,90,3


If the column names are different, you need to specify them explicitely

In [156]:
df3=pd.DataFrame({"Subjects":["Math","Science","English"],"Code":[12,65,90]})
df3
df2.merge(df3,right_on='Subjects',left_on='Subjects')

Unnamed: 0,Subjects,Code_x,Code_y
0,Math,12,12
1,Science,34,65
2,English,3,90


In [157]:
df4=pd.DataFrame({"Subjects":["Math","Science","Sanskrit"],"Code":["12","34","93"]})
df4
df.merge(df4,how='outer')

Unnamed: 0,Year,Subjects,Avg_Marks,Code
0,2004.0,Math,56.0,12.0
1,2006.0,Science,89.0,34.0
2,2003.0,Econ,45.0,
3,2000.0,English,90.0,
4,2001.0,Hindi,86.0,
5,,Sanskrit,,93.0


Check this out:

In [159]:
df5=pd.DataFrame({"Subjects":["Math","Science","Math"],"Code":["12","34","3"]})
print(df)
df.merge(df5,how='outer')

   Year Subjects  Avg_Marks
0  2004     Math         56
1  2006  Science         89
2  2003     Econ         45
3  2000  English         90
4  2001    Hindi         86


Unnamed: 0,Year,Subjects,Avg_Marks,Code
0,2004,Math,56,12.0
1,2004,Math,56,3.0
2,2006,Science,89,34.0
3,2003,Econ,45,
4,2000,English,90,
5,2001,Hindi,86,


### Combining data with overlap
Sometimes some data is missing, and it can be "patched" with another dataset.

In [173]:
s_a=pd.Series([np.nan,2.5,np.nan,3.5,np.nan,5.5],
                 index=['a','b','c','d','e','f'])
s_b=pd.Series(np.arange(len(s_a), dtype=np.float64),
                index=['a','b','c','d','e','f'])

In [174]:
s_a

a    NaN
b    2.5
c    NaN
d    3.5
e    NaN
f    5.5
dtype: float64

In [175]:
s_b

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
f    5.0
dtype: float64

In [176]:
pd.Series(np.where(pd.isnull(s_a),s_b,s_a),index=s_a.index)

a    0.0
b    2.5
c    2.0
d    3.5
e    4.0
f    5.5
dtype: float64

In [190]:
s_c=pd.Series([np.nan,"manny",np.nan,"fur",np.nan,"zico"],
                 index=['a','b','c','d','e','f'])
s_d=pd.Series(["nick","oobool","kia","abc","dfe","sdf"],
                index=['a','b','c','d','e','f'])

In [191]:
s_c

a      NaN
b    manny
c      NaN
d      fur
e      NaN
f     zico
dtype: object

In [192]:
s_d

a      nick
b    oobool
c       kia
d       abc
e       dfe
f       sdf
dtype: object

In [193]:
pd.Series(np.where(pd.isnull(s_c),s_d,s_c),index=s_c.index)

a     nick
b    manny
c      kia
d      fur
e      dfe
f     zico
dtype: object

That's a bit verbose for something so simple. What about this:

In [194]:
s_a.combine_first(s_b)

a    0.0
b    2.5
c    2.0
d    3.5
e    4.0
f    5.5
dtype: float64

In [195]:
s_c.combine_first(s_d)

a     nick
b    manny
c      kia
d      fur
e      dfe
f     zico
dtype: object