# Pandas

In [1]:
import numpy as np
import pandas as pd
from numpy.random import randn

## Series
A Series is similar to a NumPy array. The difference is that a Series can have axis labels, meaning it can be indexed by a label instead of just by numeric position. It also doesn't need to hold numeric data; it can hold any arbitrary Python object.

### Creating a Series

In [2]:
# Shared sample inputs for the Series examples below.
labels = list('abcd')             # index labels
my_list = [10, 20, 30, 40]        # plain Python list of values
arr = np.array(my_list)           # the same values as a NumPy array
d = dict(zip(labels, my_list))    # label -> value mapping

#### 1. Using Lists

In [3]:
# Only data is given, so the Series gets the default integer index 0..3.
pd.Series(data = my_list)

0    10
1    20
2    30
3    40
dtype: int64

In [4]:
# Pass `index` to attach the custom labels to the values.
pd.Series(data=my_list, index=labels)

a    10
b    20
c    30
d    40
dtype: int64

In [6]:
# Same result with positional arguments: data first, then index.
pd.Series(my_list,labels)

a    10
b    20
c    30
d    40
dtype: int64

#### 2. Numpy Arrays

In [7]:
# A NumPy array works as the data source too; again the index defaults to 0..3.
pd.Series(arr)

0    10
1    20
2    30
3    40
dtype: int64

In [8]:
# NumPy array as data, custom labels as the index.
pd.Series(arr,labels)

a    10
b    20
c    30
d    40
dtype: int64

#### 3. Dictionaries

In [9]:
# From a dict, the keys become the index and the values become the data.
pd.Series(d)

a    10
b    20
c    30
d    40
dtype: int64

#### Data in a Series

In [10]:
# Roles swapped: strings as the data (dtype becomes object), integers as the index.
pd.Series(data=labels, index=my_list)

10    a
20    b
30    c
40    d
dtype: object

**A Series can hold functions too.**

In [11]:
# Any Python object can be stored; here three built-in functions (dtype: object).
pd.Series([sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

### Using an Index.
The key to using a Series is understanding its index. Pandas uses these index names or numbers to allow fast lookups of information.

In [12]:
# Integer values labelled by country name.
ser1 = pd.Series(data=[1, 2, 3, 4], index='USA Germany USSR Japan'.split())

In [13]:
# Display the Series together with its country labels.
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [14]:
# Second Series: same idea as ser1, but with 'Italy' (value 5) in place of 'USSR'.
ser2 = pd.Series(data=[1, 2, 5, 4], index='USA Germany Italy Japan'.split())

In [15]:
# Display the second Series; note 'Italy' where ser1 has 'USSR'.
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [16]:
# Look up a single value by its index label.
ser1['USA']

1

In [18]:
# Addition aligns on index labels. Labels present in only one Series
# ('Italy', 'USSR') produce NaN, and the result is float64 rather than int64.
ser1 + ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

## DataFrames
DataFrames are a bunch of Series objects put together to share the same index.

In [19]:
# Seed NumPy's global random generator so the random numbers below are reproducible.
# (`randn` is imported with the other imports in the first cell of the notebook.)
np.random.seed(101)

In [21]:
# 5x4 array of random draws from the seeded global generator.
# This call advances the generator, so the next randn() call yields different numbers.
np.random.randn(5,4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [24]:
# 5x4 DataFrame of random values: rows labelled A-E, one named column per person.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=['Sid', 'Karan', 'Kushal', 'Anuraag'],
)

In [25]:
# Display the DataFrame: row labels down the side, column names across the top.
df

Unnamed: 0,Sid,Karan,Kushal,Anuraag
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


### Selection and Indexing

##### Selecting a column or columns

In [26]:
# Bracket notation with a single column name returns that column as a Series.
df['Sid']

A   -0.993263
B    1.025984
C    2.154846
D    0.147027
E   -0.925874
Name: Sid, dtype: float64

In [27]:
# A list of column names returns a DataFrame containing just those columns.
df[['Sid','Karan']]

Unnamed: 0,Sid,Karan
A,-0.993263,0.1968
B,1.025984,-0.156598
C,2.154846,-0.610259
D,0.147027,-0.479448
E,-0.925874,1.862864


In [28]:
# Attribute-style access returns the same column as df['Sid'].
# Prefer the bracket form when a column name could clash with a DataFrame attribute or method.
df.Sid

A   -0.993263
B    1.025984
C    2.154846
D    0.147027
E   -0.925874
Name: Sid, dtype: float64

In [30]:
# Each column of a DataFrame is itself a pandas Series.
type(df.Sid)

pandas.core.series.Series

#### Creating a new column:

In [31]:
# Assigning to a column name that does not exist yet creates it.
# Here 'new' is the element-wise (row by row) sum of two existing columns.
df['new'] = df['Sid'] + df['Karan']
df

Unnamed: 0,Sid,Karan,Kushal,Anuraag,new
A,-0.993263,0.1968,-1.136645,0.000366,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,0.869386
C,2.154846,-0.610259,-0.755325,-0.346419,1.544588
D,0.147027,-0.479448,0.558769,1.02481,-0.332421
E,-0.925874,1.862864,-1.133817,0.610478,0.93699


#### Removing Columns

In [32]:
# axis=1 tells drop() that 'new' is a column label.
# The result is a new DataFrame without that column; df itself is left unchanged.
df.drop('new', axis=1)

Unnamed: 0,Sid,Karan,Kushal,Anuraag
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [51]:
# The drop() call above returned a new DataFrame, so the original still has the 'new' column.
# The original only changes if you pass inplace=True (or reassign the result to df).
df

Unnamed: 0,Sid,Karan,Kushal,Anuraag,new
A,-0.993263,0.1968,-1.136645,0.000366,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,0.869386
C,2.154846,-0.610259,-0.755325,-0.346419,1.544588
D,0.147027,-0.479448,0.558769,1.02481,-0.332421
E,-0.925874,1.862864,-1.133817,0.610478,0.93699


In [52]:
# inplace=True modifies df itself instead of returning a new DataFrame,
# so the 'new' column is permanently removed from df.
# NOTE: this cell is not re-runnable as-is; a second run fails because 'new' no longer exists.
df.drop('new', axis=1, inplace=True)
df

Unnamed: 0,Sid,Karan,Kushal,Anuraag
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [35]:
# axis=1 adds across the columns, giving one total per row label (A-E).
df.sum(axis=1)

A   -2.729206
B    2.357019
C    1.987431
D    0.918737
E    1.350641
dtype: float64

#### Understanding the SUM function in NumPy and Pandas along an axis
`axis=0` (the row axis) sums vertically, down the rows, giving one total per column.

`axis=1` (the column axis) sums horizontally, across the columns, giving one total per row.

In [36]:
# 3x3 grid holding 1..9, filled row by row.
# Note: this rebinds `arr`, which earlier held the 1-D array [10, 20, 30, 40].
arr = np.arange(1, 10).reshape(3, 3)
arr

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [47]:
# Wrap the same 3x3 array in a DataFrame with labelled rows and columns.
data = pd.DataFrame(
    arr,
    index=['a', 'b', 'c'],
    columns=['Sid', 'Shubu', 'Vedya'],
)
data

Unnamed: 0,Sid,Shubu,Vedya
a,1,2,3
b,4,5,6
c,7,8,9


In [48]:
# axis=1: add across the columns, giving one total per row,
# first for the NumPy array and then for the DataFrame.
for table in (arr, data):
    print(table.sum(axis=1))

[ 6 15 24]
a     6
b    15
c    24
dtype: int64


In [49]:
# axis=0: add down the rows, giving one total per column,
# first for the NumPy array and then for the DataFrame.
for table in (arr, data):
    print(table.sum(axis=0))

[12 15 18]
Sid      12
Shubu    15
Vedya    18
dtype: int64
