# Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample inputs reused by the Series constructors in the following cells.
labels = ['a', 'b', 'c']            # index labels
my_data = [1, 2, 4]                 # values as a plain Python list
arr = np.array(my_data)             # the same values as a NumPy array
dic = {'a': 12, 'b': 123, 'c': 43}  # label -> value mapping

In [5]:
# Build a Series from a plain Python list, using `labels` as the index.
pd.Series(my_data, index=labels)

a    1
b    2
c    4
dtype: int64

In [7]:
# Same construction, this time from the NumPy array.
# NOTE(review): the recorded dtype is int32 here versus int64 for the list-based
# Series above -- presumably the array's platform-default integer type is carried
# over; confirm on your machine.
pd.Series(data=arr, index=labels)

a    1
b    2
c    4
dtype: int32

In [8]:
# From a dict: the keys become the index labels and the values become the data.
pd.Series(dic)

a     12
b    123
c     43
dtype: int64

In [69]:
# Four integers labelled by country name; shares only 'USA' and 'USSR' with ser2.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'India', 'China', 'USSR'],
)

In [70]:
# Second Series whose labels only partly overlap with ser1 ('USA' and 'USSR').
ser2 = pd.Series(
    data=[1, 4, 2, 8],
    index=['USA', 'Italy', 'Malta', 'USSR'],
)

In [71]:
# Display ser1 to check its labels and values.
ser1

USA      1
India    2
China    3
USSR     4
dtype: int64

In [72]:
# Display ser2 to check its labels and values.
ser2

USA      1
Italy    4
Malta    2
USSR     8
dtype: int64

In [73]:
# Assigning to a label that does not exist yet appends a new entry to ser1 in place.
# NOTE(review): the `ser1 + ser2` output recorded further down (In [42]) has no
# 'newly_added' row, so it was produced before this cell (In [73]) ran. A fresh
# top-to-bottom run should therefore show an extra 'newly_added  NaN' line there.
ser1['newly_added'] = 12

In [74]:
# Display ser1 again: it now carries the extra 'newly_added' entry.
ser1

USA             1
India           2
China           3
USSR            4
newly_added    12
dtype: int64

In [41]:
# ways of indexing

# Label-based lookup: the key is matched against the index labels.
print("Indexing by string",ser1['USA'])
# Position-based lookup goes through .iloc. A bare integer key (ser1[0]) on a
# string-labelled Series only worked through a deprecated positional fallback that
# newer pandas versions warn about and are removing.
print("Indexing by number",ser1.iloc[0])

Indexing by string 1
Indexing by number 1


In [42]:
# adding 2 series

# Addition aligns the operands on their index labels. Labels present in only one
# Series come out as NaN, and the result is float64 (see the recorded output).
# NOTE(review): the recorded output predates the 'newly_added' entry added to ser1
# earlier in the notebook (In [42] vs In [73]); a fresh run should list that label too.
ser1 + ser2

China     NaN
India     NaN
Italy     NaN
Malta     NaN
USA       2.0
USSR     12.0
dtype: float64

# DataFrames

A <b>DataFrame</b> is essentially a collection of Series that share a common index.

In [43]:
# NOTE(review): this import sits mid-notebook; consider moving it next to the other
# imports in the first cell.
from numpy.random import randn
# Seed NumPy's global random generator so the randn(...) draws in the cells below
# are repeatable from run to run.
np.random.seed(101)

In [44]:
# Draw a 5x4 array from the seeded generator. This consumes 20 values from the
# random stream, so the DataFrame built in the next cell receives the draws that
# follow them -- removing or re-running this cell changes those numbers.
randn(5,4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [45]:
# 3x3 frame of random draws with row labels A-C and column labels w, x, y.
df = pd.DataFrame(
    data=randn(3, 3),
    index=['A', 'B', 'C'],
    columns=['w', 'x', 'y'],
)

In [46]:
# Display the freshly built frame.
df

Unnamed: 0,w,x,y
A,0.302665,1.693723,-1.706086
B,-1.159119,-0.134841,0.390528
C,0.166905,0.184502,0.807706


In [59]:
# Selecting a single column with a scalar key returns a Series, not a DataFrame.
type(df['w'])

pandas.core.series.Series

In [60]:
# Single column as a Series; it keeps the frame's row index and is named after the column.
df['w']

A    0.302665
B   -1.159119
C    0.166905
Name: w, dtype: float64

In [61]:
# Passing a list of column labels returns a DataFrame with just those columns.
df[['w', 'y']]

Unnamed: 0,w,y
A,0.302665,-1.706086
B,-1.159119,0.390528
C,0.166905,0.807706


In [64]:
# creation of new column

# Assigning to a column label that does not exist yet adds the column to df in place;
# here it holds the row-wise sum of 'w' and 'x'.
df['new'] = df['w'] + df['x']
df

Unnamed: 0,w,x,y,new
A,0.302665,1.693723,-1.706086,1.996388
B,-1.159119,-0.134841,0.390528,-1.29396
C,0.166905,0.184502,0.807706,0.351406


In [65]:
# drop() returns a new frame without the 'new' column; df itself is left unchanged
# (later cells still show 'new' on df).
df.drop(columns='new')

Unnamed: 0,w,x,y
A,0.302665,1.693723,-1.706086
B,-1.159119,-0.134841,0.390528
C,0.166905,0.184502,0.807706


In [76]:
# selecting rows

# .loc selects a row by its index label, .iloc by its integer position.
# Both results are printed with one blank line between them.
print(df.loc['A'], df.iloc[1], sep='\n\n')

w      0.302665
x      1.693723
y     -1.706086
new    1.996388
Name: A, dtype: float64

w     -1.159119
x     -0.134841
y      0.390528
new   -1.293960
Name: B, dtype: float64


<b> Conditional selection </b>

In [80]:
# Element-wise comparison produces a boolean frame with the same shape as df.
booldf = df > 0

# Indexing df with that boolean frame keeps the values where the mask is True and
# puts NaN everywhere else. Mask and masked frame are printed one blank line apart.
print(booldf, df[booldf], sep='\n\n')

       w      x      y    new
A   True   True  False   True
B  False  False   True  False
C   True   True   True   True

          w         x         y       new
A  0.302665  1.693723       NaN  1.996388
B       NaN       NaN  0.390528       NaN
C  0.166905  0.184502  0.807706  0.351406


In [82]:
# Show, in order: the full frame, the boolean Series for "w is positive", and the
# rows of df for which that Series is True (row-wise filtering).
print(
    df,
    df['w'] > 0,
    df[df['w'] > 0],
    sep='\n\n',
)

          w         x         y       new
A  0.302665  1.693723 -1.706086  1.996388
B -1.159119 -0.134841  0.390528 -1.293960
C  0.166905  0.184502  0.807706  0.351406

A     True
B    False
C     True
Name: w, dtype: bool

          w         x         y       new
A  0.302665  1.693723 -1.706086  1.996388
C  0.166905  0.184502  0.807706  0.351406


<b>Resetting index</b>

In [88]:
# Current state of df, before any index manipulation.
df

Unnamed: 0,w,x,y,new
A,0.302665,1.693723,-1.706086,1.996388
B,-1.159119,-0.134841,0.390528,-1.29396
C,0.166905,0.184502,0.807706,0.351406


In [91]:
# reset_index() returns a new frame in which the old index becomes an 'index' column
# and rows are renumbered from 0. It is not in place by default, so df keeps its
# A/B/C index.
df.reset_index()

Unnamed: 0,index,w,x,y,new
0,A,0.302665,1.693723,-1.706086,1.996388
1,B,-1.159119,-0.134841,0.390528,-1.29396
2,C,0.166905,0.184502,0.807706,0.351406


In [92]:
# Add a column of string labels (one per row) that will serve as the new index below.
df['new_index'] = ['KL', 'DL', 'HR']

In [93]:
# df now carries the extra 'new_index' column.
df

Unnamed: 0,w,x,y,new,new_index
A,0.302665,1.693723,-1.706086,1.996388,KL
B,-1.159119,-0.134841,0.390528,-1.29396,DL
C,0.166905,0.184502,0.807706,0.351406,HR


In [96]:
# Returns a new frame that uses the 'new_index' column as its row index. The result
# is not assigned, so df still has 'new_index' as an ordinary column afterwards
# (a later cell drops it by name).
df.set_index('new_index')

Unnamed: 0_level_0,w,x,y,new
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KL,0.302665,1.693723,-1.706086,1.996388
DL,-1.159119,-0.134841,0.390528,-1.29396
HR,0.166905,0.184502,0.807706,0.351406


<b>Missing Data</b>
<br><br>
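* <code>dropna</code> &ndash; drop rows or columns that contain missing values<br>
* <code>fillna</code> &ndash; replace missing values with a chosen value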


In [107]:
# Work on a copy without the string-valued 'new_index' column so that the numeric
# comparison in the next cell applies to every remaining column.
new_df = df.drop(columns='new_index')

In [109]:
# Keep only the positive values; every other cell becomes NaN. This gives a frame
# with missing data to demonstrate dropna / fillna on.
demo = new_df.where(new_df > 0)

In [110]:
# Display the frame with missing values.
demo

Unnamed: 0,w,x,y,new
A,0.302665,1.693723,,1.996388
B,,,0.390528,
C,0.166905,0.184502,0.807706,0.351406


In [114]:
# Keep only rows (axis=0) that have at least 2 non-missing values. Row B has a single
# non-missing value, so it is dropped. Returns a new frame; demo is unchanged.
demo.dropna(axis=0, thresh=2)

Unnamed: 0,w,x,y,new
A,0.302665,1.693723,,1.996388
C,0.166905,0.184502,0.807706,0.351406


In [115]:
# Replace every missing value with a constant. Returns a new frame; demo is unchanged.
# NOTE(review): filling float columns with a string presumably turns them into
# object dtype -- check .dtypes before doing numeric work on the result.
demo.fillna('FILL VAL')

Unnamed: 0,w,x,y,new
A,0.302665,1.69372,FILL VAL,1.99639
B,FILL VAL,FILL VAL,0.390528,FILL VAL
C,0.166905,0.184502,0.807706,0.351406


In [116]:
# Fill the gaps in column 'x' with the mean of its non-missing values.
# Returns a new Series; demo is unchanged.
demo['x'].fillna(demo['x'].mean())

A    1.693723
B    0.939112
C    0.184502
Name: x, dtype: float64

<b>Group By</b>

In [118]:
# Raw columns for the bank example: entry i of each list describes the same branch.
places = ['ekm', 'ekm', 'ekm', 'tvm', 'tsr', 'tvm', 'tsr']
branch = ['mvpa', 'klcy', 'tpa', 'skm', 'clk', 'ulr', 'irg']
revenue = [10000, 210000, 189999, 1288819, 12399, 123333, 12223]

In [124]:
# Assemble the three equally long lists into a frame, one column per list.
bank = pd.DataFrame({
    'place': places,
    'branch': branch,
    'revenue': revenue,
})

In [125]:
# Display the bank frame (7 rows, default integer index).
bank

Unnamed: 0,place,branch,revenue
0,ekm,mvpa,10000
1,ekm,klcy,210000
2,ekm,tpa,189999
3,tvm,skm,1288819
4,tsr,clk,12399
5,tvm,ulr,123333
6,tsr,irg,12223


In [127]:
# Total revenue per place.
# numeric_only=True restricts the aggregation to numeric columns. Without it, newer
# pandas versions also "sum" the string column 'branch' by concatenating the names,
# which would not match the revenue-only table recorded below.
bank.groupby('place').sum(numeric_only=True)

Unnamed: 0_level_0,revenue
place,Unnamed: 1_level_1
ekm,409999
tsr,24622
tvm,1412152


In [130]:
# Smallest revenue per place. The column is selected before aggregating, so only
# 'revenue' is reduced; the result is a Series indexed by place.
bank.groupby('place')['revenue'].min()

place
ekm     10000
tsr     12223
tvm    123333
Name: revenue, dtype: int64

In [131]:
# Summary statistics per place. In the recorded output the columns form a two-level
# header (column name, statistic), and only 'revenue' is summarised.
bank.groupby('place').describe()

Unnamed: 0_level_0,revenue,revenue,revenue,revenue,revenue,revenue,revenue,revenue
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
place,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ekm,3.0,136666.333333,110151.168856,10000.0,99999.5,189999.0,199999.5,210000.0
tsr,2.0,12311.0,124.450793,12223.0,12267.0,12311.0,12355.0,12399.0
tvm,2.0,706076.0,824123.053978,123333.0,414704.5,706076.0,997447.5,1288819.0
