Three data structures
    1. Series
    2. DataFrame
    3. Index

### Series

In [6]:
import numpy as np
import pandas as pd


In [10]:
# Build a Series from a plain list; no index is given, so pandas supplies one.
ser = pd.Series([4, 7, -5, 3])

# Show the Series itself, then its number of dimensions and its shape,
# each on its own line.
print(ser, ser.ndim, ser.shape, sep='\n')

0    4
1    7
2   -5
3    3
dtype: int64
1
(4,)


The first column is the system-generated index. A Series is a one-dimensional structure.

In [14]:
# .values holds the data of the Series; .index holds its labels (keys).
print(f"The values of Series: \n {ser.values}")
print(f"The Keys of Series: \n {ser.index}")

The values of Series: 
 [ 4  7 -5  3]
The Keys of Series: 
 RangeIndex(start=0, stop=4, step=1)


In [None]:
# Create our own indexes
# NumPy ndarrays only have system-generated integer positions, whereas a pandas Series also lets us define our own index labels

In [23]:
# Supply our own labels instead of relying on the default integer index.
ser1 = pd.Series([4, 7, -5, 3], index=['Bob', 'Joe', 'Will', 'Sam'])

print('Values:', ser1.values)
print('Indexes:', ser1.index)

# Label-based lookup.
print('Value at Bob:', ser1['Bob'])
# Position-based lookup. Use .iloc explicitly: a bare ser1[0] on a
# string-labelled Series relies on an integer fallback that pandas has
# deprecated, whereas .iloc is always positional.
print('Value at Bob:', ser1.iloc[0])

# A list of labels selects several entries at once and returns a Series.
print('Value at multiple selections:\n', ser1[['Bob', 'Joe', 'Will']])

Values: [ 4  7 -5  3]
Indexes: Index(['Bob', 'Joe', 'Will', 'Sam'], dtype='object')
Value at Bob: 4
Value at Bob: 4
Value at multiple selections:
 Bob     4
Joe     7
Will   -5
dtype: int64


In [40]:
print('---Pandas also do Vectorization by Broadcasting---')
# The comparison produces a boolean Series, which is then used as a mask.
positive_mask = ser1 > 0
print('Fetch all positive values', ser1[positive_mask])

# `in` tests membership against the index labels, like dictionary keys.
print("\n---Keyword: 'in'---")
print('Series:\n', ser1)
bob_present = 'Bob' in ser1
print("Is bob in Series ? :", bob_present)

# .keys() and .index are printed one after the other for comparison.
print('\n---Fetch Keys like Dictionary---')
for labels in (ser1.keys(), ser1.index):
    print("Keys in Series:\n", labels)

---Pandas also do Vectorization by Broadcasting---
Fetch all positive values Bob    4
Joe    7
Sam    3
dtype: int64

---Keyword: 'in'---
Series:
 Bob     4
Joe     7
Will   -5
Sam     3
dtype: int64
Is bob in Series ? : True

---Fetch Keys and Values like Dictionary---
Keys in Series:
 Index(['Bob', 'Joe', 'Will', 'Sam'], dtype='object')
Keys in Series:
 Index(['Bob', 'Joe', 'Will', 'Sam'], dtype='object')


In [42]:
# Materialise the (label, value) pairs of the Series, dictionary-style.
[(label, value) for label, value in ser1.items()]

[('Bob', 4), ('Joe', 7), ('Will', -5), ('Sam', 3)]

In [45]:
# Dictionary to Series: the dict keys become the index labels and the dict
# values become the data.
sdata = dict(a=10, b=20, c=30, d=40)
print("Types of sdata: ", type(sdata))

ser3 = pd.Series(sdata)
print("Series:", ser3)
print("Type of series", type(ser3))

Types of sdata:  <class 'dict'>
Series: a    10
b    20
c    30
d    40
dtype: int64
Type of series <class 'pandas.core.series.Series'>


In [51]:
dct = dict(A=3, B=4, C=8, D=10)
# A plain dict cannot be sliced: dct['A':'C'] raises an error.

# A Series built from the same dict can be sliced.
ser3 = pd.Series(dct)

# Slice by label: the end label 'C' is included in the result.
print(ser3.loc['A':'C'])
# Slice by position: the end position 3 is excluded, as with Python lists.
print(ser3.iloc[0:3])

A    3
B    4
C    8
dtype: int64
A    3
B    4
C    8
dtype: int64


In [58]:
dct = {'A': 3, 'B': 4, 'C': 8, 'D': 10}

# Labels we want in the result. 'N' is not a key of dct, and 'D' is left out.
states = ['N', 'A', 'B', 'C']

# Passing index= together with a dict keeps only the listed labels, in the
# listed order. Labels missing from the dict get NaN, which is why the
# values are shown as floats.
ser4 = pd.Series(dct, index=states)

print('Original Dictionary\n', dct)
print('New Series\n', ser4)
# Message fixed: it previously read "it has now corresponding value",
# which says the opposite of what happens.
print('Above, as \'N\' is new state added, it has no corresponding value, It will store as NaN')

Original Dictionary
 {'A': 3, 'B': 4, 'C': 8, 'D': 10}
New Series
 N    NaN
A    3.0
B    4.0
C    8.0
dtype: float64
Above, as 'N' is new state added, it has now corresponding value, It will store as NaN


In [62]:
# Assigning to a single element of the index, e.g.
#     ser4.index[1] = 'New'
# gives an error; the index cannot be updated that way.

# Assigning a complete list of labels to .index is allowed. The values stay
# where they are; only the labels attached to them change.
new_labels = ['A', 'New', 'C', 'D']
ser4.index = new_labels

print(ser4)

A      NaN
New    3.0
C      4.0
D      8.0
dtype: float64


### Data frames

In [65]:
# Column-oriented input: each dict key becomes a column and each list holds
# that column's values, one per row.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

frame = pd.DataFrame(data)
print(frame)

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9


In [67]:
# A DataFrame has two indexes: .index labels the rows, .columns labels the
# columns. Print the row index first, then the column index.
print(frame.index, frame.columns, sep='\n')

RangeIndex(start=0, stop=5, step=1)
Index(['state', 'year', 'pop'], dtype='object')


In [77]:
# Choose the column order and the row labels explicitly. 'debt' is not a key
# of `data`, so that column is created and filled with NaN.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'pop', 'state', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)
print(frame2)

# Two ways to pull out a single column as a Series.
print(frame2['state'])  # bracket access: recommended
print(frame2.state)     # attribute access

       year  pop   state debt
one    2000  1.5    Ohio  NaN
two    2001  1.7    Ohio  NaN
three  2002  3.6    Ohio  NaN
four   2001  2.4  Nevada  NaN
five   2002  2.9  Nevada  NaN
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object


In [79]:
# Bracket access returns the 'pop' column. Attribute access does not:
# `pop` is also the name of a DataFrame method (not a Python keyword), so
# frame2.pop resolves to that bound method instead of the column. This name
# clash is why attribute-style column access is not recommended.
print(frame2['pop'], frame2.pop, sep='\n')

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
Name: pop, dtype: float64
<bound method NDFrame.pop of        year  pop   state debt
one    2000  1.5    Ohio  NaN
two    2001  1.7    Ohio  NaN
three  2002  3.6    Ohio  NaN
four   2001  2.4  Nevada  NaN
five   2002  2.9  Nevada  NaN>


In [85]:
print(frame2)
# .loc selects a row by its user-defined label.
print(f"\n {frame2.loc['three']}")
# .iloc selects a row by its integer position (0-based).
print(f"\n {frame2.iloc[3]}")

       year  pop   state debt
one    2000  1.5    Ohio  NaN
two    2001  1.7    Ohio  NaN
three  2002  3.6    Ohio  NaN
four   2001  2.4  Nevada  NaN
five   2002  2.9  Nevada  NaN

 year     2002
pop       3.6
state    Ohio
debt      NaN
Name: three, dtype: object

 year       2001
pop         2.4
state    Nevada
debt        NaN
Name: four, dtype: object

 one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64


In [96]:
# Positional selections with .iloc[rows, columns], printed in turn:
#   1. every row of the first column
#   2. the first two rows of the first two columns (slice ends are exclusive)
#   3. rows 0 and 2 of columns 0 and 2 (explicit position lists)
#   4. every row of the last column
for selection in (
    frame2.iloc[:, 0],
    frame2.iloc[0:2, 0:2],
    frame2.iloc[[0, 2], [0, 2]],
    frame2.iloc[:, -1],
):
    print('\n', selection)


 one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

      year  pop
one  2000  1.5
two  2001  1.7

        year state
one    2000  Ohio
three  2002  Ohio

 one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
Name: debt, dtype: object


### Index

In [97]:
# An Index can also be built directly from a list of values.
ind = pd.Index(data=[2, 3, 5, 7, 11])
# Bare last expression so the notebook displays the object.
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [7]:
# Imports for numpy and pandas live in the import cell at the top of the
# notebook instead of being repeated here mid-document.

# Two frames whose row and column labels only partly overlap.
df1 = pd.DataFrame(
    np.arange(9.).reshape((3, 3)),
    columns=list('bcd'),
    index=['Ohio', 'Texas', 'Colarado'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)

# Addition aligns on both row and column labels. The result covers the union
# of the labels, and any cell that is not present in both inputs becomes NaN.
df = df1 + df2

print(df1)
print(df2)
print(df)
df

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colarado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
            b   c     d   e
Colarado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


Unnamed: 0,b,c,d,e
Colarado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [21]:
# --- Sorting a Series ---
obj = pd.Series(range(4), index=list('deab'))
print("Sort by Index:\n", obj.sort_index())    # order by label
print("Sort by Values:\n", obj.sort_values())  # order by data

# --- Sorting a DataFrame ---
frame = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=['d', 'a', 'c', 'b'],
)

print('Original:\n', frame)
# Default axis: reorder the rows by their labels.
print('Sort:\n', frame.sort_index())
# axis=1: reorder the columns by their labels, ascending and then descending.
print('Sort axis=1:\n', frame.sort_index(axis=1))
print('Sort axis=1:\n', frame.sort_index(axis=1, ascending=False))

# Reorder rows by the values in one column, then by several columns.
print("Sort by Values:\n", frame.sort_values(by='b'))
print("Sort by multiple Values:\n", frame.sort_values(by=['a', 'b']))


Sort by Index:
 a    2
b    3
d    0
e    1
dtype: int64
Sort by Values:
 d    0
e    1
a    2
b    3
dtype: int64
Original:
        d  a  c  b
three  0  1  2  3
one    4  5  6  7
Sort:
        d  a  c  b
one    4  5  6  7
three  0  1  2  3
Sort axis=1:
        a  b  c  d
three  1  3  2  0
one    5  7  6  4
Sort axis=1:
        d  c  b  a
three  0  2  3  1
one    4  6  7  5
Sort by Values:
        d  a  c  b
three  0  1  2  3
one    4  5  6  7
Sort by multiple Values:
        d  a  c  b
three  0  1  2  3
one    4  5  6  7


### Ranking

In [30]:
obj = pd.Series([7, -5, 7, 4, 3, 0, 4])

# By default ranks are assigned in ascending order, and tied values each
# receive the average of the ranks they span.
print(obj.rank())

# The tie-breaking rule can be changed with `method`. 'first' ranks tied
# values in the order they appear in the data.
print(obj.rank(method='first'))

# Rank in descending order (the largest value gets rank 1).
print(obj.rank(ascending=False))
print(obj.rank(ascending=False, method='first'))

# The original data, for comparison with the ranks below.
print(obj)

# 'max' gives every tied value the highest rank of its group,
# 'min' gives every tied value the lowest rank of its group.
print(obj.rank(method='max'))
print(obj.rank(method='min'))

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64
0    1.0
1    7.0
2    2.0
3    3.0
4    5.0
5    6.0
6    4.0
dtype: float64
0    7
1   -5
2    7
3    4
4    3
5    0
6    4
dtype: int64
0    7.0
1    1.0
2    7.0
3    5.0
4    3.0
5    2.0
6    5.0
dtype: float64
0    6.0
1    1.0
2    6.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64


In [33]:
frame = pd.DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})

print("Original:\n", frame)
# axis=0 ranks the values inside each column, comparing down the rows.
# axis=1 ranks the values inside each row, comparing across the columns.
# The labels state this explicitly; the earlier "row wise" / "col wise"
# wording described the two cases the wrong way round.
print("Rank within each column (axis=0):\n", frame.rank(axis=0))
print("Rank within each row (axis=1):\n", frame.rank(axis=1))

Original:
      b  a    c
0  4.3  0 -2.0
1  7.0  1  5.0
2 -3.0  0  8.0
3  2.0  1 -2.5
Rank row wise:
      b    a    c
0  3.0  1.5  2.0
1  4.0  3.5  3.0
2  1.0  1.5  4.0
3  2.0  3.5  1.0
Rank col wise:
      b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0


In [36]:
# Index labels need not be unique: 'a' appears twice here.
obj = pd.Series(range(3), index=list('aba'))
# is_unique reports whether every label occurs exactly once.
print(obj.index.is_unique)

False
