#### Iteration

In [2]:
import pandas as pd
import numpy as np

In [10]:
#Basic iteration (for i in object) produces −
# Series : values
# DataFrame : column labels

N=20
df = pd.DataFrame({
   'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
   'x': np.linspace(0,stop=N-1,num=N),
   'y': np.random.rand(N),
   'C': np.random.choice(['Low','Medium','High'],N).tolist(),
   'D': np.random.normal(100, 10, size=(N)).tolist()
   })

for col in df:
   print(col)

print("============")

# To iterate over the rows of the DataFrame, we can use the following functions −

# iteritems() − to iterate over the (key,value) pairs
# iterrows() − iterate over the rows as (index,series) pairs
# itertuples() − iterate over the rows as namedtuples

 
# iteritems : iterates over each column as key, value pair with label as key and column value as a Series object.
df = pd.DataFrame(np.random.randn(4,3),columns=['col1','col2','col3'])
for key,value in df.iteritems():
   print (key)
   print (value)
    
print("============")

# iterrows() :  iterator yielding each index value along with a series containing the data in each row.
for row_index,row in df.iterrows():
   print (row_index)
   print (row)

print("============")

# itertuples() : return an iterator yielding a named tuple for each row in the DataFrame. The first element of the tuple will be the row’s corresponding index value, while the remaining values are the row values.
for row in df.itertuples():
    print (row)
    
# Note − Do not try to modify any object while iterating. Iterating is meant for reading and the iterator returns a copy of the original object (a view), thus the changes will not reflect on the original object.

A
x
y
C
D
col1
0    1.174064
1    0.129846
2    2.165899
3   -0.942980
Name: col1, dtype: float64
col2
0   -0.422369
1    0.204788
2   -1.957589
3    0.596280
Name: col2, dtype: float64
col3
0    0.016368
1   -0.725480
2   -0.556309
3    0.041517
Name: col3, dtype: float64
0
col1    1.174064
col2   -0.422369
col3    0.016368
Name: 0, dtype: float64
1
col1    0.129846
col2    0.204788
col3   -0.725480
Name: 1, dtype: float64
2
col1    2.165899
col2   -1.957589
col3   -0.556309
Name: 2, dtype: float64
3
col1   -0.942980
col2    0.596280
col3    0.041517
Name: 3, dtype: float64
Pandas(Index=0, col1=1.174063651573622, col2=-0.4223688710109621, col3=0.01636825016123445)
Pandas(Index=1, col1=0.129846301717185, col2=0.2047884204244064, col3=-0.725480336893298)
Pandas(Index=2, col1=2.1658990918836243, col2=-1.9575891561324552, col3=-0.556309490290224)
Pandas(Index=3, col1=-0.9429797451191785, col2=0.5962803424149622, col3=0.04151705078192355)


#### Sorting

In [15]:
unsorted_df=pd.DataFrame(np.random.randn(10,2),index=[1,4,6,2,3,5,9,8,0,7],columns=['col2','col1'])
print(unsorted_df)
print("============")

# by label
sorted_df=unsorted_df.sort_index()
print (sorted_df)
print("============")

sorted_df = unsorted_df.sort_index(ascending=False)
print (sorted_df)
print("============")

sorted_df=unsorted_df.sort_index(axis=1)
print (sorted_df)
print("============")

# by value
sorted_df = unsorted_df.sort_values(by='col1')
print (sorted_df)
print("============")

sorted_df = unsorted_df.sort_values(by=['col1','col2']) # sort by col1, then col2
print (sorted_df)
print("============")

       col2      col1
1 -0.790191  0.069278
4 -1.864903  1.097703
6 -1.983841 -0.750352
2  0.111709 -1.547924
3 -0.763949 -0.860989
5 -0.558386 -0.104880
9  0.018769  2.420888
8 -1.269009  0.085033
0 -0.524631 -0.707591
7  0.952823  0.726951
       col2      col1
0 -0.524631 -0.707591
1 -0.790191  0.069278
2  0.111709 -1.547924
3 -0.763949 -0.860989
4 -1.864903  1.097703
5 -0.558386 -0.104880
6 -1.983841 -0.750352
7  0.952823  0.726951
8 -1.269009  0.085033
9  0.018769  2.420888
       col2      col1
9  0.018769  2.420888
8 -1.269009  0.085033
7  0.952823  0.726951
6 -1.983841 -0.750352
5 -0.558386 -0.104880
4 -1.864903  1.097703
3 -0.763949 -0.860989
2  0.111709 -1.547924
1 -0.790191  0.069278
0 -0.524631 -0.707591
       col1      col2
1  0.069278 -0.790191
4  1.097703 -1.864903
6 -0.750352 -1.983841
2 -1.547924  0.111709
3 -0.860989 -0.763949
5 -0.104880 -0.558386
9  2.420888  0.018769
8  0.085033 -1.269009
0 -0.707591 -0.524631
7  0.726951  0.952823
       col2      col1
2  0.11170

#### Working with Text Data

In [31]:
s = pd.Series(['Tom', 'William Rick', 'John', 'Alber@t', np.nan, '1234','SteveSmith'])
print (s)
print("============")

print (s.str.lower()) 
print("============")
print (s.str.len())
print("============")
print (s.str.split(' '))
print("============")
print (s.str.cat(sep='_'))
print("============")
print (s.str.contains(' '))
print("============")
print (s.str.replace('@','$'))
print("============")
print (s.str.repeat(3))
print("============")
print (s.str.count('i')) # to count patter
print("============")
print (s.str. startswith ('T'))
print("============")
print (s.str.find('e'))
print("============")
print (s.str.findall('e'))
print("============")
print (s.str.isnumeric())

# upper, isUpper, isLower, swapCase, endswith, strip ( to remove whitespace or newline from both sides)

0             Tom
1    William Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object
0             tom
1    william rick
2            john
3         alber@t
4             NaN
5            1234
6      stevesmith
dtype: object
0     3.0
1    12.0
2     4.0
3     7.0
4     NaN
5     4.0
6    10.0
dtype: float64
0              [Tom]
1    [William, Rick]
2             [John]
3          [Alber@t]
4                NaN
5             [1234]
6       [SteveSmith]
dtype: object
Tom_William Rick_John_Alber@t_1234_SteveSmith
0    False
1     True
2    False
3    False
4      NaN
5    False
6    False
dtype: object
0             Tom
1    William Rick
2            John
3         Alber$t
4             NaN
5            1234
6      SteveSmith
dtype: object
0                               TomTomTom
1    William RickWilliam RickWilliam Rick
2                            JohnJohnJohn
3                   Alber@tAlber@tAlber@t
4                            

In [33]:
# get_dummies() : Returns the DataFrame with One-Hot Encoded values.
s = pd.Series(['Tom', 'William Rick', 'John', 'Alber@t', np.nan, '1234','SteveSmith'])
print (s.str.get_dummies())

   1234  Alber@t  John  SteveSmith  Tom  William Rick
0     0        0     0           0    1             0
1     0        0     0           0    0             1
2     0        0     1           0    0             0
3     0        1     0           0    0             0
4     0        0     0           0    0             0
5     1        0     0           0    0             0
6     0        0     0           1    0             0


#### Options and Customization

In [38]:
# get_option(param)
print (pd.get_option("display.max_rows"))
print (pd.get_option("display.max_columns"))

# set_option(param,value)
pd.set_option("display.max_rows",80)
print (pd.get_option("display.max_rows"))

pd.set_option("display.max_columns",30)
print (pd.get_option("display.max_columns"))

# reset_option(param)
pd.reset_option("display.max_rows")
print (pd.get_option("display.max_rows"))

# describe_option(param)
pd.describe_option("display.max_rows")

# option_context() : set the option in with statement temporarily. Option values are restored automatically when you exit the with bloc
with pd.option_context("display.max_rows",10):
   print(pd.get_option("display.max_rows"))
print(pd.get_option("display.max_rows"))

# display.max_colwidth, display.precision ......

60
30
80
30
60
display.max_rows : int
    If max_rows is exceeded, switch to truncate view. Depending on
    `large_repr`, objects are either centrally truncated or printed as
    a summary view. 'None' value means unlimited.

    In case python/IPython is running in a terminal and `large_repr`
    equals 'truncate' this can be set to 0 and pandas will auto-detect
    the height of the terminal and print a truncated object which fits
    the screen height. The IPython notebook, IPython qtconsole, or
    IDLE do not run in a terminal and hence it is not possible to do
    correct auto-detection.
    [default: 60] [currently: 60]
10
60


#### Indexing and Selecting Data

In [44]:
# .loc()
# label based indexing. When slicing, the start bound is also included. Integers are valid labels, but they refer to the label and not the position.

#.loc() has multiple access methods like −
#- A single scalar label
#- A list of labels
#- A slice object
#- A Boolean array

df = pd.DataFrame(np.random.randn(8, 4), index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])
print(df)
print("============")
#select all rows for a specific column
print (df.loc[:,'A'])
print("============")

# Select all rows for multiple columns, say list[]
print (df.loc[:,['A','C']])
print("============")

# Select few rows for multiple columns, say list[]
print (df.loc[['a','b','f','h'],['A','C']])
print("============")

# Select range of rows for all columns
print (df.loc['a':'h'])
print("============")

# for getting values with a boolean array
print (df.loc['a']>0)

          A         B         C         D
a -0.236807  0.937865 -2.888951  0.267290
b  1.069261 -0.723420  0.314174 -0.030322
c  0.602832  0.411329 -0.245838 -0.773086
d  0.481549  0.825248  0.079149  0.303322
e -1.404035 -0.471700 -0.100087  0.036535
f -0.677829  0.229500 -2.962171  1.199166
g  1.031731 -1.169315 -0.614924  0.183039
h  0.164384 -0.976441 -0.346737 -0.378387
a   -0.236807
b    1.069261
c    0.602832
d    0.481549
e   -1.404035
f   -0.677829
g    1.031731
h    0.164384
Name: A, dtype: float64
          A         C
a -0.236807 -2.888951
b  1.069261  0.314174
c  0.602832 -0.245838
d  0.481549  0.079149
e -1.404035 -0.100087
f -0.677829 -2.962171
g  1.031731 -0.614924
h  0.164384 -0.346737
          A         C
a -0.236807 -2.888951
b  1.069261  0.314174
f -0.677829 -2.962171
h  0.164384 -0.346737
          A         B         C         D
a -0.236807  0.937865 -2.888951  0.267290
b  1.069261 -0.723420  0.314174 -0.030322
c  0.602832  0.411329 -0.245838 -0.773086
d  0.48154

In [56]:
#.iloc()
# Pandas provide various methods in order to get purely integer based indexing. Like python and numpy, these are 0-based indexing.

# The various access methods are as follows −
# -An Integer
# -A list of integers
# -A range of values

# select first 3 rows
print (df.iloc[:3])
print("============")

# Integer slicing
print (df.iloc[:4])
print("============")
print (df.iloc[1:5, 2:4])
print("============")

# Slicing through list of values
print (df.iloc[[1, 3, 5], [1, 3]])

          A         B         C         D
a -0.236807  0.937865 -2.888951  0.267290
b  1.069261 -0.723420  0.314174 -0.030322
c  0.602832  0.411329 -0.245838 -0.773086
          A         B         C         D
a -0.236807  0.937865 -2.888951  0.267290
b  1.069261 -0.723420  0.314174 -0.030322
c  0.602832  0.411329 -0.245838 -0.773086
d  0.481549  0.825248  0.079149  0.303322
          C         D
b  0.314174 -0.030322
c -0.245838 -0.773086
d  0.079149  0.303322
e -0.100087  0.036535
          B         D
b -0.723420 -0.030322
d  0.825248  0.303322
f  0.229500  1.199166


In [57]:
# Use of Notations
print (df['A'])
print("============")
print (df[['A','B']])
print("============")
print (df[2:2])

a   -0.236807
b    1.069261
c    0.602832
d    0.481549
e   -1.404035
f   -0.677829
g    1.031731
h    0.164384
Name: A, dtype: float64
          A         B
a -0.236807  0.937865
b  1.069261 -0.723420
c  0.602832  0.411329
d  0.481549  0.825248
e -1.404035 -0.471700
f -0.677829  0.229500
g  1.031731 -1.169315
h  0.164384 -0.976441
Empty DataFrame
Columns: [A, B, C, D]
Index: []


In [58]:
# Attribute Access
# Columns can be selected using the attribute operator '.'.
print(df.A)

a   -0.236807
b    1.069261
c    0.602832
d    0.481549
e   -1.404035
f   -0.677829
g    1.031731
h    0.164384
Name: A, dtype: float64


#### Statistical Functions

In [66]:
# Percent_change
# pct_change(): This function compares every element with its prior element and computes the change percentage.

s = pd.Series([1,2,3,4,5,4])
print (s)
print (s.pct_change())
print("============")

df = pd.DataFrame(np.random.randn(5, 2))
print (df)
print (df.pct_change())
print("============")

# covariance : The Series object has a method cov to compute covariance between series objects.

s1 = pd.Series(np.random.randn(10))
s2 = pd.Series(np.random.randn(10))
print (s1.cov(s2))
print("============")

# Covariance method when applied on a DataFrame, computes cov between all the columns.

frame = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
print (frame['a'].cov(frame['b']))
print (frame.cov())
print("============")

# Correlation
# Correlation shows the linear relationship between any two array of values (series). 
print (frame['a'].corr(frame['b']))
print (frame.corr())
print("============")

# Data Ranking
# Data Ranking produces ranking for each element in the array of elements. In case of ties, assigns the mean rank.
# supports tie-breaking methods, specified with the method parameter : average, min, max, first
s = pd.Series(np.random.randn(5), index=list('abcde'))
s['d'] = s['b'] # so there's a tie
print (s.rank())

0    1
1    2
2    3
3    4
4    5
5    4
dtype: int64
0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.250000
5   -0.200000
dtype: float64
          0         1
0 -0.443506 -0.062079
1  0.582401  0.936544
2 -1.215541  1.050853
3 -0.394864  0.888063
4  0.150859 -0.682977
          0          1
0       NaN        NaN
1 -2.313174 -16.086402
2 -3.087123   0.122055
3 -0.675154  -0.154912
4 -1.382054  -1.769064
-0.018025207547768266
-0.14448079734769811
          a         b         c         d         e
a  1.671429 -0.144481 -0.178908 -0.659685 -0.472027
b -0.144481  0.923370  0.041094  0.348036  0.091978
c -0.178908  0.041094  0.546733  0.356987 -0.085573
d -0.659685  0.348036  0.356987  0.741090  0.134075
e -0.472027  0.091978 -0.085573  0.134075  1.517710
-0.1162995909239505
          a         b         c         d         e
a  1.000000 -0.116300 -0.187153 -0.592730 -0.296366
b -0.116300  1.000000  0.057836  0.420727  0.077696
c -0.187153  0.057836  1.000000  0.560827 -0.

#### Window Functions

Window functions are majorly used in finding the trends within the data graphically by smoothing the curve. If there is lot of variation in the everyday data and a lot of data points are available, then taking the samples and plotting is one method and applying the window computations and plotting the graph on the results is another method. By these methods, we can smooth the curve or the trend.

In [71]:
df = pd.DataFrame(np.random.randn(10, 4),
   index = pd.date_range('1/1/2000', periods=10),
   columns = ['A', 'B', 'C', 'D'])
print (df)
print("============")

# .rolling() Function
print (df.rolling(window=3).mean())
print("============")

# expanding 
print (df.expanding(min_periods=3).mean())
print("============")

# emw -> Need to check
print (df.ewm(com=0.5).mean()) 

                   A         B         C         D
2000-01-01  1.356288 -0.392409 -1.792914 -1.403462
2000-01-02  0.702072 -0.993849  1.352042  2.648213
2000-01-03  0.749066 -0.331487 -0.332236 -0.963730
2000-01-04  0.476884 -0.420764 -0.594480  0.397436
2000-01-05 -0.605740 -0.158157  0.393586 -1.119445
2000-01-06 -0.080415  1.162418  1.027936 -0.270399
2000-01-07 -0.577999  0.527071 -0.087764  2.175051
2000-01-08 -0.851448  0.761811  0.144894  0.365379
2000-01-09  0.672145  1.254644  2.043468  0.636084
2000-01-10 -0.596173  1.527416  0.056050  0.097289
                   A         B         C         D
2000-01-01       NaN       NaN       NaN       NaN
2000-01-02       NaN       NaN       NaN       NaN
2000-01-03  0.935809 -0.572582 -0.257703  0.093674
2000-01-04  0.642674 -0.582034  0.141775  0.693973
2000-01-05  0.206737 -0.303469 -0.177710 -0.561913
2000-01-06 -0.069757  0.194499  0.275681 -0.330803
2000-01-07 -0.421384  0.510444  0.444586  0.261736
2000-01-08 -0.503287  0.817100 

In [78]:
#### Aggregation

df = pd.DataFrame(np.random.randn(10, 4),
   index = pd.date_range('1/1/2000', periods=10),
   columns = ['A', 'B', 'C', 'D'])
print (df)
print("============")


r = df.rolling(window=3,min_periods=1)
print (r)
print("============")

print (r.aggregate(np.sum))
print("============")

print (r['A'].aggregate(np.sum))
print("============")

print (r[['A','B']].aggregate(np.sum))
print("============")

print (r['A'].aggregate([np.sum,np.mean]))
print("============")

print (r[['A','B']].aggregate([np.sum,np.mean]))
print("============")

print (r.aggregate({'A' : np.sum,'B' : np.mean}))
print("============")

                   A         B         C         D
2000-01-01 -0.770244  0.601120  1.099879 -0.853355
2000-01-02 -0.623771  0.900741  0.413312 -1.918746
2000-01-03  0.867965  0.462481 -1.467903 -1.383589
2000-01-04 -0.980524  0.734016  0.180587 -0.272521
2000-01-05  0.812611  1.699361 -1.147465  0.489294
2000-01-06  0.584713 -0.521599 -1.365379 -0.541559
2000-01-07 -0.006718 -0.058113 -0.486110  0.473967
2000-01-08 -1.166000  0.306662 -1.607847 -0.769249
2000-01-09  0.316361 -0.422530 -0.616206  0.041472
2000-01-10 -1.173398 -0.439290 -0.651852 -0.891006
Rolling [window=3,min_periods=1,center=False,axis=0]
                   A         B         C         D
2000-01-01 -0.770244  0.601120  1.099879 -0.853355
2000-01-02 -1.394014  1.501861  1.513192 -2.772101
2000-01-03 -0.526049  1.964342  0.045289 -4.155690
2000-01-04 -0.736329  2.097238 -0.874004 -3.574856
2000-01-05  0.700053  2.895858 -2.434781 -1.166816
2000-01-06  0.416800  1.911778 -2.332258 -0.324786
2000-01-07  1.390606  1.11964