In [None]:
# Shift function of pandas

In [3]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'DATE': [1, 2, 3, 4, 5],
                   'VOLUME': [100, 200, 300,400,500],
                   'PRICE': [214, 234, 253,272,291]})
print(df)

   DATE  PRICE  VOLUME
0     1    214     100
1     2    234     200
2     3    253     300
3     4    272     400
4     5    291     500


In [4]:
df.shift(1)

Unnamed: 0,DATE,PRICE,VOLUME
0,,,
1,1.0,214.0,100.0
2,2.0,234.0,200.0
3,3.0,253.0,300.0
4,4.0,272.0,400.0


In [10]:
df.shift(3,fill_value=100)

Unnamed: 0,DATE,PRICE,VOLUME
0,100,100,100
1,100,100,100
2,100,100,100
3,1,214,100
4,2,234,200


In [11]:
df['PREV_DAY_PRICE'] = df['PRICE'].shift(1,fill_value=0)
print(df)

   DATE  PRICE  VOLUME  PREV_DAY_PRICE
0     1    214     100               0
1     2    234     200             214
2     3    253     300             234
3     4    272     400             253
4     5    291     500             272


In [12]:
df['TOMORROW_PRICE'] = df['PRICE'].shift(-1,fill_value=0)

In [13]:
df

Unnamed: 0,DATE,PRICE,VOLUME,PREV_DAY_PRICE,TOMORROW_PRICE
0,1,214,100,0,234
1,2,234,200,214,253
2,3,253,300,234,272
3,4,272,400,253,291
4,5,291,500,272,0


In [15]:
# mask()
# The mask method applies an if-then condition to each element of a Series or DataFrame.
# Where cond is True, the element is replaced by the value from other (NaN by default); otherwise the original value is kept.
# mask() is closely related to where(), which does the opposite: it replaces elements where cond is False.


In [16]:
df = pd.DataFrame(np.arange(15).reshape(-1, 3), columns=['A', 'B','C'])
print(df)

    A   B   C
0   0   1   2
1   3   4   5
2   6   7   8
3   9  10  11
4  12  13  14


In [17]:
# Use mask() to find the elements that are divisible by 2 with no remainder; where the condition holds, the element is replaced by its negation, otherwise the original value is kept.

df.mask(df % 2 == 0,-df)


Unnamed: 0,A,B,C
0,0,1,-2
1,3,-4,5
2,-6,7,-8
3,9,-10,11
4,-12,13,-14


In [20]:
# nlargest()
# We often need to find the top or bottom few values of a Series or DataFrame
# (for example, the three top-performing students by aggregate score, or the three candidates with the fewest
# votes in an election).
                                                                                                                            
import pandas as pd
import numpy as np
df = pd.DataFrame({'HEIGHT': [170,78,99,160,160,130,155,70,70,20],
                   'WEIGHT': [50,60,70,80,90,90,90,50,60,70]},
                   index=['A','B','C','D','E','F','G','H','I','J'])
print(df)

   HEIGHT  WEIGHT
A     170      50
B      78      60
C      99      70
D     160      80
E     160      90
F     130      90
G     155      90
H      70      50
I      70      60
J      20      70


In [21]:
dfl = df.nlargest(3,'HEIGHT')
print(dfl)

   HEIGHT  WEIGHT
A     170      50
D     160      80
E     160      90


In [22]:
dfl = df.nlargest(2,'HEIGHT',keep='all')
print(dfl)

   HEIGHT  WEIGHT
A     170      50
D     160      80
E     160      90


In [23]:
dfl = df.nlargest(2,'HEIGHT',keep='last')
print(dfl)

   HEIGHT  WEIGHT
A     170      50
E     160      90


In [24]:
dfl = df.nlargest(2,'HEIGHT',keep='first')
print(dfl)

   HEIGHT  WEIGHT
A     170      50
D     160      80


In [25]:
# nsmallest()
# nsmallest() works the same way as nlargest() but selects the smallest values. In the example below,
# we use WEIGHT to find the rows with the three smallest weights.
import pandas as pd
import numpy as np
df = pd.DataFrame({'HEIGHT': [170,78,99,160,160,130,155,70,70,20],
                   'WEIGHT': [50,60,70,80,90,90,90,50,60,70]},
                   index=['A','B','C','D','E','F','G','H','I','J'])
print(df)

   HEIGHT  WEIGHT
A     170      50
B      78      60
C      99      70
D     160      80
E     160      90
F     130      90
G     155      90
H      70      50
I      70      60
J      20      70


In [27]:
dfs = df.nsmallest(3,'WEIGHT', keep='first')
print(dfs)

   HEIGHT  WEIGHT
A     170      50
H      70      50
B      78      60


In [28]:
dfs = df.nsmallest(3,'WEIGHT', keep='last')
print(dfs)

   HEIGHT  WEIGHT
H      70      50
A     170      50
I      70      60


In [29]:
dfs = df.nsmallest(3,'WEIGHT', keep='all')
print(dfs)

   HEIGHT  WEIGHT
A     170      50
H      70      50
B      78      60
I      70      60


In [30]:
# Cut function for binning
# Sometimes numerical values make more sense when grouped together. For example, suppose we are trying to model traffic
# (number of cars on the road) against the time of day (in minutes). The exact minute of an hour might not be that relevant for predicting
# traffic compared with the actual period of the day, such as “Morning”, “Afternoon”, “Evening”, “Night”, “Late Night”.
# Modeling traffic this way is more intuitive and helps avoid overfitting.

In [42]:
#Binning:
def binning(col, cut_points, labels=None):
  """Bin the values of ``col`` into intervals delimited by ``cut_points``.

  The minimum and maximum of ``col`` are added as the outermost bin edges,
  so ``len(cut_points) + 1`` bins are produced. The lowest edge is inclusive
  (``include_lowest=True``), so the minimum value itself is binned.

  Parameters
  ----------
  col : Series-like
      Values to bin; must provide ``.min()`` and ``.max()``.
  cut_points : iterable of numbers
      Interior bin edges. Any iterable (list, tuple, array) is accepted and
      the caller's object is not modified.
      NOTE(review): the edges are passed to ``pd.cut`` as-is, so they are
      presumably required to be increasing and to lie strictly between the
      column's min and max — confirm against the pandas documentation.
  labels : sequence, optional
      One label per bin. When omitted (``None``, ``False`` or an empty
      sequence) the integer labels ``0 .. len(cut_points)`` are used.

  Returns
  -------
  The result of ``pd.cut`` applied to ``col`` with the computed edges.
  """
  # Copy into a plain list: ``[x] + tuple`` raises TypeError and
  # ``[x] + ndarray`` silently broadcasts instead of concatenating.
  inner_edges = list(cut_points)

  # Outermost edges come from the data itself.
  break_points = [col.min()] + inner_edges + [col.max()]

  # ``not labels`` is ambiguous for array-likes, so test the "no labels"
  # cases explicitly (None, False, or an empty sequence).
  use_default_labels = (
    labels is None
    or labels is False
    or (hasattr(labels, "__len__") and len(labels) == 0)
  )
  if use_default_labels:
    # Default labels 0 ... n-1, materialised as a list for pd.cut.
    labels = list(range(len(inner_edges) + 1))

  # Binning using the cut function of pandas
  return pd.cut(col, bins=break_points, labels=labels, include_lowest=True)



In [44]:
# pd.cut: Bin values into discrete intervals.
# Use `cut` when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a 
# categorical variable

# For example, `cut` could convert ages to groups of age ranges. Supports binning into an equal number of bins, or a
# pre-specified array of bins.

In [43]:
# Bin the loan amounts into four labelled ranges.
# cut_points are the interior edges; binning() adds the column's min and max as the outer edges.
cut_points = [90,140,190]
labels = ["low","medium","high","very high"]
data = pd.DataFrame(data={'LoanAmount':[-1,-2,0,10,20,70,99,101,110,143,155,199,198,440,556]})
data["LoanAmount_Bin"] = binning(data["LoanAmount"], cut_points, labels)
# print() as a function (the Python 2 print statement is a SyntaxError on Python 3), and
# Series.value_counts in place of the deprecated top-level pd.value_counts.
# sort=False requests the counts without sorting by frequency.
print(data["LoanAmount_Bin"].value_counts(sort=False))

[-2, 90, 140, 190, 556]
low          6
medium       3
high         2
very high    4
Name: LoanAmount_Bin, dtype: int64
