# Functions not to forget

* These functions are not used frequently, but they come in handy for some tasks


## Exercise 1 - clip function

* Trim values at input threshold(s)

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG once, up front, so every np.random call in the
# notebook yields the same values on a fresh "Restart & Run All".
SEED = 42
np.random.seed(SEED)

# 5x5 frame of random integers in [-10, 10) (upper bound is exclusive); the
# negative values make both lower and upper clipping visible below.
df = pd.DataFrame(np.random.randint(-10, 10, size=(5, 5)), columns=list("ABCDE"))

df

Unnamed: 0,A,B,C,D,E
0,-7,-7,7,-3,-5
1,5,-4,2,2,-5
2,0,-7,8,9,8
3,-6,7,-3,1,-8
4,5,-5,-9,-10,-2


In [2]:
# Raise every value below -4 up to -4; values at or above -4 are left as-is.
df.clip(lower=-4)

Unnamed: 0,A,B,C,D,E
0,-4,-4,7,-3,-4
1,5,-4,2,2,-4
2,0,-4,8,9,8
3,-4,7,-3,1,-4
4,5,-4,-4,-4,-2


In [3]:
# Bound values on both sides: below -4 becomes -4, above 4 becomes 4.
df.clip(lower=-4, upper=4)

Unnamed: 0,A,B,C,D,E
0,-4,-4,4,-3,-4
1,4,-4,2,2,-4
2,0,-4,4,4,4
3,-4,4,-3,1,-4
4,4,-4,-4,-4,-2


In [4]:
# Re-display df: the clip calls above returned new frames, so df still holds
# its original, unclipped values.
df

Unnamed: 0,A,B,C,D,E
0,-7,-7,7,-3,-5
1,5,-4,2,2,-5
2,0,-7,8,9,8
3,-6,7,-3,1,-8
4,5,-5,-9,-10,-2


In [5]:
# inplace=True applies the clipping to df itself rather than returning a new
# frame, so displaying df afterwards shows the clipped values.
df.clip(lower=-4, upper=4, inplace=True)

df

Unnamed: 0,A,B,C,D,E
0,-4,-4,4,-3,-4
1,4,-4,2,2,-4
2,0,-4,4,4,4
3,-4,4,-3,1,-4
4,4,-4,-4,-4,-2


## Exercise 2 - diff function

* Calculates the difference of a DataFrame element compared with another element in the DataFrame (default is element in previous row).

In [6]:
# Build a fresh 5x5 frame of random integers in [0, 10) for the diff examples.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=10, size=(5, 5)),
    columns=list("ABCDE"),
)

df

Unnamed: 0,A,B,C,D,E
0,3,7,4,2,4
1,5,5,7,2,7
2,3,4,3,9,7
3,7,9,7,0,2
4,8,4,9,0,3


In [7]:
# Row-wise difference: each value minus the value one row above it in the same
# column. The first row has no predecessor, so it is all NaN.
df.diff()

Unnamed: 0,A,B,C,D,E
0,,,,,
1,2.0,-2.0,3.0,0.0,3.0
2,-2.0,-1.0,-4.0,7.0,0.0
3,4.0,5.0,4.0,-9.0,-5.0
4,1.0,-5.0,2.0,0.0,1.0


In [8]:
# axis=1 differences across columns instead: each value minus the value in the
# column to its left, so the first column (A) is all NaN.
df.diff(axis=1)

Unnamed: 0,A,B,C,D,E
0,,4,-3,-2,2
1,,0,2,-5,5
2,,1,-1,6,-2
3,,2,-2,-7,2
4,,-4,5,-9,3


## Exercise 3 - diff function (periods parameter)

In [9]:
# periods=1 is the default shift, so this gives the same result as df.diff().
df.diff(periods=1)

Unnamed: 0,A,B,C,D,E
0,,,,,
1,2.0,-2.0,3.0,0.0,3.0
2,-2.0,-1.0,-4.0,7.0,0.0
3,4.0,5.0,4.0,-9.0,-5.0
4,1.0,-5.0,2.0,0.0,1.0


In [10]:
# periods=2 subtracts the value two rows above; the first two rows are NaN.
df.diff(periods=2)

Unnamed: 0,A,B,C,D,E
0,,,,,
1,,,,,
2,0.0,-3.0,-1.0,7.0,3.0
3,2.0,4.0,0.0,-2.0,-5.0
4,5.0,0.0,6.0,-9.0,-4.0


In [11]:
# A negative period looks forward: each value minus the value one row below,
# so the last row is NaN.
df.diff(periods=-1)

Unnamed: 0,A,B,C,D,E
0,-2.0,2.0,-3.0,0.0,-3.0
1,2.0,1.0,4.0,-7.0,0.0
2,-4.0,-5.0,-4.0,9.0,5.0
3,-1.0,5.0,-2.0,0.0,-1.0
4,,,,,


In [12]:
# Re-display df to confirm that none of the diff calls modified it.
df

Unnamed: 0,A,B,C,D,E
0,3,7,4,2,4
1,5,5,7,2,7
2,3,4,3,9,7
3,7,9,7,0,2
4,8,4,9,0,3


## Exercise 4 - corr function

* Compute pairwise correlation of columns, excluding NA/null values.

In [13]:
# Load only the four numeric columns used by the correlation examples.
churn = pd.read_csv(
    filepath_or_buffer="Data/BankChurners.csv",
    usecols=[
        "Customer_Age",
        "Total_Trans_Ct",
        "Total_Trans_Amt",
        "Credit_Limit",
    ],
)

churn.head()

Unnamed: 0,Customer_Age,Credit_Limit,Total_Trans_Amt,Total_Trans_Ct
0,45,12691.0,1144,42
1,49,8256.0,1291,33
2,51,3418.0,1887,20
3,40,3313.0,1171,20
4,40,4716.0,816,28


In [14]:
# Correlation matrix for just two columns; the diagonal is 1.0 (each column
# with itself) and the off-diagonal entries are the pairwise coefficient.
churn[["Customer_Age", "Credit_Limit"]].corr()

Unnamed: 0,Customer_Age,Credit_Limit
Customer_Age,1.0,0.002476
Credit_Limit,0.002476,1.0


In [15]:
# Pairwise correlation across every column loaded into churn; the resulting
# matrix is symmetric.
churn.corr()

Unnamed: 0,Customer_Age,Credit_Limit,Total_Trans_Amt,Total_Trans_Ct
Customer_Age,1.0,0.002476,-0.046446,-0.067097
Credit_Limit,0.002476,1.0,0.17173,0.075927
Total_Trans_Amt,-0.046446,0.17173,1.0,0.807192
Total_Trans_Ct,-0.067097,0.075927,0.807192,1.0


## Exercise 5 - transform function

In [16]:
# Small 5x3 frame of random integers in [0, 10) for the transform examples.
# NumPy is already imported in the notebook's first cell, so it is not
# re-imported here.
df = pd.DataFrame(np.random.randint(0, 10, size=(5, 3)), columns=list("ABC"))

df

Unnamed: 0,A,B,C
0,7,7,7
1,7,1,9
2,6,2,2
3,2,4,3
4,8,8,6


In [17]:
# A NumPy function applied directly to the frame works element-wise and
# returns a DataFrame with the same row and column labels.
np.sqrt(df)

Unnamed: 0,A,B,C
0,2.645751,2.645751,2.645751
1,2.645751,1.0,3.0
2,2.44949,1.414214,1.414214
3,1.414214,2.0,1.732051
4,2.828427,2.828427,2.44949


In [18]:
# Same result as np.sqrt(df), expressed through DataFrame.transform.
df.transform(np.sqrt)

Unnamed: 0,A,B,C
0,2.645751,2.645751,2.645751
1,2.645751,1.0,3.0
2,2.44949,1.414214,1.414214
3,1.414214,2.0,1.732051
4,2.828427,2.828427,2.44949


In [19]:
# Passing a list applies every function to every column; the result has a
# two-level column index of (original column, function name).
df.transform([np.sqrt, np.square])

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,sqrt,square,sqrt,square,sqrt,square
0,2.645751,49,2.645751,49,2.645751,49
1,2.645751,49,1.0,1,3.0,81
2,2.44949,36,1.414214,4,1.414214,4
3,1.414214,4,2.0,16,1.732051,9
4,2.828427,64,2.828427,64,2.44949,36


## Exercise 6 - transform function (custom lambda)

In [20]:
# Any callable works, e.g. a lambda: square root of each value plus 0.002.
df.transform(lambda x: np.sqrt(x) + 0.002)

Unnamed: 0,A,B,C
0,2.647751,2.647751,2.647751
1,2.647751,1.002,3.002
2,2.45149,1.416214,1.416214
3,1.416214,2.002,1.734051
4,2.830427,2.830427,2.45149


## Exercise 7 - get_dummies function

In [21]:
# Reload churn with two text columns (Attrition_Flag, Gender) and two numeric
# columns for the get_dummies examples.
churn = pd.read_csv(
    filepath_or_buffer="Data/BankChurners.csv",
    usecols=[
        "Customer_Age",
        "Attrition_Flag",
        "Gender",
        "Credit_Limit",
    ],
)

churn.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Credit_Limit
0,Existing Customer,45,M,12691.0
1,Existing Customer,49,F,8256.0
2,Existing Customer,51,M,3418.0
3,Existing Customer,40,F,3313.0
4,Existing Customer,40,M,4716.0


In [22]:
# List the distinct values in Gender before encoding it.
churn["Gender"].unique()

array(['M', 'F'], dtype=object)

In [23]:
# One-hot encode the text columns: each category becomes its own boolean
# column named <column>_<category>; the numeric columns pass through unchanged.
pd.get_dummies(churn)

Unnamed: 0,Customer_Age,Credit_Limit,Attrition_Flag_Attrited Customer,Attrition_Flag_Existing Customer,Gender_F,Gender_M
0,45,12691.0,False,True,False,True
1,49,8256.0,False,True,True,False
2,51,3418.0,False,True,False,True
3,40,3313.0,False,True,True,False
4,40,4716.0,False,True,False,True
...,...,...,...,...,...,...
10122,50,4003.0,False,True,False,True
10123,41,4277.0,True,False,False,True
10124,44,5409.0,True,False,True,False
10125,30,5281.0,True,False,False,True


In [24]:
# Rebind churn to the encoded frame and preview its first rows.
churn = pd.get_dummies(churn)

churn.head()

Unnamed: 0,Customer_Age,Credit_Limit,Attrition_Flag_Attrited Customer,Attrition_Flag_Existing Customer,Gender_F,Gender_M
0,45,12691.0,False,True,False,True
1,49,8256.0,False,True,True,False
2,51,3418.0,False,True,False,True
3,40,3313.0,False,True,True,False
4,40,4716.0,False,True,False,True


## Exercise 8 - get_dummies function (drop_first)

In [25]:
# Reload the un-encoded columns so this exercise starts from the raw data.
churn = pd.read_csv(
    filepath_or_buffer="Data/BankChurners.csv",
    usecols=[
        "Customer_Age",
        "Attrition_Flag",
        "Gender",
        "Credit_Limit",
    ],
)

churn.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Credit_Limit
0,Existing Customer,45,M,12691.0
1,Existing Customer,49,F,8256.0
2,Existing Customer,51,M,3418.0
3,Existing Customer,40,F,3313.0
4,Existing Customer,40,M,4716.0


In [26]:
# drop_first=True omits the first category of each encoded column (here
# "Attrited Customer" and "F"), leaving one fewer dummy column per variable.
churn = pd.get_dummies(churn, drop_first=True)

churn.head()

Unnamed: 0,Customer_Age,Credit_Limit,Attrition_Flag_Existing Customer,Gender_M
0,45,12691.0,True,True
1,49,8256.0,True,False
2,51,3418.0,True,True
3,40,3313.0,True,False
4,40,4716.0,True,True


## Exercise 9 - from_dummies function

* New in pandas version 1.5.0.

In [27]:
# Reload the raw columns, then one-hot encode them so there are dummy columns
# to convert back with from_dummies.
churn = pd.read_csv(
    filepath_or_buffer="Data/BankChurners.csv",
    usecols=[
        "Customer_Age",
        "Attrition_Flag",
        "Gender",
        "Credit_Limit",
    ],
)
churn = pd.get_dummies(data=churn)

churn.head()

Unnamed: 0,Customer_Age,Credit_Limit,Attrition_Flag_Attrited Customer,Attrition_Flag_Existing Customer,Gender_F,Gender_M
0,45,12691.0,False,True,False,True
1,49,8256.0,False,True,True,False
2,51,3418.0,False,True,False,True
3,40,3313.0,False,True,True,False
4,40,4716.0,False,True,False,True


In [28]:
# Collapse the two Gender dummy columns back into one categorical column.
# No `sep` is given, so each label is the full dummy column name
# ("Gender_F" / "Gender_M") and the resulting column has no name.
pd.from_dummies(churn[["Gender_F", "Gender_M"]])

Unnamed: 0,Unnamed: 1
0,Gender_M
1,Gender_F
2,Gender_M
3,Gender_F
4,Gender_M
...,...
10122,Gender_M
10123,Gender_M
10124,Gender_F
10125,Gender_M


In [29]:
# Store the decoded labels in a new Gender column. The values keep the
# "Gender_" prefix because from_dummies was called without `sep`.
# NOTE(review): passing sep="_" should split the prefix off and give "F"/"M"
# as in the original data - confirm against the pandas docs before changing.
churn["Gender"] = pd.from_dummies(churn[["Gender_F", "Gender_M"]])

churn.head()

Unnamed: 0,Customer_Age,Credit_Limit,Attrition_Flag_Attrited Customer,Attrition_Flag_Existing Customer,Gender_F,Gender_M,Gender
0,45,12691.0,False,True,False,True,Gender_M
1,49,8256.0,False,True,True,False,Gender_F
2,51,3418.0,False,True,False,True,Gender_M
3,40,3313.0,False,True,True,False,Gender_F
4,40,4716.0,False,True,False,True,Gender_M


## Exercise 10 - add_prefix and add_suffix

In [30]:
# Build a 5x3 frame of random integers in [0, 10) whose column labels will be
# renamed in the next cells.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=10, size=(5, 3)),
    columns=list("ABC"),
)

df

Unnamed: 0,A,B,C
0,6,8,0
1,4,4,2
2,3,7,5
3,9,6,9
4,7,7,1


In [31]:
# Returns a frame whose column labels are prefixed with "col_"; df itself
# keeps its original labels.
df.add_prefix("col_")

Unnamed: 0,col_A,col_B,col_C
0,6,8,0
1,4,4,2
2,3,7,5
3,9,6,9
4,7,7,1


In [32]:
# Returns a frame whose column labels have "_new" appended.
df.add_suffix("_new")

Unnamed: 0,A_new,B_new,C_new
0,6,8,0
1,4,4,2
2,3,7,5
3,9,6,9
4,7,7,1
