### Transforming Data (Feature Engineering)

* apply()
* map()
* assign()
* replace(old, new)
* rename(columns = {old : new})
* sort_values()
* sort_index()
* reset_index()
* rank()
* Reorder columns

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("raw_data.csv")
df

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


In [5]:
df2 = df.copy()

In [6]:
# apply function

df2["tax"] = df2["income"].apply(lambda x : "20%" if x >= 50000 else "10%")
df2

Unnamed: 0,id,name,age,country,gender,income,tax
0,1,John Doe,29.0,USA,Male,55000.0,20%
1,1,John Doe,29.0,USA,Male,55000.0,20%
2,2,Jane Smith,,Canada,Female,62000.0,20%
3,3,Alex,,USA,Unknown,47000.0,10%
4,4,Maria Garcia,34.0,Spain,Female,,10%
5,5,Li Wei,27.0,China,Male,51000.0,20%
6,6,,45.0,India,Female,73000.0,20%
7,7,Ahmed Khan,38.0,,Male,68000.0,20%
8,8,Rachel Lee,29.0,USA,Female,62000.0,20%
9,9,Carlos Ruiz,,Mexico,Male,45000.0,10%


In [7]:
# map function

gender_map = {"Male" : "M", "Female" : "F"}
df2["gender"] = df2["gender"].map(gender_map)
df2

Unnamed: 0,id,name,age,country,gender,income,tax
0,1,John Doe,29.0,USA,M,55000.0,20%
1,1,John Doe,29.0,USA,M,55000.0,20%
2,2,Jane Smith,,Canada,F,62000.0,20%
3,3,Alex,,USA,,47000.0,10%
4,4,Maria Garcia,34.0,Spain,F,,10%
5,5,Li Wei,27.0,China,M,51000.0,20%
6,6,,45.0,India,F,73000.0,20%
7,7,Ahmed Khan,38.0,,M,68000.0,20%
8,8,Rachel Lee,29.0,USA,F,62000.0,20%
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%


In [8]:
# assign function

df2 = df2.assign(new_income = df2["income"] * 1.1)
df2


Unnamed: 0,id,name,age,country,gender,income,tax,new_income
0,1,John Doe,29.0,USA,M,55000.0,20%,60500.0
1,1,John Doe,29.0,USA,M,55000.0,20%,60500.0
2,2,Jane Smith,,Canada,F,62000.0,20%,68200.0
3,3,Alex,,USA,,47000.0,10%,51700.0
4,4,Maria Garcia,34.0,Spain,F,,10%,
5,5,Li Wei,27.0,China,M,51000.0,20%,56100.0
6,6,,45.0,India,F,73000.0,20%,80300.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,74800.0
8,8,Rachel Lee,29.0,USA,F,62000.0,20%,68200.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,49500.0


In [9]:
# replace function

df2["country"] = df["country"].replace("USA", "US")
df2

Unnamed: 0,id,name,age,country,gender,income,tax,new_income
0,1,John Doe,29.0,US,M,55000.0,20%,60500.0
1,1,John Doe,29.0,US,M,55000.0,20%,60500.0
2,2,Jane Smith,,Canada,F,62000.0,20%,68200.0
3,3,Alex,,US,,47000.0,10%,51700.0
4,4,Maria Garcia,34.0,Spain,F,,10%,
5,5,Li Wei,27.0,China,M,51000.0,20%,56100.0
6,6,,45.0,India,F,73000.0,20%,80300.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,74800.0
8,8,Rachel Lee,29.0,US,F,62000.0,20%,68200.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,49500.0


In [10]:
# rename function

df2 = df2.rename(columns={"income" : "salary"}) # to change column name
df2.rename(index={1 : "first"}) # to change row name

Unnamed: 0,id,name,age,country,gender,salary,tax,new_income
0,1,John Doe,29.0,US,M,55000.0,20%,60500.0
first,1,John Doe,29.0,US,M,55000.0,20%,60500.0
2,2,Jane Smith,,Canada,F,62000.0,20%,68200.0
3,3,Alex,,US,,47000.0,10%,51700.0
4,4,Maria Garcia,34.0,Spain,F,,10%,
5,5,Li Wei,27.0,China,M,51000.0,20%,56100.0
6,6,,45.0,India,F,73000.0,20%,80300.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,74800.0
8,8,Rachel Lee,29.0,US,F,62000.0,20%,68200.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,49500.0


In [None]:
# sort function

# df["income"].sort_values()
df.sort_values("income") # sorting in ascending order

Unnamed: 0,id,name,age,country,gender,income
9,9,Carlos Ruiz,,Mexico,Male,45000.0
3,3,Alex,,USA,Unknown,47000.0
5,5,Li Wei,27.0,China,Male,51000.0
1,1,John Doe,29.0,USA,Male,55000.0
0,1,John Doe,29.0,USA,Male,55000.0
10,10,Emily Davis,31.0,USA,,58000.0
2,2,Jane Smith,,Canada,Female,62000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
6,6,,45.0,India,Female,73000.0


In [None]:
df.sort_values(["income", "age"]) # shorting priority will be income but when there is 2 same income then it will check age

Unnamed: 0,id,name,age,country,gender,income
9,9,Carlos Ruiz,,Mexico,Male,45000.0
3,3,Alex,,USA,Unknown,47000.0
5,5,Li Wei,27.0,China,Male,51000.0
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
10,10,Emily Davis,31.0,USA,,58000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
2,2,Jane Smith,,Canada,Female,62000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
6,6,,45.0,India,Female,73000.0


In [17]:
sorted_df = df.sort_values("income", ascending=False) # sorting in descending order
sorted_df

Unnamed: 0,id,name,age,country,gender,income
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
2,2,Jane Smith,,Canada,Female,62000.0
10,10,Emily Davis,31.0,USA,,58000.0
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
5,5,Li Wei,27.0,China,Male,51000.0
3,3,Alex,,USA,Unknown,47000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


In [18]:
reset_idx =sorted_df.reset_index()
reset_idx

Unnamed: 0,index,id,name,age,country,gender,income
0,6,6,,45.0,India,Female,73000.0
1,7,7,Ahmed Khan,38.0,,Male,68000.0
2,8,8,Rachel Lee,29.0,USA,Female,62000.0
3,2,2,Jane Smith,,Canada,Female,62000.0
4,10,10,Emily Davis,31.0,USA,,58000.0
5,0,1,John Doe,29.0,USA,Male,55000.0
6,1,1,John Doe,29.0,USA,Male,55000.0
7,5,5,Li Wei,27.0,China,Male,51000.0
8,3,3,Alex,,USA,Unknown,47000.0
9,9,9,Carlos Ruiz,,Mexico,Male,45000.0


In [None]:
reset_idx =sorted_df.reset_index(drop = True) # if we don't want to preserve the original index
reset_idx

Unnamed: 0,id,name,age,country,gender,income
0,6,,45.0,India,Female,73000.0
1,7,Ahmed Khan,38.0,,Male,68000.0
2,8,Rachel Lee,29.0,USA,Female,62000.0
3,2,Jane Smith,,Canada,Female,62000.0
4,10,Emily Davis,31.0,USA,,58000.0
5,1,John Doe,29.0,USA,Male,55000.0
6,1,John Doe,29.0,USA,Male,55000.0
7,5,Li Wei,27.0,China,Male,51000.0
8,3,Alex,,USA,Unknown,47000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


In [None]:
# rank method 

sorted_df["ranking"] = sorted_df["income"].rank() # ranking in ascending order
sorted_df # when there is tie it return the average of that value (like when 5 and 4 is tie then it will return 4.5)

Unnamed: 0,id,name,age,country,gender,income,ranking
6,6,,45.0,India,Female,73000.0,10.0
7,7,Ahmed Khan,38.0,,Male,68000.0,9.0
8,8,Rachel Lee,29.0,USA,Female,62000.0,7.5
2,2,Jane Smith,,Canada,Female,62000.0,7.5
10,10,Emily Davis,31.0,USA,,58000.0,6.0
0,1,John Doe,29.0,USA,Male,55000.0,4.5
1,1,John Doe,29.0,USA,Male,55000.0,4.5
5,5,Li Wei,27.0,China,Male,51000.0,3.0
3,3,Alex,,USA,Unknown,47000.0,2.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0,1.0


In [21]:
sorted_df["ranking"] = sorted_df["income"].rank(ascending=False) # ranking in decending order
sorted_df

Unnamed: 0,id,name,age,country,gender,income,ranking
6,6,,45.0,India,Female,73000.0,1.0
7,7,Ahmed Khan,38.0,,Male,68000.0,2.0
8,8,Rachel Lee,29.0,USA,Female,62000.0,3.5
2,2,Jane Smith,,Canada,Female,62000.0,3.5
10,10,Emily Davis,31.0,USA,,58000.0,5.0
0,1,John Doe,29.0,USA,Male,55000.0,6.5
1,1,John Doe,29.0,USA,Male,55000.0,6.5
5,5,Li Wei,27.0,China,Male,51000.0,8.0
3,3,Alex,,USA,Unknown,47000.0,9.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0,10.0


In [22]:
# resolve tie

sorted_df["ranking"] = sorted_df["income"].rank(ascending=False, method="min")
sorted_df

Unnamed: 0,id,name,age,country,gender,income,ranking
6,6,,45.0,India,Female,73000.0,1.0
7,7,Ahmed Khan,38.0,,Male,68000.0,2.0
8,8,Rachel Lee,29.0,USA,Female,62000.0,3.0
2,2,Jane Smith,,Canada,Female,62000.0,3.0
10,10,Emily Davis,31.0,USA,,58000.0,5.0
0,1,John Doe,29.0,USA,Male,55000.0,6.0
1,1,John Doe,29.0,USA,Male,55000.0,6.0
5,5,Li Wei,27.0,China,Male,51000.0,8.0
3,3,Alex,,USA,Unknown,47000.0,9.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0,10.0
