# Data Wrangling - Handling dataframes with pandas

In [1]:
import pandas as pd

In [2]:
# Create a dataframe.
#
# Every column is declared in a single mapping, so the whole schema (including
# "dob") is visible in one place. The key order fixes the column order:
# age, name, id, dob.
# Note: the "dob" strings are day-month-year and mix two- and four-digit years.

df = pd.DataFrame(
    {
        "age": [32, 26, 20, 40, 17, 23],
        "name": ["Rob", "Cheesy", "Indie", "Sam", "Chris", "Ben"],
        "id": [2, 4, 7, 1, 3, 5],
        "dob": ['25-09-76', '12-02-79', '28-11-18', '02-03-11', '09-12-2006', '18-09-2002'],
    }
)

df

Unnamed: 0,age,name,id,dob
0,32,Rob,2,25-09-76
1,26,Cheesy,4,12-02-79
2,20,Indie,7,28-11-18
3,40,Sam,1,02-03-11
4,17,Chris,3,09-12-2006
5,23,Ben,5,18-09-2002


In [6]:
# First rows of the dataframe (here the first two)

df.head(n=2)

Unnamed: 0,age,name,id,dob
0,32,Rob,2,25-09-76
1,26,Cheesy,4,12-02-79


In [4]:
# Last rows of the dataframe (here the last two)

df.tail(n=2)

Unnamed: 0,age,name,id,dob
4,17,Chris,3,09-12-2006
5,23,Ben,5,18-09-2002


In [7]:
# Head and tail again, this time through print(), which writes the plain-text
# rendering of each slice under its caption.

for caption, rows in (("Top rows", df.head(2)), ("Bottom rows", df.tail(2))):
    print(caption)
    print(rows)

Top rows
   age    name  id       dob
0   32     Rob   2  25-09-76
1   26  Cheesy   4  12-02-79
Bottom rows
   age   name  id         dob
4   17  Chris   3  09-12-2006
5   23    Ben   5  18-09-2002


In [8]:
# Selecting a contiguous block of data with iloc ("integer location").
# The indexer is written as [rows, columns], both given as positions.
# Python counts from zero, and a slice includes its start but excludes its
# stop: 1:4 picks rows 1, 2 and 3, and 1:3 picks columns 1 and 2.

print("Showing 2 columns and 3 rows, starting from the 2nd row and 2nd columns")

df_new = df.iloc[1:4, 1:3]

df_new

Showing 2 columns and 3 rows, starting from the 2nd row and 2nd columns


Unnamed: 0,name,id
1,Cheesy,4
2,Indie,7
3,Sam,1


In [9]:
# Picking out specific columns by name; the result keeps the order of the list.

print("Just showing id and age")

wanted_columns = ["id", "age"]
df_new2 = df[wanted_columns]

df_new2

Just showing id and age


Unnamed: 0,id,age
0,2,32
1,4,26
2,7,20
3,1,40
4,3,17
5,5,23


In [10]:
# Renaming columns 1

# First approach: the rename method. It takes an {old_name: new_name} mapping and
# changes only the columns listed in it, one by one (here "id" and "name").
# rename returns a new dataframe, so the result is assigned back to df rather
# than passing inplace = True.
df = df.rename(columns = {"id" : "col_1", "name" : "col_2"})

print(df)

   age   col_2  col_1         dob
0   32     Rob      2    25-09-76
1   26  Cheesy      4    12-02-79
2   20   Indie      7    28-11-18
3   40     Sam      1    02-03-11
4   17   Chris      3  09-12-2006
5   23     Ben      5  18-09-2002


In [11]:
# Renaming columns 2

# Second approach: assign a complete list of names to df.columns, which relabels
# every column at once (names are matched to the columns by position).
# The names used here are the ones the dataframe was originally created with.
original_names = ["age", "name", "id", "dob"]
df.columns = original_names

print(df)

   age    name  id         dob
0   32     Rob   2    25-09-76
1   26  Cheesy   4    12-02-79
2   20   Indie   7    28-11-18
3   40     Sam   1    02-03-11
4   17   Chris   3  09-12-2006
5   23     Ben   5  18-09-2002


In [12]:
# Splitting a string column

# The dob strings are cut at every '-' delimiter; expand = True spreads the
# pieces over separate columns, which are then stored as day, month and year.

dob_parts = df["dob"].str.split("-", expand = True)
df[["day", "month", "year"]] = dob_parts

df

Unnamed: 0,age,name,id,dob,day,month,year
0,32,Rob,2,25-09-76,25,9,76
1,26,Cheesy,4,12-02-79,12,2,79
2,20,Indie,7,28-11-18,28,11,18
3,40,Sam,1,02-03-11,2,3,11
4,17,Chris,3,09-12-2006,9,12,2006
5,23,Ben,5,18-09-2002,18,9,2002


In [13]:
# Concatenate string columns

# Builds a month-day-year version of dob (the US ordering) by joining the
# three string columns with '-' between them.

df["mod_usdate"] = df["month"].str.cat([df["day"], df["year"]], sep = '-')

df

Unnamed: 0,age,name,id,dob,day,month,year,mod_usdate
0,32,Rob,2,25-09-76,25,9,76,09-25-76
1,26,Cheesy,4,12-02-79,12,2,79,02-12-79
2,20,Indie,7,28-11-18,28,11,18,11-28-18
3,40,Sam,1,02-03-11,2,3,11,03-02-11
4,17,Chris,3,09-12-2006,9,12,2006,12-09-2006
5,23,Ben,5,18-09-2002,18,9,2002,09-18-2002


In [14]:
# Filtering using multiple criteria

# Each criterion is a boolean mask over the rows; & keeps the rows where both hold.
is_under_25 = df["age"] < 25
has_id_from_3 = df["id"] >= 3

df_new3 = df[is_under_25 & has_id_from_3]

df_new3

Unnamed: 0,age,name,id,dob,day,month,year,mod_usdate
2,20,Indie,7,28-11-18,28,11,18,11-28-18
4,17,Chris,3,09-12-2006,9,12,2006,12-09-2006
5,23,Ben,5,18-09-2002,18,9,2002,09-18-2002


In [15]:
# One hot encoding

# One hot encoding represents categorical data with zeros and ones: each
# category gets its own indicator column holding 1 on the rows that belong to
# it and 0 everywhere else (like a sparse matrix).
# The "month" column holds the zero-padded strings split out of dob, so the
# indicator columns are named mo_<month>, e.g. mo_03 or mo_09.
# drop_first = True leaves out the indicator of the first category (here "02"),
# so a row from that month is all zeros across the remaining mo_ columns.
# dtype = int stores the indicators as 0/1; pandas 2.0 and later would otherwise
# default to True/False.

df_new4 = pd.get_dummies(
    df,
    prefix = "mo",
    prefix_sep = '_',
    columns = ["month"],
    drop_first = True,
    dtype = int,
)

df_new4

Unnamed: 0,age,name,id,dob,day,year,mod_usdate,mo_03,mo_09,mo_11,mo_12
0,32,Rob,2,25-09-76,25,76,09-25-76,0,1,0,0
1,26,Cheesy,4,12-02-79,12,79,02-12-79,0,0,0,0
2,20,Indie,7,28-11-18,28,18,11-28-18,0,0,1,0
3,40,Sam,1,02-03-11,2,11,03-02-11,1,0,0,0
4,17,Chris,3,09-12-2006,9,2006,12-09-2006,0,0,0,1
5,23,Ben,5,18-09-2002,18,2002,09-18-2002,0,1,0,0


In [17]:
# Convert a continuous variable to a categorical variable

# pd.cut sorts each age into one of the intervals delimited by the bin edges,
# so n categories need n + 1 edges.
# The intervals are closed on the right by default: (0, 10] is "child",
# (10, 20] is "teen", (20, 30] is "20's" and (30, 60] is "30's+". An age of
# exactly 20 is therefore labelled "teen" and an age of exactly 30 "20's";
# pass right = False to close the intervals on the left instead.
# (The labels are nonsense, just an example to illustrate.)

age_bins = [0, 10, 20, 30, 60]
age_labels = ["child", "teen", "20's", "30's+"]
df["cat_age"] = pd.cut(df["age"], bins = age_bins, labels = age_labels)

df

Unnamed: 0,age,name,id,dob,day,month,year,mod_usdate,cat_age
0,32,Rob,2,25-09-76,25,9,76,09-25-76,30's+
1,26,Cheesy,4,12-02-79,12,2,79,02-12-79,20's
2,20,Indie,7,28-11-18,28,11,18,11-28-18,teen
3,40,Sam,1,02-03-11,2,3,11,03-02-11,30's+
4,17,Chris,3,09-12-2006,9,12,2006,12-09-2006,teen
5,23,Ben,5,18-09-2002,18,9,2002,09-18-2002,20's


In [18]:
# Using the apply() method

# apply calls the given function once for every value in the column; here it
# makes each name uppercase.
# A lambda is Python's syntax for a small anonymous, single-expression function.

df["name"] = df["name"].apply(lambda person: person.upper())

df

Unnamed: 0,age,name,id,dob,day,month,year,mod_usdate,cat_age
0,32,ROB,2,25-09-76,25,9,76,09-25-76,30's+
1,26,CHEESY,4,12-02-79,12,2,79,02-12-79,20's
2,20,INDIE,7,28-11-18,28,11,18,11-28-18,teen
3,40,SAM,1,02-03-11,2,3,11,03-02-11,30's+
4,17,CHRIS,3,09-12-2006,9,12,2006,12-09-2006,teen
5,23,BEN,5,18-09-2002,18,9,2002,09-18-2002,20's


In [19]:
# Using the apply() function 2nd example

# apply is used here to derive a flag from the participant id:
# 0 when the id is even, 1 when it is odd.

df["even_odd"] = df["id"].apply(lambda participant_id: 1 if participant_id % 2 else 0)

df

Unnamed: 0,age,name,id,dob,day,month,year,mod_usdate,cat_age,even_odd
0,32,ROB,2,25-09-76,25,9,76,09-25-76,30's+,0
1,26,CHEESY,4,12-02-79,12,2,79,02-12-79,20's,0
2,20,INDIE,7,28-11-18,28,11,18,11-28-18,teen,1
3,40,SAM,1,02-03-11,2,3,11,03-02-11,30's+,1
4,17,CHRIS,3,09-12-2006,9,12,2006,12-09-2006,teen,1
5,23,BEN,5,18-09-2002,18,9,2002,09-18-2002,20's,1
