# Technical Assessment Test for IHiS 

This working file follows the same order as the assessment instruction document.

**Table of Contents**

1. Data Preparation
2. Task 01 - New NRIC
3. Task 02 - Coding - Gender
4. Task 03 - Age Group
5. Task 04 - Study Number
6. Task 05 - Worksheet: Study Data
7. Task 06 - Worksheet: Exception List
8. Task 07 - Pivot Table (Mainly completed in Excel Pivot saved separately)



## Data Preparation

In [None]:
#import python library package
import pandas as pd
import numpy as np


### Import/Read Demographics Worksheet

In [None]:
# Load the "Demographics" worksheet from the source workbook and preview it
data = pd.read_excel(
    "01_source_files/Technical Test.xls",
    sheet_name="Demographics",
)
data.head()

In [None]:
# Report the dataset dimensions as (rows, columns), followed by a blank line
print(data.shape, '\n')

# Summarise every column: name, dtype and non-null count
data.info()



In [None]:
# Count the missing values in each column of the Demographics data
data.isna().sum()

### Import/Read Study Data Worksheet

In [None]:
# Load the "Study Data" worksheet from the source workbook and preview it
study_data = pd.read_excel(
    "01_source_files/Technical Test.xls",
    sheet_name="Study Data",
)
study_data.head()

In [None]:
# Report the Study Data dimensions as (rows, columns), followed by a blank line
print(study_data.shape, '\n')

# Summarise every column: name, dtype and non-null count
study_data.info()

### Import/Read Extra Information Worksheet

In [None]:
# Load the "Extra information" worksheet from the source workbook and preview it
data_extra = pd.read_excel(
    "01_source_files/Technical Test.xls",
    sheet_name="Extra information",
)
data_extra.head()

In [None]:
# Report the Extra Information dimensions as (rows, columns), followed by a blank line
print(data_extra.shape, '\n')

# Summarise every column: name, dtype and non-null count
data_extra.info()

## New NRIC

In [None]:
# Some NRIC values contain stray spaces; strip every space character from them
data["NRIC"] = data["NRIC"].str.replace(" ", "")
data.head()

In [None]:
# Keep the NRIC column in a one-column DataFrame of its own so it can be
# reworked separately and then applied back to the original dataset
nric_list = data["NRIC"].to_frame()
print(nric_list.shape)
nric_list.head()

In [None]:
# Bring every NRIC into the <letter><digits><letter> layout.
# Some source values carry both letters at the front (e.g. "SA9787544");
# those are rearranged so that the second letter becomes the suffix
# ("S9787544A"). Values already in the target layout are kept unchanged.
def _normalise_nric(ic):
    """Return ``ic`` rearranged to <letter><digits><letter> where possible.

    Parameters:
        ic: one cell of the NRIC column; normally a string.

    Returns:
        The rearranged string when ``ic`` looks like
        <non-digit><non-digit><digits>; otherwise ``ic`` unchanged.
    """
    # Missing or non-text cells (e.g. NaN) and values too short to hold two
    # letters plus digits cannot be rearranged, so pass them through as-is
    # instead of failing on the indexing below.
    if not isinstance(ic, str) or len(ic) < 3:
        return ic

    # Already well formed: a letter at BOTH ends. Each end is tested
    # explicitly, because `(ic[0] and ic[-1]).isalpha()` evaluates to
    # `ic[-1].isalpha()` and never looks at the first character.
    if ic[0].isalpha() and ic[-1].isalpha():
        return ic

    prefix, suffix, digits = ic[0], ic[1], ic[2:]
    if not prefix.isdigit() and not suffix.isdigit() and digits.isdigit():
        return prefix + digits + suffix

    # Unrecognised layout: keep the value intact rather than silently
    # dropping the characters that did not fit the expected pattern.
    return ic


new_preprocessed_nric = [_normalise_nric(ic) for ic in nric_list["NRIC"]]

In [None]:
# Wrap the list of revised/corrected NRIC values in a one-column DataFrame
new_nric_list = pd.DataFrame({"NRIC": new_preprocessed_nric})
new_nric_list.head()

In [None]:
# Attach the newly formatted NRICs to the original dataset as "New NRIC".
# NOTE: assigning a Series aligns on index labels, so this relies on `data`
# and `new_nric_list` sharing the same index - TODO confirm `data` still has
# its default 0..n-1 index at this point.
data["New NRIC"] = new_nric_list["NRIC"]
data.head()

## Coding - Gender

In [None]:
#Function to label/code Gender Group

def gender_code(gender_cols):
    """Return the numeric code for a single gender value.

    Parameters:
        gender_cols: one cell of the Gender column.

    Returns:
        1 when the value is "F" (ignoring letter case and surrounding
        whitespace), otherwise 2.
    """
    # Normalise text first so that "f" or " F " are not mis-coded as 2.
    if isinstance(gender_cols, str):
        gender_cols = gender_cols.strip().upper()
    # NOTE(review): every non-"F" value, including a missing one, is coded 2,
    # as before - confirm that is the intended treatment of blanks.
    return 1 if gender_cols == "F" else 2


In [None]:
# Code every Gender value with gender_code and store it in "Coding - Gender"
data["Coding - Gender"] = data["Gender"].apply(gender_code)
data.head()

In [None]:
# Display the distinct gender codes, sorted, as a sanity check on the coding
np.sort(data["Coding - Gender"].unique())

## Age Group

In [None]:
#Function to define and categorise Age Group

def age_group(age_cols):
    """Return the age-group label for a single age.

    Bands are ten years wide: below 10 -> "G1", 10-19 -> "G2", ...,
    80-89 -> "G9"; ages of 90 and above -> "G0".

    Parameters:
        age_cols: one cell of the Age column (a number).

    Returns:
        The label string, or None when the age is missing (NaN).
    """
    # NaN is the only value that differs from itself; a missing age has no
    # group, so return None for it.
    if age_cols != age_cols:
        return None
    if age_cols >= 90:
        return "G0"
    if age_cols < 10:
        return "G1"
    # Half-open decade bands, so fractional ages such as 9.5 or 19.9 land in
    # a group instead of falling between "<= 9" and ">= 10" style bounds.
    return "G" + str(int(age_cols // 10) + 1)


In [None]:
# Label every Age value with age_group and store it in "Age Group"
data["Age Group"] = data["Age"].apply(age_group)
data.head()

In [None]:
# Display the distinct age-group labels, sorted, as a sanity check on the grouping
np.sort(data["Age Group"].unique())

## Study Number

In [None]:
# Build the Study Number as "<Age Group> - <gender code>", per the requirement
data["Study Number"] = (
    data["Age Group"] + " - " + data["Coding - Gender"].astype(str)
)
data.head(10)

## Worksheet: Study Data

In [None]:
# Convert Contact Number to text so that it does not show up as a floating-point number later on.
# NOTE(review): presumably the column is read as integers; if it is already
# float, astype(str) keeps the trailing ".0" and turns blanks into "nan" -
# confirm against data_extra.info().
data_extra["Contact Number"] = data_extra["Contact Number"].astype(str)

### Combination of Worksheets for Study Data

In [None]:
# Left-join the Extra Information worksheet onto Demographics, keyed on NRIC,
# so that every Demographics row is kept
demo_n_extra = pd.merge(data, data_extra, how="left", on="NRIC")
demo_n_extra.head(10)

In [None]:
# Report the merged dataset dimensions as (rows, columns), followed by a blank line
print(demo_n_extra.shape, '\n')

# Summarise every column: name, dtype and non-null count
demo_n_extra.info()


In [None]:
# Spot check: look up one NRIC whose merged columns are NaN, to confirm it
# really has no matching row in the Extra Information worksheet
data_extra.loc[data_extra["NRIC"] == "SA9787544"]

In [None]:
# Count the missing (NaN) values in each column of the merged dataset
demo_n_extra.isna().sum()

In [None]:
# Rename "NRIC" to "Old NRIC" so the column name matches the key used in
# Study Data for the merge that follows
demo_n_extra = demo_n_extra.rename({"NRIC": "Old NRIC"}, axis="columns")
demo_n_extra.head()

### Final Combine to Study Data

In [None]:
# Right-join Study Data with the combined Demographics/Extra Information
# table on "Old NRIC", keeping every row of the combined table
new_data = pd.merge(study_data, demo_n_extra, how="right", on="Old NRIC")

# Discard the "_x" (Study Data side) copies of the overlapping columns
new_data = new_data.drop(
    columns=[
        "Study Number_x",
        "New NRIC_x",
        "Gender_x",
        "Age_x",
        "Marital Status_x",
        "Address 1_x",
        "Address 2_x",
        "Contact Number_x",
        "Ethnic Group_x",
    ]
)
new_data.head()

In [None]:
# Report the dataset dimensions as (rows, columns), followed by a blank line.
# The row count is expected to still be 117 after the merge.
print(new_data.shape, '\n')

# Summarise every column: name, dtype and non-null count
new_data.info()

In [None]:
# Strip the "_y" merge suffix from the columns that were kept
new_data = new_data.rename(
    columns={
        name + "_y": name
        for name in (
            "Gender",
            "Age",
            "Marital Status",
            "New NRIC",
            "Study Number",
            "Address 1",
            "Address 2",
            "Contact Number",
            "Ethnic Group",
        )
    }
)
new_data.head()

In [None]:
# Reorder the columns into the sequence given in the requirement
new_data = new_data.loc[
    :,
    [
        "Study Number",
        "Old NRIC",
        "New NRIC",
        "Gender",
        "Age",
        "Marital Status",
        "Address 1",
        "Address 2",
        "Contact Number",
        "Ethnic Group",
    ],
]
new_data.head()

In [None]:
# Export the final Study Data table to Excel under the 02_exported_worksheet folder.
# NOTE(review): to_excel writes the DataFrame index as the first column by
# default - confirm that column is wanted in the deliverable.
new_data.to_excel("02_exported_worksheet/study_data_final.xlsx")

## Worksheet: Exception List

### List of unique NRIC

In [None]:
# Distinct New NRIC values, numbered from 1 in an "S/N" column
unique_nric = (
    pd.DataFrame({"NRIC": new_data["New NRIC"].unique()})
    .rename(index=lambda position: position + 1)
    .rename_axis("S/N")
    .reset_index()
)

unique_nric.head(15)

In [None]:
# Export-ready table: "S/N" becomes the index, leaving "NRIC" as the only column
unique_nric_final = unique_nric.set_index("S/N")[["NRIC"]]

unique_nric_final.head(15)

In [None]:
# Export the unique NRIC list to Excel under the 02_exported_worksheet folder;
# the "S/N" index is written as the first column
unique_nric_final.to_excel("02_exported_worksheet/unique_num_nric.xlsx")

### Number of NRIC not found in Extra Information

In [None]:
#Number of NRIC under Demographics that are found in Extra Information workshee.

nric_notin = pd.DataFrame(data[~data["NRIC"].isin(data_extra["NRIC"])])
print(nric_notin.shape)

nric_notin.reset_index(drop=True, inplace=True)
nric_notin.index = nric_notin.index + 1

#creating new dataframe so that it can be export to excel based on the given requirement.
nric_notin_final = pd.DataFrame(nric_notin[["NRIC"]].rename_axis("S/N"))
nric_notin_final.head()


In [None]:
# Export the list of NRICs not found in Extra Information to Excel under the
# 02_exported_worksheet folder; the "S/N" index is written as the first column
nric_notin_final.to_excel("02_exported_worksheet/nric_not_found.xlsx")

## Worksheet: Pivot Table

Only the DataFrame needed for the pivot table is prepared here and exported to Excel; the pivot table itself is built in Excel using its PivotTable feature.

In [None]:
# Preview `data` before exporting, to check that the columns needed for the
# pivot table (Age Group, Gender and Marital Status) are present
data.head(15)

In [None]:
# Export the full Demographics dataset (with the derived columns) to Excel
# under the 02_exported_worksheet folder, as the source for the pivot table
data.to_excel("02_exported_worksheet/data.xlsx")
