# Sprint Project: Data Preprocessing
Prepared by `Kuhgi Jotojot`

## Prerequisites

In [3]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ------- -------------------------------- 2.1/11.1 MB 11.7 MB/s eta 0:00:01
   ------------- -------------------------- 3.7/11.1 MB 10.9 MB/s eta 0:00:01
   --------------------- ------------------ 6.0/11.1 MB 9.6 MB/s eta 0:00:01
   ------------------------------ --------- 8.4/11.1 MB 10.1 MB/s eta 0:00:01
   ------------------------------

In [2]:
# general libraries
import time
import warnings
import numpy as np
import pandas as pd

In [2]:
# OneHotEncoder for categorical data
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Mount Google Drive at /content/drive so that files stored there can be
# opened by path from this notebook.
# NOTE(review): `google.colab` is normally only importable inside Colab —
# confirm this cell is skipped or adapted when running locally.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the "Family Income and Expenditure" CSV from the mounted Drive folder
# and preview the first rows.
csv_path = '/content/drive/MyDrive/ESKWELABS DSF/Sprint 3 second to the last/Sprint Data/Family Income and Expenditure.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Total Household Income,Region,Total Food Expenditure,Main Source of Income,Agricultural Household indicator,Bread and Cereals Expenditure,Total Rice Expenditure,Meat Expenditure,Total Fish and marine products Expenditure,Fruit Expenditure,...,Number of Refrigerator/Freezer,Number of Washing Machine,Number of Airconditioner,"Number of Car, Jeep, Van",Number of Landline/wireless telephones,Number of Cellular phone,Number of Personal Computer,Number of Stove with Oven/Gas Range,Number of Motorized Banca,Number of Motorcycle/Tricycle
0,11285,VII - Central Visayas,14709,Other sources of Income,1,4690,91,248,1759,236,...,0,0,0,0,0,0,0,0,0,0
1,11988,CAR,6781,Other sources of Income,0,4175,4175,930,360,581,...,0,0,0,0,0,0,0,0,0,0
2,12039,V - Bicol Region,9465,Other sources of Income,0,3176,3041,150,1410,336,...,0,0,0,0,0,0,0,0,0,0
3,12141,II - Cagayan Valley,6500,Other sources of Income,0,1851,1140,1310,1318,330,...,0,0,0,0,0,0,0,0,0,0
4,12911,VII - Central Visayas,15909,Enterpreneurial Activities,1,6716,96,1030,3316,285,...,0,0,0,0,0,0,0,0,0,0


## Data Cleaning

### 1. Dropping Duplicates

In [5]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence of each (the pandas default, spelled out here).
df = df.drop_duplicates(keep="first")

### 2. Structural Fix (Renaming Column Names)

In [6]:
# Replace every space in the column names with an underscore in a single
# pass. A callable mapper is applied to each column label, so no per-column
# loop or in-place mutation is needed.
# Note: consecutive spaces in a header become consecutive underscores.
df = df.rename(columns=lambda name: name.replace(' ', '_'))

### 3. Structural Fix (Substituting Null Values)

In [7]:
# Replace missing values in 'Household_Head_Occupation' with an explicit
# label; all other columns are left as they are.
value = "No Occupation"
df = df.fillna({'Household_Head_Occupation': value})

In [8]:
# Replace missing values in 'Household_Head_Class_of_Worker' with an explicit
# label; all other columns are left as they are.
value = "Not Employed"
df = df.fillna({'Household_Head_Class_of_Worker': value})

In [9]:
# Replace missing values in 'Toilet_Facilities' with an explicit label;
# all other columns are left as they are.
value = "No Toilet Facilities"
df = df.fillna({'Toilet_Facilities': value})

## Feature Engineering

In [12]:
# Preview the first five rows of the cleaned frame before engineering features.
df.head(n=5)

Unnamed: 0,Total_Household_Income,Region,Total_Food_Expenditure,Main_Source_of_Income,Agricultural_Household_indicator,Bread_and_Cereals_Expenditure,Total_Rice_Expenditure,Meat_Expenditure,Total_Fish_and__marine_products_Expenditure,Fruit_Expenditure,...,Number_of_Refrigerator/Freezer,Number_of_Washing_Machine,Number_of_Airconditioner,"Number_of_Car,_Jeep,_Van",Number_of_Landline/wireless_telephones,Number_of_Cellular_phone,Number_of_Personal_Computer,Number_of_Stove_with_Oven/Gas_Range,Number_of_Motorized_Banca,Number_of_Motorcycle/Tricycle
0,11285,VII - Central Visayas,14709,Other sources of Income,1,4690,91,248,1759,236,...,0,0,0,0,0,0,0,0,0,0
1,11988,CAR,6781,Other sources of Income,0,4175,4175,930,360,581,...,0,0,0,0,0,0,0,0,0,0
2,12039,V - Bicol Region,9465,Other sources of Income,0,3176,3041,150,1410,336,...,0,0,0,0,0,0,0,0,0,0
3,12141,II - Cagayan Valley,6500,Other sources of Income,0,1851,1140,1310,1318,330,...,0,0,0,0,0,0,0,0,0,0
4,12911,VII - Central Visayas,15909,Enterpreneurial Activities,1,6716,96,1030,3316,285,...,0,0,0,0,0,0,0,0,0,0


### 1. Dropping Collinear Features

In [21]:
# One-hot encode the 'Region' column: every distinct region becomes its own
# boolean `Region_<name>` indicator column and the original 'Region' column
# is removed. `df` is not modified; the encoded copy is stored in `encoded_df`.
# NOTE(review): all categories are kept (no `drop_first=True`), so the
# indicator columns are linearly dependent — confirm whether one level should
# be dropped before fitting a linear model.
encoded_df = pd.get_dummies(df, columns=['Region'])

encoded_df.head()

Unnamed: 0,Total_Household_Income,Total_Food_Expenditure,Main_Source_of_Income,Agricultural_Household_indicator,Bread_and_Cereals_Expenditure,Total_Rice_Expenditure,Meat_Expenditure,Total_Fish_and__marine_products_Expenditure,Fruit_Expenditure,Vegetables_Expenditure,...,Region_IVB - MIMAROPA,Region_IX - Zasmboanga Peninsula,Region_NCR,Region_V - Bicol Region,Region_VI - Western Visayas,Region_VII - Central Visayas,Region_VIII - Eastern Visayas,Region_X - Northern Mindanao,Region_XI - Davao Region,Region_XII - SOCCSKSARGEN
0,11285,14709,Other sources of Income,1,4690,91,248,1759,236,1335,...,False,False,False,False,False,True,False,False,False,False
1,11988,6781,Other sources of Income,0,4175,4175,930,360,581,525,...,False,False,False,False,False,False,False,False,False,False
2,12039,9465,Other sources of Income,0,3176,3041,150,1410,336,241,...,False,False,False,True,False,False,False,False,False,False
3,12141,6500,Other sources of Income,0,1851,1140,1310,1318,330,560,...,False,False,False,False,False,False,False,False,False,False
4,12911,15909,Enterpreneurial Activities,1,6716,96,1030,3316,285,1831,...,False,False,False,False,False,True,False,False,False,False


## Trash Code Lines Below (consider deleting)

In [None]:
# Categorical columns of the dataset, listed by their raw CSV header names
# (written with spaces).
# NOTE(review): the cleaning section replaces spaces with underscores in
# `df.columns`, so these names will not match `df` after that step.
categorical_variables_list = [
    'Region',
    'Main Source of Income',
    'Agricultural Household indicator',
    'Household Head Sex',
    'Household Head Marital Status',
    'Household Head Highest Grade Completed',
    'Household Head Occupation',
    'Household Head Class of Worker',
    'Type of Household',
    'Type of Building/House',
    'Type of Roof',
    'Type of Walls',
    'Tenure Status',
    'Toilet Facilities',
    'Main Source of Water Supply',
]

# Open questions:
# - One-hot encoding some of these variables might lead to the "curse of
#   dimensionality", because some of them have a lot of categories.
# - Should PCA be used on these categorical variables?
# - Incomplete records: by dropping them with dropna(), are we
#   misrepresenting the data? That method dropped roughly 8956 records.

Links:
https://www.datacamp.com/tutorial/one-hot-encoding-python-tutorial
