## Data Clean and Prepocessing

In [51]:
#Importing the useful libraries 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler

In [52]:
#Loading the dataset 
df = pd.read_csv(r"C:\Users\User\Downloads\CPSData.csv\CPSData.csv")
df.info

<bound method DataFrame.info of         PeopleInHousehold Region    State  MetroAreaCode  Age        Married  \
0                       1  South  Alabama        26620.0   85        Widowed   
1                       3  South  Alabama        13820.0   21  Never Married   
2                       3  South  Alabama        13820.0   37  Never Married   
3                       3  South  Alabama        13820.0   18  Never Married   
4                       3  South  Alabama        26620.0   52        Widowed   
...                   ...    ...      ...            ...  ...            ...   
131297                  5   West  Wyoming            NaN   17  Never Married   
131298                  5   West  Wyoming            NaN   37       Divorced   
131299                  3   West  Wyoming            NaN   58        Married   
131300                  3   West  Wyoming            NaN   53        Married   
131301                  3   West  Wyoming            NaN   14            NaN   

       

In [53]:
#Printing the first five rows of the dataset
df.head()

Unnamed: 0,PeopleInHousehold,Region,State,MetroAreaCode,Age,Married,Sex,Education,Race,Hispanic,CountryOfBirthCode,Citizenship,EmploymentStatus,Industry
0,1,South,Alabama,26620.0,85,Widowed,Female,Associate degree,White,0,57,"Citizen, Native",Retired,
1,3,South,Alabama,13820.0,21,Never Married,Male,High school,Black,0,57,"Citizen, Native",Unemployed,Professional and business services
2,3,South,Alabama,13820.0,37,Never Married,Female,High school,Black,0,57,"Citizen, Native",Disabled,
3,3,South,Alabama,13820.0,18,Never Married,Male,No high school diploma,Black,0,57,"Citizen, Native",Not in Labor Force,
4,3,South,Alabama,26620.0,52,Widowed,Female,Associate degree,White,0,57,"Citizen, Native",Employed,Professional and business services


In [54]:
# View the columns of the DataFrame
print(df.columns)
print("The dataset size : ",df.size)

Index(['PeopleInHousehold', 'Region', 'State', 'MetroAreaCode', 'Age',
       'Married', 'Sex', 'Education', 'Race', 'Hispanic', 'CountryOfBirthCode',
       'Citizenship', 'EmploymentStatus', 'Industry'],
      dtype='object')
The dataset size :  1838228


In [55]:
# Print the count of missing values (null values) in each column of the DataFrame
print(df.isnull().sum())

PeopleInHousehold         0
Region                    0
State                     0
MetroAreaCode         34238
Age                       0
Married               25338
Sex                       0
Education             25338
Race                      0
Hispanic                  0
CountryOfBirthCode        0
Citizenship               0
EmploymentStatus      25789
Industry              65060
dtype: int64


In [56]:
# Handle missing values without using inplace
df['MetroAreaCode'] = df['MetroAreaCode'].fillna(df['MetroAreaCode'].mode()[0])

# For other categorical columns, fill missing values with 'Unknown'
df['Married'] = df['Married'].fillna('Unknown')
df['Education'] = df['Education'].fillna('Unknown')
df['EmploymentStatus'] = df['EmploymentStatus'].fillna('Unknown')
df['Industry'] = df['Industry'].fillna('Unknown')

In [57]:
print(df.isnull().sum())

PeopleInHousehold     0
Region                0
State                 0
MetroAreaCode         0
Age                   0
Married               0
Sex                   0
Education             0
Race                  0
Hispanic              0
CountryOfBirthCode    0
Citizenship           0
EmploymentStatus      0
Industry              0
dtype: int64


In [58]:
# Select multiple rows by position
print(df.loc[1:5])# rows 1 to 5

   PeopleInHousehold Region    State  MetroAreaCode  Age        Married  \
1                  3  South  Alabama        13820.0   21  Never Married   
2                  3  South  Alabama        13820.0   37  Never Married   
3                  3  South  Alabama        13820.0   18  Never Married   
4                  3  South  Alabama        26620.0   52        Widowed   
5                  3  South  Alabama        26620.0   24  Never Married   

      Sex               Education   Race  Hispanic  CountryOfBirthCode  \
1    Male             High school  Black         0                  57   
2  Female             High school  Black         0                  57   
3    Male  No high school diploma  Black         0                  57   
4  Female        Associate degree  White         0                  57   
5    Male       Bachelor's degree  White         0                  57   

       Citizenship    EmploymentStatus                            Industry  
1  Citizen, Native         

In [59]:
# 2. Feature engineering:
# Create a new feature for Age groups
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 18, 35, 50, 65, 100], labels=['Youth', 'Young Adult', 'Adult', 'Senior', 'Elderly'])

# Create a feature for household size categories
df['HouseholdSizeCategory'] = pd.cut(df['PeopleInHousehold'], bins=[0, 2, 4, 6, 10], labels=['Small', 'Medium', 'Large', 'Very Large'])

In [60]:
# 3. Encoding categorical variables:
# Convert categorical variables into numerical format using one-hot encoding
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 4. Scaling  numerical features:
scaler = StandardScaler()

# Identifying numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# The scaling of  numerical columns
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

#Displaying the cleaned and processed dataset
df_cleaned = df_encoded.head()
df_cleaned

# Saving the processed dataset to a CSV file 
df_encoded.to_csv('cleaned_CPSData.csv', index=False)