<a href="https://colab.research.google.com/github/Rodrigofch7/Data-Engineering/blob/main/US_CENSUS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# PACKAGES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# DATA: https://www.kaggle.com/datasets/muonneutrino/us-census-demographic-data

# Introduction:
# In this project, we aim to predict incomes based on US census data from two different years: 2015 and 2017.
# We will use machine learning techniques to train a model on the 2015 data and evaluate its performance on
# the 2017 data.

In [None]:
# Read the 2015 county-level ACS table and show its column names alphabetically.
df = pd.read_csv('/content/acs2015_county_data.csv')
sorted(df.columns.tolist())

['Asian',
 'Black',
 'Carpool',
 'CensusId',
 'ChildPoverty',
 'Citizen',
 'Construction',
 'County',
 'Drive',
 'Employed',
 'FamilyWork',
 'Hispanic',
 'Income',
 'IncomeErr',
 'IncomePerCap',
 'IncomePerCapErr',
 'MeanCommute',
 'Men',
 'Native',
 'Office',
 'OtherTransp',
 'Pacific',
 'Poverty',
 'PrivateWork',
 'Production',
 'Professional',
 'PublicWork',
 'SelfEmployed',
 'Service',
 'State',
 'TotalPop',
 'Transit',
 'Unemployment',
 'Walk',
 'White',
 'Women',
 'WorkAtHome']

In [None]:
# Print the dtype of every 2015 column to see which fields are numeric and which are text.
print(df.dtypes)


CensusId             int64
State               object
County              object
TotalPop             int64
Men                  int64
Women                int64
Hispanic           float64
White              float64
Black              float64
Native             float64
Asian              float64
Pacific            float64
Citizen              int64
Income             float64
IncomeErr          float64
IncomePerCap         int64
IncomePerCapErr      int64
Poverty            float64
ChildPoverty       float64
Professional       float64
Service            float64
Office             float64
Construction       float64
Production         float64
Drive              float64
Carpool            float64
Transit            float64
Walk               float64
OtherTransp        float64
WorkAtHome         float64
MeanCommute        float64
Employed             int64
PrivateWork        float64
PublicWork         float64
SelfEmployed       float64
FamilyWork         float64
Unemployment       float64
d

In [None]:
# Preview a subset of the 2015 columns, including the income fields.
df.loc[:, [
    'ChildPoverty', 'Citizen', 'Construction', 'County',
    'Drive', 'Employed', 'FamilyWork', 'Hispanic',
    'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr',
]]

Unnamed: 0,ChildPoverty,Citizen,Construction,County,Drive,Employed,FamilyWork,Hispanic,Income,IncomeErr,IncomePerCap,IncomePerCapErr
0,18.6,40725,8.6,Autauga,87.5,23986,0.0,2.6,51281.0,2391.0,24974,1080
1,19.2,147695,10.8,Baldwin,84.7,85953,0.4,4.5,50254.0,1263.0,27317,711
2,45.3,20714,10.8,Barbour,83.8,8597,0.1,4.6,32964.0,2973.0,16824,798
3,27.9,17495,19.0,Bibb,83.2,8294,0.4,2.2,38678.0,3995.0,18431,1618
4,27.2,42345,13.5,Blount,84.9,22189,0.4,8.6,45813.0,3141.0,20532,708
...,...,...,...,...,...,...,...,...,...,...,...,...
3215,56.1,43656,9.2,Vega Baja,89.1,13660,0.0,96.4,16948.0,1234.0,9102,538
3216,58.1,7085,15.7,Vieques,69.1,2860,0.3,96.7,18104.0,3771.0,8821,939
3217,68.3,18458,13.9,Villalba,82.0,6795,0.2,99.7,17818.0,1255.0,8420,486
3218,62.1,27924,12.8,Yabucoa,86.0,8083,0.0,99.8,15627.0,1836.0,7960,512


In [None]:
# Read the 2017 county-level ACS table and show its column names alphabetically.
df2 = pd.read_csv('/content/acs2017_county_data.csv')
sorted(df2.columns.tolist())

['Asian',
 'Black',
 'Carpool',
 'ChildPoverty',
 'Construction',
 'County',
 'CountyId',
 'Drive',
 'Employed',
 'FamilyWork',
 'Hispanic',
 'Income',
 'IncomeErr',
 'IncomePerCap',
 'IncomePerCapErr',
 'MeanCommute',
 'Men',
 'Native',
 'Office',
 'OtherTransp',
 'Pacific',
 'Poverty',
 'PrivateWork',
 'Production',
 'Professional',
 'PublicWork',
 'SelfEmployed',
 'Service',
 'State',
 'TotalPop',
 'Transit',
 'Unemployment',
 'VotingAgeCitizen',
 'Walk',
 'White',
 'Women',
 'WorkAtHome']

In [None]:
# Print the dtype of every 2017 column so it can be compared with the 2015 schema.
print(df2.dtypes)

CountyId              int64
State                object
County               object
TotalPop              int64
Men                   int64
Women                 int64
Hispanic            float64
White               float64
Black               float64
Native              float64
Asian               float64
Pacific             float64
VotingAgeCitizen      int64
Income                int64
IncomeErr             int64
IncomePerCap          int64
IncomePerCapErr       int64
Poverty             float64
ChildPoverty        float64
Professional        float64
Service             float64
Office              float64
Construction        float64
Production          float64
Drive               float64
Carpool             float64
Transit             float64
Walk                float64
OtherTransp         float64
WorkAtHome          float64
MeanCommute         float64
Employed              int64
PrivateWork         float64
PublicWork          float64
SelfEmployed        float64
FamilyWork          

In [None]:
# Prepare the 2015 frame for modelling: drop unused/leaky columns, encode
# 'State', remove incomplete rows, then split into features and target.

# Columns excluded from the model: the county identifier and name, and the
# income-derived fields that would leak the target. 'Citizen' has no column of
# the same name in the 2017 file, which is presumably why it is dropped too.
columns_to_drop = ['CensusId',
                   'County',
                   'Citizen',
                   'IncomeErr',
                   'IncomePerCap',
                   'IncomePerCapErr']

df = df.drop(columns=columns_to_drop)

# Encode the 'State' column as integer codes.
# NOTE(review): pd.factorize numbers values in order of first appearance, so
# the 2015 and 2017 frames only get matching codes if both files list the
# states in the same order - confirm before relying on this feature.
df['State'] = pd.factorize(df['State'])[0]

# Remove rows with missing values BEFORE splitting, so that X and y never
# contain NaNs and always have the same rows as df.
df = df.dropna()

# Define the target variable you want to predict
target_variable = 'Income'

# Separate the features (X) and the target variable (y)
X = df.drop(columns=[target_variable])
y = df[target_variable]

In [None]:
# Prepare the 2017 frame the same way as the 2015 frame: drop unused/leaky
# columns, encode 'State', remove incomplete rows, then split features/target.

# Columns excluded from the model: the county identifier and name, the
# income-derived fields that would leak the target, and 'VotingAgeCitizen',
# which has no column of the same name in the 2015 file.
columns_to_drop = ['CountyId',
                   'County',
                   'VotingAgeCitizen',
                   'IncomeErr',
                   'IncomePerCap',
                   'IncomePerCapErr']
df2 = df2.drop(columns=columns_to_drop)

# Encode the 'State' column as integer codes.
# NOTE(review): pd.factorize numbers values in order of first appearance, so
# these codes only line up with the 2015 codes if both files list the states
# in the same order - confirm before relying on this feature.
df2['State'] = pd.factorize(df2['State'])[0]

# Remove rows with missing values BEFORE splitting, so that X and y never
# contain NaNs and always have the same rows as df2.
df2 = df2.dropna()

# Define the target variable you want to predict
target_variable = 'Income'

# Separate the features (X) and the target variable (y)
X = df2.drop(columns=[target_variable])
y = df2[target_variable]

In [None]:
# Train three regressors on the 2015 data and score them on the 2017 data.

# Drop the columns that do not exist in both years.
# errors='ignore' makes this a no-op for any column that is already gone.
columns_to_drop = ['CountyId', 'VotingAgeCitizen', 'Citizen']
df2 = df2.drop(columns=columns_to_drop, errors='ignore')

# Separate the features (X) and the target variable (y) for both datasets
X_train = df.drop(columns=[target_variable])
y_train = df[target_variable]

# Select the test features by the training column list: this guarantees the
# same columns in the same order as the models were fitted on, and raises a
# KeyError right here (not deep inside predict) if the 2017 frame lacks one.
X_test = df2[list(X_train.columns)]
y_test = df2[target_variable]

# Initialize the models (fixed seeds keep the tree ensembles reproducible)
random_forest_model = RandomForestRegressor(random_state=42)
linear_regression_model = LinearRegression()
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

# Train the models on the 2015 data
random_forest_model.fit(X_train, y_train)
linear_regression_model.fit(X_train, y_train)
gradient_boosting_model.fit(X_train, y_train)

# Make predictions on the 2017 data
rf_predictions = random_forest_model.predict(X_test)
lr_predictions = linear_regression_model.predict(X_test)
gb_predictions = gradient_boosting_model.predict(X_test)

# Evaluate the models using Mean Squared Error (MSE) and R-squared (R2) metrics on the 2017 data
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

lr_mse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

gb_mse = mean_squared_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)

# Display the results (headings carry their own leading blank line)
results = [
    ("Random Forest Regressor:", rf_mse, rf_r2),
    ("\nLinear Regression:", lr_mse, lr_r2),
    ("\nGradient Boosting Regressor:", gb_mse, gb_r2),
]
for heading, mse, r2 in results:
    print(heading)
    print(f"MSE: {mse}")
    print(f"R-squared: {r2}")


Random Forest Regressor:
MSE: 20905636.109176483
R-squared: 0.8914136905109725

Linear Regression:
MSE: 35402353.31978266
R-squared: 0.8161160524297842

Gradient Boosting Regressor:
MSE: 21853767.570898447
R-squared: 0.8864889852400512


In [None]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Cross-validate three regressors on the 2015 data, then refit on all of it
# and score on the 2017 data.
# Assumes X_train, y_train, X_test, y_test were defined by an earlier cell.

# Initialize the models (fixed seeds keep the tree ensembles reproducible)
random_forest_model = RandomForestRegressor(random_state=42)
linear_regression_model = LinearRegression()
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

# List of models
models = [random_forest_model, linear_regression_model, gradient_boosting_model]

# Cross-validation using 5 folds
cv = 5

# Perform cross-validation and compute MSE and R2.
# A single cross_validate call scores every metric on the same fitted folds,
# instead of refitting each model once per metric.
for model in models:
    model_name = model.__class__.__name__
    cv_results = cross_validate(
        model, X_train, y_train, cv=cv,
        scoring=('neg_mean_squared_error', 'r2'),
    )
    # scikit-learn reports errors negated ("higher is better"); flip the sign back.
    mse_scores = -cv_results['test_neg_mean_squared_error']
    r2_scores = cv_results['test_r2']

    print(f"{model_name} Cross-Validation Results:")
    print(f"Mean MSE: {np.mean(mse_scores)}")
    print(f"Mean R-squared: {np.mean(r2_scores)}")
    print("\n")

# Fit the models on the entire training data
# (cross_validate works on clones, so the instances above are still unfitted)
random_forest_model.fit(X_train, y_train)
linear_regression_model.fit(X_train, y_train)
gradient_boosting_model.fit(X_train, y_train)

# Make predictions on the test data
rf_predictions = random_forest_model.predict(X_test)
lr_predictions = linear_regression_model.predict(X_test)
gb_predictions = gradient_boosting_model.predict(X_test)

# Evaluate the models on the test data
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

lr_mse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

gb_mse = mean_squared_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)

# Display the test set results
print("Test Set Results:")
print("Random Forest Regressor:")
print(f"MSE: {rf_mse}")
print(f"R-squared: {rf_r2}")

print("\nLinear Regression:")
print(f"MSE: {lr_mse}")
print(f"R-squared: {lr_r2}")

print("\nGradient Boosting Regressor:")
print(f"MSE: {gb_mse}")
print(f"R-squared: {gb_r2}")


RandomForestRegressor Cross-Validation Results:
Mean MSE: 24487188.924143057
Mean R-squared: 0.8454009412230146


LinearRegression Cross-Validation Results:
Mean MSE: 31732657.64549147
Mean R-squared: 0.8019356602446603


GradientBoostingRegressor Cross-Validation Results:
Mean MSE: 23834843.821994103
Mean R-squared: 0.8519114013089842


Test Set Results:
Random Forest Regressor:
MSE: 20905636.109176483
R-squared: 0.8914136905109725

Linear Regression:
MSE: 35402353.31978266
R-squared: 0.8161160524297842

Gradient Boosting Regressor:
MSE: 21853767.570898447
R-squared: 0.8864889852400512


In [None]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Cross-validate three regressors on the 2015 data (MSE, MAE and R2), then
# refit on all of it and score on the 2017 data.
# Assumes X_train, y_train, X_test, y_test were defined by an earlier cell.

# Initialize the models (fixed seeds keep the tree ensembles reproducible)
random_forest_model = RandomForestRegressor(random_state=42)
linear_regression_model = LinearRegression()
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

# List of models
models = [random_forest_model, linear_regression_model, gradient_boosting_model]

# Cross-validation using 5 folds
cv = 5

# Perform cross-validation and compute MSE, MAE, and R2.
# A single cross_validate call scores every metric on the same fitted folds,
# instead of refitting each model once per metric.
for model in models:
    model_name = model.__class__.__name__
    cv_results = cross_validate(
        model, X_train, y_train, cv=cv,
        scoring=('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'),
    )
    # scikit-learn reports errors negated ("higher is better"); flip the sign
    # back for both error metrics so they are handled the same way.
    mse_scores = -cv_results['test_neg_mean_squared_error']
    mae_scores = -cv_results['test_neg_mean_absolute_error']
    r2_scores = cv_results['test_r2']

    print(f"{model_name} Cross-Validation Results:")
    print(f"Mean MSE: {np.mean(mse_scores)}")
    print(f"Mean MAE: {np.mean(mae_scores)}")
    print(f"Mean R-squared: {np.mean(r2_scores)}")
    print("\n")

# Fit the models on the entire training data
# (cross_validate works on clones, so the instances above are still unfitted)
random_forest_model.fit(X_train, y_train)
linear_regression_model.fit(X_train, y_train)
gradient_boosting_model.fit(X_train, y_train)

# Make predictions on the test data
rf_predictions = random_forest_model.predict(X_test)
lr_predictions = linear_regression_model.predict(X_test)
gb_predictions = gradient_boosting_model.predict(X_test)

# Evaluate the models on the test data
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

lr_mse = mean_squared_error(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

gb_mse = mean_squared_error(y_test, gb_predictions)
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)

# Display the test set results
print("Test Set Results:")
print("Random Forest Regressor:")
print(f"MSE: {rf_mse}")
print(f"MAE: {rf_mae}")
print(f"R-squared: {rf_r2}")

print("\nLinear Regression:")
print(f"MSE: {lr_mse}")
print(f"MAE: {lr_mae}")
print(f"R-squared: {lr_r2}")

print("\nGradient Boosting Regressor:")
print(f"MSE: {gb_mse}")
print(f"MAE: {gb_mae}")
print(f"R-squared: {gb_r2}")


RandomForestRegressor Cross-Validation Results:
Mean MSE: 24487188.924143057
Mean MAE: 3530.9998569206846
Mean R-squared: 0.8454009412230146


LinearRegression Cross-Validation Results:
Mean MSE: 31732657.64549147
Mean MAE: 3932.120537704065
Mean R-squared: 0.8019356602446603


GradientBoostingRegressor Cross-Validation Results:
Mean MSE: 23834843.821994103
Mean MAE: 3520.8177280862765
Mean R-squared: 0.8519114013089842


Test Set Results:
Random Forest Regressor:
MSE: 20905636.109176483
MAE: 3219.8473407890647
R-squared: 0.8914136905109725

Linear Regression:
MSE: 35402353.31978266
MAE: 4154.953408209347
R-squared: 0.8161160524297842

Gradient Boosting Regressor:
MSE: 21853767.570898447
MAE: 3345.986172933186
R-squared: 0.8864889852400512
