##  Folktables - A replacement for Adult ACS dataset 
### and an easy way to import ACS data

Code from Folktables Repo

References: 
 - [Retiring Adult](https://openreview.net/forum?id=bYi_2708mKK)
 - [Folktables on Github](https://github.com/socialfoundations/folktables)
 - [Folktables Datasheet](https://github.com/socialfoundations/folktables/blob/main/datasheet.md)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
try:
    import folktables
except ImportError:
    !pip install folktables
    import folktables
    from folktables import ACSDataSource, ACSEmployment


### Arguments for importing data from folktables 
- survey_year: str, optional
  - The year of the survey data to download. Default is '2018'.
- horizon: str, optional
  - The time horizon of the survey data to download. Default is '1-Year'.
- survey: str, optional
  - The type of survey data to download. Default is 'person'.
- states: list, optional
  - A list of state abbreviations to download data for. Default is None.
  - If None, data for all states will be downloaded.
- download: bool, optional
  - Whether to download the data. Default is True.

In [None]:
# Employment-status task for Alabama, from the 2018 1-Year ACS person survey.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)
acs_data = data_source.get_data(states=["AL"], download=True)

# The conversion result is unpacked into the feature, label and group arrays
# used by the cells below.
features, label, group = ACSEmployment.df_to_numpy(acs_data)

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features,
    label,
    group,
    test_size=0.2,
    random_state=0,
)

###### Your favorite learning algorithm here #####
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

yhat = model.predict(X_test)

# True positive rate per group: the mean prediction over the rows of that
# group whose true label is positive. Group codes 1 and 2 are reported as
# White and Black respectively.
white_tpr = yhat[(group_test == 1) & (y_test == 1)].mean()
black_tpr = yhat[(group_test == 2) & (y_test == 1)].mean()

# Equality of opportunity violation (author's recorded value: 0.0871)
diff_tpr = white_tpr - black_tpr
print('The white True Positive Rate is ', white_tpr)
print('The black True Positive Rate is ', black_tpr)
print('The difference in True Positive Rates is ', diff_tpr)

In [None]:
# Raw-count confusion matrix for each group on the held-out split.

from sklearn.metrics import confusion_matrix

# Rows of the test split whose group code is 1 (reported as White).
white_confusion_matrix = confusion_matrix(
    y_test[group_test == 1],
    yhat[group_test == 1],
)
print("Confusion Matrix for the White Group:")
print(white_confusion_matrix)

# Rows of the test split whose group code is 2 (reported as Black).
black_confusion_matrix = confusion_matrix(
    y_test[group_test == 2],
    yhat[group_test == 2],
)
print("Confusion Matrix for the Black Group:")
print(black_confusion_matrix)


In [None]:
def print_confusion_matrix_percentages(y_true, y_pred, group_name, decimals=2):
    """Print a row-normalised confusion matrix for one group, in percent.

    Each row (one true class) is divided by its own total, so the entries of a
    populated row sum to 100. A row with no samples is printed as zeros
    instead of NaN.

    Args:
        y_true: True labels for the group.
        y_pred: Predicted labels for the group, aligned with ``y_true``.
        group_name: Name inserted into the printed heading.
        decimals: Number of decimal places kept in the printed percentages.
    """
    cm = confusion_matrix(y_true, y_pred)
    row_totals = cm.sum(axis=1, keepdims=True)
    # Divide only where the row total is non-zero: a class that occurs in the
    # predictions but never in the true labels has an all-zero row, and 0/0
    # would otherwise produce NaN together with a RuntimeWarning.
    cm_fraction = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )
    cm_rounded = np.round(cm_fraction * 100, decimals)  # Round the percentages
    print(f"Confusion Matrix for the {group_name} Group (Percentages):")
    print(cm_rounded)
    print("Labels: [Negative, Positive]")

In [None]:
# Percentage confusion matrix for group code 1 (White) ...
print_confusion_matrix_percentages(
    y_true=y_test[group_test == 1],
    y_pred=yhat[group_test == 1],
    group_name="White",
)

# ... and for group code 2 (Black).
print_confusion_matrix_percentages(
    y_true=y_test[group_test == 2],
    y_pred=yhat[group_test == 2],
    group_name="Black",
)


In [None]:
# Repeat the employment experiment for Texas using the existing data_source.
acs_tx = data_source.get_data(states=["TX"], download=True)
tx_features, tx_label, tx_group = ACSEmployment.df_to_numpy(acs_tx)

# The Texas arrays keep their own tx_* names; the DataFrame is converted once
# and the features/label/group arrays of the earlier experiment stay intact.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    tx_features, tx_label, tx_group, test_size=0.2, random_state=0)

model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

yhat = model.predict(X_test)
# True positive rate per group: mean prediction over that group's truly
# positive test rows (group code 1 = White, 2 = Black as labelled here).
white_tpr = np.mean(yhat[(y_test == 1) & (group_test == 1)])
black_tpr = np.mean(yhat[(y_test == 1) & (group_test == 2)])

# Equality of opportunity violation: 0.0397
white_tpr - black_tpr

In [None]:
from folktables import ACSDataSource, ACSIncome

# Income task: California and Michigan from the 2018 1-Year person survey.
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')

# California: features and labels (the group array is not needed here).
ca_data = data_source.get_data(states=["CA"], download=True)
ca_features, ca_labels, _ = ACSIncome.df_to_numpy(ca_data)

# Michigan: features and labels (the group array is not needed here).
mi_data = data_source.get_data(states=["MI"], download=True)
mi_features, mi_labels, _ = ACSIncome.df_to_numpy(mi_data)



In [None]:
# Standardize the features and fit a logistic regression on top.
model = make_pipeline(StandardScaler(), LogisticRegression())

# Train on CA data only, so the model never sees arrays left over from the
# employment experiments.
model.fit(ca_features, ca_labels)

# Test on MI data: score of the CA-trained model on the Michigan sample.
model.score(mi_features, mi_labels)

##  Comparing Different Years 
### Train on 2014 and run on succeeding years

In [None]:
from folktables import ACSDataSource, ACSPublicCoverage

# Training data: California, 2014 1-Year person survey.
data_source = ACSDataSource(
    survey_year=2014,
    horizon='1-Year',
    survey='person',
)
acs_data14 = data_source.get_data(states=["CA"], download=True)

# Only the features and labels are kept; the group array is discarded.
features14, labels14, _ = ACSPublicCoverage.df_to_numpy(acs_data14)

In [None]:
# Fit on the 2014 sample
# (swap in any other method for tabular data here)
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(features14, labels14)

# Score the 2014 model on each later survey year, 2015 through 2018,
# collecting one score per year in chronological order.

accuracies = []
for year in range(2015, 2019):
    data_source = ACSDataSource(
        survey_year=year,
        horizon='1-Year',
        survey='person',
    )
    acs_data = data_source.get_data(states=["CA"], download=True)
    features, labels, _ = ACSPublicCoverage.df_to_numpy(acs_data)
    accuracies.append(
        model.score(features, labels)
    )


In [None]:
# Display the list of scores collected so far in `accuracies`.
accuracies

## Set up a model with new parameters
 - Decide on Features
 - Change income threshold to 25K
 - groups to compare are gender
 

In [None]:
# Custom income task: a hand-picked feature list, a 25K income threshold and
# SEX as the group attribute.
ACSIncomeNew = folktables.BasicProblem(
    features=[
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'POBP',
        'RELP',
        'WKHP',
        'SEX',
        'RAC1P',
    ],
    target='PINCP',
    # Positive class: PINCP strictly above 25,000.
    target_transform=lambda x: x > 25000,
    group='SEX',
    preprocess=folktables.adult_filter,
    # Replace missing values with -1. `nan` has to be passed by keyword: the
    # second positional parameter of np.nan_to_num is `copy`, so a positional
    # -1 would leave NaNs mapped to the default 0.0.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)

### Run this new model on 2018 data 

In [None]:
# California, 2018 1-Year person survey, converted with the custom task.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)
acs_data = data_source.get_data(states=["CA"], download=True)

# Unpacked into the feature, label and group arrays used by the cells below.
features, labels, groups = ACSIncomeNew.df_to_numpy(acs_data)


In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features,
    labels,
    groups,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Standardize the features and train the model
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

# Overall score on the held-out split. Printed explicitly because a notebook
# cell only displays its final expression.
print(model.score(X_test, y_test))

# Per-group score, measured on the held-out split only: scoring on the full
# dataset would include the rows the model was trained on.
accuracies = []
for group_value in np.unique(group_test):
    mask = group_test == group_value
    accuracies.append(model.score(X_test[mask], y_test[mask]))
print(accuracies)

# Gap between the first and second group value, in the sorted order returned
# by np.unique.
Difference = accuracies[0] - accuracies[1]
Difference