In [6]:
!pip install hopsworks

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hopsworks
  Using cached hopsworks-3.0.5-py3-none-any.whl
Installing collected packages: hopsworks
Successfully installed hopsworks-3.0.5


In [8]:
import os

import hopsworks
import numpy as np
import pandas as pd

In [9]:
# Read the API key from the environment instead of writing it into the notebook.
# When HOPSWORKS_API_KEY is unset, None is passed and the client uses its own
# key lookup (presumably a key file or an interactive prompt - TODO confirm).
project = hopsworks.login(api_key_value=os.environ.get("HOPSWORKS_API_KEY"))
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/44182
Connected. Call `.close()` to terminate connection gracefully.


The data used for this project were obtained from the North Carolina State Board of Elections (NCSBE). Both voter registration records and voter history records are made available and updated weekly. The data are included along with the code.

In [24]:
# Maps a state code to the region it belongs to: 'Others' for U.S. territories
# or out of country, one of the four U.S. Census Bureau regions for the states
# and DC, and 'Missing' for every other value.
def states_auxiliary_function(state):
    """Return the region label for the given state code."""
    region_members = (
        # U.S. territories or out of country
        ('Others', ('AS', 'GU', 'MP', 'PR', 'VI', 'OC')),
        # the remaining groups follow the U.S. Census Bureau regions
        ('Northeast', ('CT', 'ME', 'MA', 'NH', 'RI', 'VT',
                       'NJ', 'NY', 'PA')),
        ('South', ('DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA',
                   'DC', 'WV', 'AL', 'KY', 'MS', 'TN', 'AR',
                   'LA', 'OK', 'TX')),
        ('Midwest', ('IL', 'IN', 'MI', 'OH', 'WI',
                     'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD')),
        ('West', ('AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT',
                  'WY', 'AK', 'CA', 'HI', 'OR', 'WA')),
    )

    for region, codes in region_members:
        if state in codes:
            return region

    # anything not listed above (including null values) is unclassified
    return 'Missing'

# Bins the party code: the two major parties keep their own label and every
# other value is bundled into "Others".
def parties_auxiliary_function(party_name):
    """Return "REP" or "DEM" for those codes and "Others" for anything else."""
    for major_party in ("REP", "DEM"):
        if party_name == major_party:
            return major_party
    return "Others"

class CategoricalVariableHandling:
    """Load the voter CSV and derive the categorical and age columns.

    Every method works on ``self.df`` (adding a column or replacing the
    frame) and returns nothing.
    """

    def __init__(self, csv_path="UC_gen_2020") -> None:
        """Read the voter records into ``self.df``.

        Args:
            csv_path: Location of the CSV file to load. Defaults to
                "UC_gen_2020", the file this notebook reads.
        """
        self.df = pd.read_csv(csv_path)

    def perform_binning_for_states(self):
        """Add a 'State Region' column computed from 'birth_state'."""
        self.df['State Region'] = self.df['birth_state'].apply(
            states_auxiliary_function)

    def perform_binning_for_parties(self):
        """Add a 'Political Party' column computed from 'party_cd'."""
        self.df['Political Party'] = self.df['party_cd'].apply(
            parties_auxiliary_function)

    def calculate_age(self, election_year=2020):
        """Add an 'Age' column equal to ``election_year - birth_year``.

        Args:
            election_year: Year in which the age is measured. Defaults to
                2020, the election year.
        """
        self.df['Age'] = election_year - self.df['birth_year']

    def select_final_dataframe(self):
        """Reduce ``self.df`` to the derived columns plus the driver's
        license, ethnic code and gender code columns."""
        self.df = self.df[[
            'State Region', 'Political Party', 'Age', 'drivers_lic',
            'ethnic_code', 'gender_code']]


class PerformAggregation(CategoricalVariableHandling):
    """Run the preprocessing steps and aggregate voter ages per category group."""

    def __init__(self) -> None:
        """Load the data and apply every preprocessing step of the parent class."""
        super().__init__()
        self.perform_binning_for_states()
        self.perform_binning_for_parties()
        self.calculate_age()
        self.select_final_dataframe()

    def get_aggregated_values(self):
        """Replace ``self.df`` with age statistics per category combination.

        Rows are grouped by region, party, gender code, ethnic code and
        driver's license flag; for each group the mean, median, minimum and
        maximum of 'Age' and the number of rows are computed.
        """
        # String names select pandas' own groupby reductions; recent pandas
        # versions warn when the equivalent NumPy callables are passed instead.
        self.df = self.df.groupby([
            'State Region', 'Political Party', 'gender_code',
            'ethnic_code', 'drivers_lic']).agg(
            Mean_Age=('Age', 'mean'),
            Median_Age=('Age', 'median'),
            Min_Age=('Age', 'min'),
            Max_Age=('Age', 'max'),
            Voter_Count=('Age', 'size'),
            ).reset_index()

    def get_final_dataframe(self):
        """Aggregate the data and shape it for the feature store.

        Adds a 1-based ``p_key`` column and renames the columns to the
        lower-case names used by the feature group.
        """
        self.get_aggregated_values()
        self.df['p_key'] = range(1, len(self.df) + 1)
        # Rename by name rather than by position so a change in column order
        # cannot silently mislabel a column.
        # NOTE: 'voter_vount' is a typo for "voter_count", kept because the
        # feature-description cell refers to the feature under this name.
        self.df = self.df.rename(columns={
            'State Region': 'state_region',
            'Political Party': 'political_party',
            'Mean_Age': 'mean_age',
            'Median_Age': 'median_age',
            'Min_Age': 'min_age',
            'Max_Age': 'max_age',
            'Voter_Count': 'voter_vount',
        })


# Build the pipeline object (its constructor loads and preprocesses the data),
# then aggregate it; the result is left in `perform_aggregation_object.df`.
perform_aggregation_object = PerformAggregation()
perform_aggregation_object.get_final_dataframe()



Index(['state_region', 'political_party', 'gender_code', 'ethnic_code',
       'drivers_lic', 'mean_age', 'median_age', 'min_age', 'max_age',
       'voter_vount', 'p_key'],
      dtype='object')

In [25]:
# Get the "voter_fg" feature group (version 1) from the feature store, or
# create it from the metadata below if it does not exist yet.
# `p_key` is declared as the primary key. online_enabled=True presumably also
# makes the group available in the online store - TODO confirm in Hopsworks docs.
voter_fg = fs.get_or_create_feature_group(
    name="voter_fg",
    version=1,
    description="Voter data with categorical variables and aggregation",
    primary_key=['p_key'],
    online_enabled=True
)

In [26]:
# Insert the aggregated dataframe into the feature group. In the recorded run
# this uploads the rows and launches an offline backfill job.
voter_fg.insert(perform_aggregation_object.df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/44182/fs/44101/fg/46580


Uploading Dataframe: 0.00% |          | Rows 0/290 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/44182/jobs/named/voter_fg_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x7f5cc48d6590>, None)

In [27]:
# Attach a human-readable description to every feature of the feature group.
# Each entry pairs a feature name with the text stored for it.
# NOTE: "voter_vount" is a typo for "voter_count"; it has to match the column
# name under which the feature was inserted, so it is kept as-is here.

feature_descriptions = [
    {"name": "state_region", "description": "It bins the state into Northeast, South, Midwest, West, Others and Missing"},
    {"name": "political_party", "description": "It bins the political party into Democratic, Republican and Others"},
    {"name": "gender_code", "description": "Contains information about the gender code"},
    {"name": "ethnic_code", "description": "Contains information about the ethnic code"},
    {"name": "drivers_lic", "description": "Contains information regarding whether the voters have a driver's license or not"},
    {"name": "mean_age", "description": "Contains information regarding the mean age of the voters in the group"},
    {"name": "median_age", "description": "Contains information regarding the median age of the voters in the group"},
    {"name": "min_age", "description": "Contains information regarding the minimum age of the voters in the group"},
    {"name": "max_age", "description": "Contains information regarding the maximum age of the voters in the group"},
    {"name": "voter_vount", "description": "Contains information regarding the number of voters in the group"},
    {"name": "p_key", "description": "This feature is used as a primary key"},
]

for feature in feature_descriptions:
    voter_fg.update_feature_description(feature["name"], feature["description"])