In [1]:
import pandas as pd
import random
import numpy as np
import time
import math
import sys
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import datatable as dt

from tqdm import tqdm
from xgboost import plot_tree
from os import walk
from os import listdir
from os.path import isfile, join
from scipy.spatial.distance import euclidean, pdist, squareform
from scipy.stats import skew
from scipy.special import expit as sigmoid
from scipy.cluster.hierarchy import fclusterdata
from pandas.plotting import autocorrelation_plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
ny_cols = ["last_name", "first_name", "middle_name", "name_suffix", "house_number", "house_fractional_addr", "residence_apartment", "residence_pre_street_direction", "residence_street_name", "residence_post_street_direction", "residence_city", "residence_zip_code_5", "residence_zip_code_4", "mail_addr1", "mail_addr2", "mail_addr3", "mail_addr4", "dob", "gender", "political_party", "other_party", "county_code", "election_district", "legislative_district", "town_city", "ward", "congressional_district", "senate_district", "assembly_district", "last_date_voted", "last_year_voted", "last_county_voted", "last_registered_address", "last_registered_name", "county_voter_registration_no", "application_date", "application_source", "identification_required_flag", "identification_verification_requirement_met_flag", "voter_status_codes", "status_reason_codes", "inactive_voter_date", "purge_voter_date", "unique_nys_voter_id", "voter_history"]


In [3]:
selective_headers = [
    'first_name',
    'last_name',
    'dob',
    'county_code',
    'house_number',
    'residence_apartment',
    'residence_street_name',
    'residence_city',
    'residence_zip_code_5',
    'gender',
    'unique_nys_voter_id',
    'political_party',
    'voter_status_codes'
]

# precinct and race was not found

In [4]:
new_york_path = "data/NewYork"

In [5]:
global_df = pd.read_csv(new_york_path + "/AllNYSVoters.txt", encoding='iso-8859-1', names=ny_cols, usecols=selective_headers)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
global_df.isnull().sum()

last_name                    203
first_name                    80
house_number               30704
residence_apartment      9512120
residence_street_name       1346
residence_city                 0
residence_zip_code_5          16
dob                            0
gender                         0
political_party                0
county_code                    0
voter_status_codes             0
unique_nys_voter_id            0
dtype: int64

In [7]:
global_df = global_df.dropna()

In [8]:
global_df.dtypes

last_name                 object
first_name                object
house_number              object
residence_apartment       object
residence_street_name     object
residence_city            object
residence_zip_code_5     float64
dob                        int64
gender                    object
political_party           object
county_code                int64
voter_status_codes        object
unique_nys_voter_id       object
dtype: object

In [9]:
str_cols_lower = [
    'last_name',
    'first_name',
    'house_number',
    'residence_apartment',
    'residence_street_name',
    'residence_city'
]

str_cols_upper = [
    'gender',
    'political_party',
    'voter_status_codes'
]

In [10]:
for col in str_cols_lower:
    global_df[col] = global_df[col].apply(lambda x: str(x).strip().lower())
for col in str_cols_upper:
    global_df[col] = global_df[col].apply(lambda x: str(x).strip().upper())

In [11]:
from datetime import datetime
from datetime import date

# This is different from Florida Heuristic
def calculate_age(born):
    born = datetime.strptime(str(born), "%Y%m%d").date()
    today = datetime.strptime("20121231", "%Y%m%d").date()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [12]:
global_df['age'] = global_df['dob'].apply(calculate_age)

In [13]:
global_df['uniq_addr'] = global_df[['house_number', 'residence_apartment', 'residence_street_name', 'residence_city', 'residence_zip_code_5']].apply(lambda x: ' '.join([str(y) for y in x]), axis=1)

In [14]:
import re
global_df['uniq_addr'] = global_df['uniq_addr'].apply(lambda x: re.sub("[^0-9a-zA-Z\s]+", '', x))

In [15]:
global_df['uniq_addr'] = global_df['uniq_addr'].apply(lambda x: x.strip())

In [17]:
global_df.shape

(5458530, 15)

In [18]:
county_codes = list(global_df["county_code"].unique())

## Splitting data into mulitple partitions based on county codes

In [20]:
import os

if not os.path.exists(new_york_path + "/20121231_county_files"):
    os.makedirs(new_york_path + "/20121231_county_files")

In [21]:
# print("Total counties to process: " + str(len(county_codes)))
for code in tqdm(county_codes):
    county_df = global_df[global_df['county_code'] == code]
    file_path = new_york_path + "/20121231_county_files/county_" + "20121231"  + "_" + str(code) + ".csv"
    county_df.to_csv(file_path, sep="\t", index=False)

100%|██████████| 62/62 [01:01<00:00,  1.01it/s]


In [22]:
global_df[["dob", "age"]].head()

Unnamed: 0,dob,age
8,19591230,53
17,19440226,68
19,19440206,68
21,19870214,25
25,19800521,32
