In [1]:
import pandas as pd
import random
import numpy as np
import time
import math
import sys
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import datatable as dt

from xgboost import plot_tree
from os import walk
from os import listdir
from os.path import isfile, join
from scipy.spatial.distance import euclidean, pdist, squareform
from scipy.stats import skew
from scipy.special import expit as sigmoid
from scipy.cluster.hierarchy import fclusterdata
from pandas.plotting import autocorrelation_plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
ny_cols = ["last_name", "first_name", "middle_name", "name_suffix", "house_number", "house_fractional_addr", "residence_apartment", "residence_pre_street_direction", "residence_street_name", "residence_post_street_direction", "residence_city", "residence_zip_code_5", "residence_zip_code_4", "mail_addr1", "mail_addr2", "mail_addr3", "mail_addr4", "dob", "gender", "political_party", "other_party", "county_code", "election_district", "legislative_district", "town_city", "ward", "congressional_district", "senate_district", "assembly_district", "last_date_voted", "last_year_voted", "last_county_voted", "last_registered_address", "last_registered_name", "county_voter_registration_no", "application_date", "application_source", "identification_required_flag", "identification_verification_requirement_met_flag", "voter_status_codes", "status_reason_codes", "inactive_voter_date", "purge_voter_date", "unique_nys_voter_id", "voter_history"]


In [3]:
selective_headers = [
    'first_name',
    'last_name',
    'dob',
    'county_code',
    'house_number',
    'residence_apartment',
    'residence_street_name',
    'residence_city',
    'residence_zip_code_5',
    'gender',
    'unique_nys_voter_id',
    'political_party',
    'voter_status_codes'
]

# precinct and race was not found

In [4]:
new_york_path = "data/NewYork"

In [5]:
COUNTY=sys.argv[1]

In [6]:
date_str = sys.argv[2]

## Uncomment below lines and add parameters manually when exploring through notebook instead of python-script

In [37]:
# date_str = "20121231"
# COUNTY = "31"

In [9]:
import os
if not os.path.exists(new_york_path + "/couples/" + date_str):
    os.makedirs(new_york_path + "/couples/" + date_str)

In [10]:
source_county_file_name = "county_" + date_str + "_" + COUNTY + ".csv"

In [11]:
source_county_file_name

'county_20121231_31.csv'

In [12]:
COUPLES_SAVED_PATH = new_york_path + "/couples/" + date_str + "/" + "couples_" + date_str + "_" + COUNTY + ".csv"

In [13]:
COUPLES_SAVED_PATH

'data/NewYork/couples/20121231/couples_20121231_31.csv'

In [14]:
global_df = pd.read_csv(new_york_path + "/" + date_str + "_county_files/" + source_county_file_name, sep="\t",  encoding='iso-8859-1')

In [15]:
global_df.head()

Unnamed: 0,last_name,first_name,house_number,residence_apartment,residence_street_name,residence_city,residence_zip_code_5,dob,gender,political_party,county_code,voter_status_codes,unique_nys_voter_id,age,uniq_addr
0,krieger,nicole,405,14m,east 54 street,manhattan,10022.0,19710223,F,DEM,31,ACTIVE,NY000000000034125728,41,405 14m east 54 street manhattan 100220
1,law,yuet kwai,80,3a,beekman street,new york,10038.0,19650424,F,BLK,31,ACTIVE,NY000000000034125731,47,80 3a beekman street new york 100380
2,mody,sushama,500,apt 1410,west 56 street,new york,10019.0,19741001,M,DEM,31,ACTIVE,NY000000000034125809,38,500 apt 1410 west 56 street new york 100190
3,barrera,john,237,1,west 11 street,manhattan,10014.0,19720228,M,DEM,31,ACTIVE,NY000000000034125923,40,237 1 west 11 street manhattan 100140
4,giordano,mary,225,9c,rector place,manhattan,10280.0,19600407,F,REP,31,ACTIVE,NY000000000034126270,52,225 9c rector place manhattan 102800


In [16]:
global_df_copy = global_df.copy(deep=True)

In [17]:
merge = pd.merge(global_df, global_df_copy, on=["uniq_addr"], suffixes=["_L", "_R"])

In [18]:
merge = merge[merge["unique_nys_voter_id_L"] != merge["unique_nys_voter_id_R"]]

In [19]:
merge.shape

(1481180, 29)

In [20]:
filtered = merge[merge["unique_nys_voter_id_L"] < merge["unique_nys_voter_id_R"]]

In [21]:
filtered.shape

(740590, 29)

In [22]:
def modified_couple_heuristic(row):
    male_age_threshold = 27
    female_age_threshold = 25
    unknown_age_threshold = 26
    age_diff_threshold = 15
    
    age_diff = abs(row['age_L'] - row['age_R'])
    
    is_age_threshold_L = False
    if row["gender_L"] == "M" and row["age_L"] >= male_age_threshold: 
        is_age_threshold_L = True
    elif row["gender_L"] == "F" and row["age_L"] >= female_age_threshold:
        is_age_threshold_L = True
    elif row["gender_L"] == "U" and row["age_L"] >= unknown_age_threshold:
        is_age_threshold_L = True

    
        
    is_age_threshold_R = False
    if row["gender_R"] == "M" and row["age_R"] >= male_age_threshold: 
        is_age_threshold_R = True
    elif row["gender_R"] == "F" and row["age_R"] >= female_age_threshold:
        is_age_threshold_R = True
    elif row["gender_R"] == "U" and row["age_R"] >= unknown_age_threshold:
        is_age_threshold_R = True
    
    return is_age_threshold_L and is_age_threshold_R and age_diff <= age_diff_threshold
        

In [23]:
filtered.columns

Index(['last_name_L', 'first_name_L', 'house_number_L',
       'residence_apartment_L', 'residence_street_name_L', 'residence_city_L',
       'residence_zip_code_5_L', 'dob_L', 'gender_L', 'political_party_L',
       'county_code_L', 'voter_status_codes_L', 'unique_nys_voter_id_L',
       'age_L', 'uniq_addr', 'last_name_R', 'first_name_R', 'house_number_R',
       'residence_apartment_R', 'residence_street_name_R', 'residence_city_R',
       'residence_zip_code_5_R', 'dob_R', 'gender_R', 'political_party_R',
       'county_code_R', 'voter_status_codes_R', 'unique_nys_voter_id_R',
       'age_R'],
      dtype='object')

In [24]:
filtered.shape

(740590, 29)

In [25]:
couples = filtered[filtered.apply(modified_couple_heuristic, axis=1)]

In [26]:
couples.shape

(375416, 29)

In [27]:
couples.head()

Unnamed: 0,last_name_L,first_name_L,house_number_L,residence_apartment_L,residence_street_name_L,residence_city_L,residence_zip_code_5_L,dob_L,gender_L,political_party_L,...,residence_street_name_R,residence_city_R,residence_zip_code_5_R,dob_R,gender_R,political_party_R,county_code_R,voter_status_codes_R,unique_nys_voter_id_R,age_R
10,pastrano,jason,70,24f,little west street,manhattan,10280.0,19720329,M,BLK,...,little west street,manhattan,10280.0,19700106,F,DEM,31,ACTIVE,NY000000000038236817,42
16,molluso,may,211,5j,central park west,manhattan,10024.0,19731124,F,BLK,...,central park west,manhattan,10024.0,19690224,M,REP,31,ACTIVE,NY000000000037997015,43
26,lynch,carlinda,860,17b,columbus avenue,manhattan,10025.0,19511114,F,REP,...,columbus avenue,manhattan,10025.0,19480315,F,DEM,31,ACTIVE,NY000000000037608370,64
36,eng,han,1619,7-c,3 avenue,manhattan,10128.0,19380928,F,REP,...,3 avenue,manhattan,10128.0,19500828,F,DEM,31,PURGED,NY000000000037673647,62
44,deblois,brian,75,10d,west street,manhattan,10006.0,19710526,M,DEM,...,west street,manhattan,10006.0,19731204,M,BLK,31,ACTIVE,NY000000000035440396,39


In [28]:
global_df.shape

(1303036, 15)

In [29]:
couples["age_diff"] = couples.apply(lambda row: abs(row["age_L"] - row["age_R"]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [30]:
sorted_couples = couples.sort_values(by="age_diff")

In [31]:
single_house_couples = sorted_couples.drop_duplicates(subset="uniq_addr", keep="first")

In [32]:
single_house_couples.shape

(213464, 30)

In [33]:
global_df.shape

(1303036, 15)

In [34]:
COUPLES_SAVED_PATH

'data/NewYork/couples/20121231/couples_20121231_31.csv'

In [35]:
single_house_couples.to_csv(COUPLES_SAVED_PATH, sep="\t", index=False)

In [39]:
print("Done processing " + COUPLES_SAVED_PATH)

Done processing data/NewYork/couples/20121231/couples_20121231_31.csv
