In [2]:
import pandas as pd
import random
import numpy as np
import math
import sys
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import xgboost as xgb

from xgboost import plot_tree
from os import walk
from os import listdir
from os.path import isfile, join
from scipy.spatial.distance import euclidean, pdist, squareform
from scipy.stats import skew
from scipy.special import expit as sigmoid
from scipy.cluster.hierarchy import fclusterdata
from pandas.plotting import autocorrelation_plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install datatable



In [3]:
import datatable as dt

In [4]:
# Field names for the 45-column voter file, in file order.
# Presumably the NYS voter file referenced in the commented-out fread call below - confirm.
columns = [
  # name
  'last_name', 'first_name', 'middle_name', 'name_suffix',
  # residence address
  'house_number', 'house_fractional_addr', 'residence_apartment',
  'residence_pre_street_direction', 'residence_street_name',
  'residence_post_street_direction', 'residence_city',
  'residence_zip_code_5', 'residence_zip_code_4',
  # mailing address
  'mail_addr1', 'mail_addr2', 'mail_addr3', 'mail_addr4',
  # demographics and party
  'dob', 'gender', 'political_party', 'other_party',
  # districts
  'county_code', 'election_district', 'legislative_district', 'town_city',
  'ward', 'congressional_district', 'senate_district', 'assembly_district',
  # voting and registration history
  'last_date_voted', 'last_year_voted', 'last_county_voted',
  'last_registered_address', 'last_registered_name',
  'county_voter_registration_no', 'application_date', 'application_source',
  'identification_required_flag',
  'identification_verification_requirement_met_flag',
  # status
  'voter_status_codes', 'status_reason_codes', 'inactive_voter_date',
  'purge_voter_date', 'unique_nys_voter_id', 'voter_history',
]

# Subset of fields to keep, selected by position in `columns`.
selective_columns = (
  columns[0:7]        # last_name .. residence_apartment
  + columns[8:9]      # residence_street_name
  + columns[10:12]    # residence_city, residence_zip_code_5
  + columns[17:22]    # dob .. county_code
  + columns[29:35]    # last_date_voted .. county_voter_registration_no
  + columns[40:41]    # status_reason_codes
  + columns[44:45]    # voter_history
)
# df = dt.fread('/content/drive/Shared drives/Political Polarization/data/AllNYSVoters_2012/AllNYSVoters.txt', columns=columns)

In [5]:
# Names for all 38 columns of the voter registration file, in file order.
registration_file_headers = [
  # identity
  'county_code', 'voter_id',
  'last_name', 'suffix', 'first_name', 'middle_name',
  'requested_public_records_exemption',
  # residence address
  'residence_addr_line_1', 'residence_addr_line_2',
  'residence_city', 'residence_state', 'residence_zipcode',
  # mailing address
  'mail_addr_line_1', 'mail_addr_line_2', 'mail_addr_line_3',
  'mail_city', 'mail_state', 'mail_zipcode', 'mail_country',
  # demographics and registration
  'gender', 'race', 'birth_date', 'registration_date', 'party_affiliation',
  # precinct and status
  'precinct', 'precinct_group', 'precinct_split', 'precinct_suffix',
  'voter_status',
  # districts
  'congressional_district', 'house_district', 'senate_district',
  'county_commission_district', 'school_board_district',
  # contact
  'daytime_area_code', 'daytime_phone_no', 'daytime_phone_extension',
  'email_address',
]

# The 17 columns actually loaded; every name also appears in registration_file_headers.
selective_headers = [
  'county_code', 'voter_id',
  'last_name', 'first_name', 'middle_name',
  'residence_addr_line_1', 'residence_addr_line_2',
  'residence_city', 'residence_state', 'residence_zipcode',
  'gender', 'race', 'birth_date', 'registration_date', 'party_affiliation',
  'voter_status',
  'email_address',
]

In [8]:
# Load the tab-separated BRO_20190813 voter-detail file, keeping only the columns in selective_headers.
# NOTE(review): header=0 makes pandas treat the first line of the file as a header row and
# replace it with `names`. If the extract has no header line, the first voter record is
# silently dropped and header=None would be needed - confirm against the raw file.
df = pd.read_csv(
  '~/Windows/Documents/CSE519/project/data/Florida_201908/20190813_VoterDetail/BRO_20190813.txt',
  sep='\t',
  header=0,
  names=registration_file_headers,
  usecols=selective_headers,
  encoding='ISO-8859-1',
)

In [9]:
# Row and column counts of the freshly loaded frame.
df.shape

(1259869, 17)

In [10]:
# Number of missing values in each column, before any rows are dropped.
df.isna().sum()

county_code                   0
voter_id                      0
last_name                     0
first_name                    0
middle_name              310047
residence_addr_line_1         0
residence_addr_line_2         0
residence_city                3
residence_state               0
residence_zipcode             0
gender                       41
race                          0
birth_date                 4678
registration_date             0
party_affiliation             0
voter_status                  0
email_address                 0
dtype: int64

In [11]:
# Keep only the rows that have a value in every one of these columns.
req_cols = [
  'residence_city',
  'gender',
  'birth_date',
]
df = df.dropna(subset=req_cols, how='any')

In [12]:
# Re-count missing values per column; the columns in req_cols should now report zero.
df.isna().sum()

county_code                   0
voter_id                      0
last_name                     0
first_name                    0
middle_name              310026
residence_addr_line_1         0
residence_addr_line_2         0
residence_city                0
residence_state               0
residence_zipcode             0
gender                        0
race                          0
birth_date                    0
registration_date             0
party_affiliation             0
voter_status                  0
email_address                 0
dtype: int64

In [13]:
# Row and column counts after rows with missing required fields were removed.
df.shape

(1255147, 17)

In [14]:
from datetime import datetime
from datetime import date

def calculate_age(born, today=None):
  """Return the age in completed years for a birth date given as text.

  Args:
    born: Birth date in MM/DD/YYYY form. The value is converted with str()
      and surrounding whitespace is removed before parsing.
    today: Optional datetime.date to measure the age against. When omitted,
      the current date is used, so results then depend on the day the code runs.
      Pass a fixed date for reproducible output.

  Returns:
    int: number of full years between `born` and `today`.

  Raises:
    ValueError: if `born` does not match the format %m/%d/%Y.
  """
  birth_date = datetime.strptime(str(born).strip(), "%m/%d/%Y").date()
  if today is None:
    today = date.today()
  # True (counted as 1) when this year's birthday has not been reached yet.
  birthday_pending = (today.month, today.day) < (birth_date.month, birth_date.day)
  return today.year - birth_date.year - birthday_pending

# Derive an age column by running calculate_age on every birth_date value.
df['age'] = df['birth_date'].map(calculate_age)

In [15]:
# Spot-check the derived ages against the birth dates for the first five rows.
df[['age', 'birth_date']].head()

Unnamed: 0,age,birth_date
0,68,11/07/1951
1,34,12/17/1984
2,44,10/22/1975
3,67,08/02/1952
4,48,05/31/1971


In [16]:
# Preview the first five rows with the new age column.
# NOTE(review): the rendered output shows individual voter names and addresses - consider clearing it before sharing.
df.head()

Unnamed: 0,county_code,voter_id,last_name,first_name,middle_name,residence_addr_line_1,residence_addr_line_2,residence_city,residence_state,residence_zipcode,gender,race,birth_date,registration_date,party_affiliation,voter_status,email_address,age
0,BRO,101394357,Cruz,Stephen,Anthony,200 NW 43rd Ave,,Coconut Creek,,330661702,M,5,11/07/1951,04/18/1984,NPA,ACT,,68
1,BRO,117596571,Hossain,Danielle,Ann,8040 NW 96Th Ter,APT 107,Tamarac,,333211357,F,5,12/17/1984,10/11/2010,DEM,ACT,,34
2,BRO,117702085,Ulysse,Marline,,3000 SW 64Th Ter,,Miramar,,33023,F,3,10/22/1975,09/27/2010,DEM,ACT,,44
3,BRO,101356402,Ward-Jones,Patricia,Ann,4921 NW 17th St,,Lauderhill,,33313,F,3,08/02/1952,03/27/1982,DEM,ACT,,67
4,BRO,118484365,Drouncheck,Eric,,712 SW 158th Ter,,Pembroke Pines,,330275000,M,5,05/31/1971,10/11/2010,REP,ACT,,48


In [17]:
# Text columns that are normalised to lower case.
str_cols_lower = [
  'last_name', 'first_name', 'middle_name',
  'residence_addr_line_1', 'residence_addr_line_2',
  'residence_city', 'residence_state',
  'email_address',
]

# Code-like columns that are normalised to upper case.
str_cols_upper = [
  'county_code', 'gender', 'party_affiliation', 'voter_status',
]

In [18]:
# Trim surrounding whitespace and normalise letter case of the text columns.
# Missing values are replaced by '' first: calling str() on NaN would otherwise store the
# literal text 'nan' (for example as a middle name), which cannot be told apart from real data.
# Every column stays all-string, so later string joins on these columns keep working.
for col in str_cols_lower:
  df[col] = df[col].fillna('').astype(str).str.strip().str.lower()
for col in str_cols_upper:
  df[col] = df[col].fillna('').astype(str).str.strip().str.upper()

In [19]:
# First five characters of the ZIP code, stored as an integer.
df['residence_zipcode_5'] = df['residence_zipcode'].map(lambda code: int(str(code)[:5]))

In [20]:
def generate_zipcode_4(zip):
  """Return the last four digits of a ZIP code longer than five characters.

  Args:
    zip: ZIP code as int, float or text (e.g. 330661702 or '33066-1702').
      The name shadows the builtin `zip`; it is kept so keyword callers still work.

  Returns:
    int: the trailing four digits when the code is longer than five characters.
      Leading zeros of that part are lost because an int is returned.
    str: '' when the code has five characters or fewer, or is NaN.
  """
  if isinstance(zip, float):
    # pandas stores the column as float once any value is missing; str() of such a
    # value ends in '.0', which would corrupt the last-four-characters slice.
    if math.isnan(zip):
      return ''
    zip = int(zip)
  text = str(zip).strip()
  if len(text) > 5:
    return int(text[-4:])
  return ''

# ZIP+4 suffix for each row, as returned by generate_zipcode_4.
df['residence_zipcode_4'] = df['residence_zipcode'].map(generate_zipcode_4)

In [21]:
# Build one address key per row from street line 1, street line 2 and city, separated by single spaces.
# Column-wise concatenation yields the same text as joining the three values row by row,
# without calling a Python function once per row.
df['uniq_addr'] = (
  df['residence_addr_line_1'] + ' '
  + df['residence_addr_line_2'] + ' '
  + df['residence_city']
)

In [22]:
# Preview the first five rows after normalisation and the derived ZIP / address columns.
# NOTE(review): the rendered output shows individual voter names and addresses - consider clearing it before sharing.
df.head()

Unnamed: 0,county_code,voter_id,last_name,first_name,middle_name,residence_addr_line_1,residence_addr_line_2,residence_city,residence_state,residence_zipcode,...,race,birth_date,registration_date,party_affiliation,voter_status,email_address,age,residence_zipcode_5,residence_zipcode_4,uniq_addr
0,BRO,101394357,cruz,stephen,anthony,200 nw 43rd ave,,coconut creek,,330661702,...,5,11/07/1951,04/18/1984,NPA,ACT,,68,33066,1702.0,200 nw 43rd ave coconut creek
1,BRO,117596571,hossain,danielle,ann,8040 nw 96th ter,apt 107,tamarac,,333211357,...,5,12/17/1984,10/11/2010,DEM,ACT,,34,33321,1357.0,8040 nw 96th ter apt 107 tamarac
2,BRO,117702085,ulysse,marline,,3000 sw 64th ter,,miramar,,33023,...,3,10/22/1975,09/27/2010,DEM,ACT,,44,33023,,3000 sw 64th ter miramar
3,BRO,101356402,ward-jones,patricia,ann,4921 nw 17th st,,lauderhill,,33313,...,3,08/02/1952,03/27/1982,DEM,ACT,,67,33313,,4921 nw 17th st lauderhill
4,BRO,118484365,drouncheck,eric,,712 sw 158th ter,,pembroke pines,,330275000,...,5,05/31/1971,10/11/2010,REP,ACT,,48,33027,5000.0,712 sw 158th ter pembroke pines


In [None]:
# Map each last name to the number of rows carrying it, then keep the names seen more than once.
last_name_counts = df.last_name.value_counts().to_dict()
last_names = {name for name, occurrences in last_name_counts.items() if occurrences > 1}

In [36]:
# Group rows that share both the address key and the last name.
# Presumably an approximation of households - confirm with the analysis that follows.
gb = df.groupby(['uniq_addr', 'last_name'])

In [64]:
# Tally how many (address, last name) groups hold more than one non-null first_name (True) versus at most one (False).
gb['first_name'].count().gt(1).value_counts()

False    734576
True     218614
Name: first_name, dtype: int64