In [243]:
import re
import sys

import matplotlib
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from IPython.display import display
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import r2_score

%matplotlib inline

In [163]:
# Load the immigration table and reshape it so that each row is a census year
# and each column is a region/country.
immig = pd.read_csv('Datasets/ImmigrationByCountryAndYear.csv').T
# after transposing, the first row holds the region labels
immig_regions = immig.iloc[0]
immig = (
    immig.iloc[1:]                   # drop the label row itself
    .rename(columns=immig_regions)   # relabel the columns with the region names
    .iloc[::-1]                      # reverse the row (year) order
)
display(immig)

Unnamed: 0,Total,Reported by region and/or country,Europe,Northern and Western Europe,Northern Europe,British Isles,United Kingdom (total),Great Britain (total),England,Scotland,...,South America,Northern America,Canada,Canada-French,Canada-Other,Newfoundland,Other Northern America,Region or country not reported,Born at sea,Not reported
1850,2244602,2202625,2031867,2022195,1358887,1340812,(X),379093,278675,70550,...,1543,147711,147711,(NA),(NA),(NA),(NA),41977,(NA),41977
1860,4138697,4134809,3807062,3773347,2271661,2199079,(X),587775,431692,108518,...,3263,249970,249970,(NA),(NA),(NA),(NA),3888,2522,1366
1870,5567229,5563637,4941049,4845679,2867926,2626241,(X),770414,550924,140835,...,3565,493467,493464,(NA),(NA),(NA),3,3592,2638,954
1880,6679943,6675875,5751823,5499889,3212431,2772169,(X),917598,662676,170136,...,4566,717286,717157,(NA),(NA),(NA),129,4068,4068,(NA)
1890,9249547,9243535,8030347,7288917,4056160,3122911,(X),1251402,908141,242231,...,5006,980938,980938,302496,678442,(NA),(NA),6012,5533,479
1900,10341276,10330534,8881548,7204649,3917815,2783082,(X),1167623,840513,233524,...,4733,1179922,1179922,395126,784796,(NA),(NA),10742,8196,2546
1910,13515886,13506272,11810115,7306325,3953947,2573534,(X),1221283,877719,261076,...,8228,1209717,1209717,385083,819554,5080,(NA),9614,6927,2687
1920,13920692,13911767,11916048,6241916,3501149,2172723,(X),1135489,813853,254570,...,18551,1138174,1138174,307786,817139,13249,(NA),8925,5336,3589
1950,14204149,14197553,11784010,5850256,3415551,2147733,1402923,1224091,809563,354323,...,33623,1310369,1310369,370852,915537,23980,(NA),6596,5008,1588
1960,9738091,9678201,7256311,3334971,1694430,1171777,833055,764893,528205,213219,...,89536,952500,952500,(NA),(NA),(NA),(NA),59890,(NA),(NA)


In [152]:
# Build a Year x Name table of counts: total 'Count' per (Name, Year) pair,
# with names pivoted out into columns. A name with no rows for a year is NaN.
names = (
    pd.read_csv('Datasets/NationalNames.csv', sep=',', usecols=[1, 2, 3, 4])
    .groupby(['Name', 'Year'])['Count']
    .sum()
    .unstack(level='Name')
)
display(names.head())

Name,Aaban,Aabha,Aabid,Aabriella,Aadam,Aadan,Aadarsh,Aaden,Aadesh,Aadhav,...,Zyshon,Zyshonne,Zytaevius,Zytaveon,Zytavion,Zytavious,Zyvion,Zyyanna,Zyyon,Zzyzx
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1880,,,,,,,,,,,...,,,,,,,,,,
1881,,,,,,,,,,,...,,,,,,,,,,
1882,,,,,,,,,,,...,,,,,,,,,,
1883,,,,,,,,,,,...,,,,,,,,,,
1884,,,,,,,,,,,...,,,,,,,,,,


In [160]:
# Collapse the yearly counts into decades: every year is floored to the start
# of its decade (e.g. 1887 -> 1880) and the counts inside a decade are summed.
# `//` is used on purpose: it is an integer floor on both Python 2 and 3,
# whereas `/` becomes true division on Python 3 and would no longer bucket
# the years by decade.
# .values keeps the grouping key a plain array so the result index stays unnamed.
names_10 = names.groupby((names.index.values // 10) * 10).sum().fillna(0)
display(names_10)

Name,Aaban,Aabha,Aabid,Aabriella,Aadam,Aadan,Aadarsh,Aaden,Aadesh,Aadhav,...,Zyshon,Zyshonne,Zytaevius,Zytaveon,Zytavion,Zytavious,Zyvion,Zyyanna,Zyyon,Zzyzx
1880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [262]:
# get some useful tools out of the way now
# to check for empty values in immigration: the pattern matches strings that
# start with digits, optionally followed by one ",digits" group
# (e.g. "2244602" or "1,234"); placeholders such as "(NA)" or "(X)" do not match.
# Raw string so the \d escapes reach the regex engine untouched.
numeric = re.compile(r"\d+(?:,\d+)?")
# min-max scaling helper
def normalize(series):
    """Rescale `series` linearly so its minimum maps to 0 and its maximum to 1.

    A series whose minimum equals its maximum has a zero span, so the
    division yields NaN rather than a number.
    """
    lowest, highest = series.min(), series.max()
    span = highest - lowest
    return (series - lowest) / span
def standardize(series):
    """Return `series` shifted by its mean and divided by its standard deviation."""
    centered = series - series.mean()
    return centered / series.std()

In [259]:
# Score how well each name's per-decade count tracks each region's immigrant
# population. Both series are min-max normalized over the decades they share
# and then compared with r2_score.
# Result: region_dict[region] = [(name, score), ...] in names_10 column order.
UNSCORABLE = -10  # placeholder score for pairs that r2_score cannot handle
region_dict = {}
# short circuit example:
# for region in immig.columns[:5]:
for region in immig.columns:
    # this takes a while, so just to track progress--
    # (sys.stdout.write keeps everything for one region on a single line and
    # behaves the same on Python 2 and 3)
    sys.stdout.write('Calculating  %s  : ' % (region,))
    region_vals = immig[region].fillna('(NA)')
    # filter for only immigration years with present values; reuse the shared
    # pattern instead of repeating the regex literal
    region_vals = region_vals[region_vals.str.match(numeric.pattern)]
    # find the years both names and immigration have
    # handle string nature of immigration years
    common_years = names_10.index.intersection(region_vals.index.map(int))
    # filter immigration years by common years, handling int/str switch
    # (an explicit list, because a lazy map object is not a valid indexer)
    region_vals = region_vals[[str(year) for year in common_years]]
    # then map back to int, handling commas
    region_vals = region_vals.apply(lambda x: int(x.replace(',', '')))
    # and normalize
    region_vals = normalize(region_vals)
    # r2_score raises on empty or NaN input. A region with no common years is
    # empty here, and a region whose values are all equal is all-NaN after
    # normalize(), so such regions get the placeholder score for every name
    # instead of aborting the whole run.
    region_ok = len(region_vals) > 0 and not region_vals.isnull().any()
    # the row selection is identical for every name, so do it once per region
    names_common = names_10.loc[common_years]
    scores = []
    # short circuit example:
    # for name in names_10.columns[:10000]:
    for name_num, name in enumerate(names_10.columns):
        # more progress tracking
        if name_num % 5000 == 0:
            sys.stdout.write('%s ... ' % (name,))
        name_vals = normalize(names_common[name])
        # handle some empty arrays because 2010 exclusion
        if not region_ok or name_vals.isnull().any():
            score = UNSCORABLE
        else:
            # first value is true--actual immigration data
            # second value is predict--name data we're trying to use as model
            score = r2_score(region_vals, name_vals)
        scores.append((name, score))
    region_dict[region] = scores
    sys.stdout.write('done!\n')

 Calculating  Total  : Aaban ... Analysia ... done!
Calculating  Reported by region and/or country  : Aaban ... Analysia ... done!
Calculating  Europe  : Aaban ... Analysia ... done!
Calculating  Northern and Western Europe  : Aaban ... Analysia ... done!
Calculating  Northern Europe  : Aaban ... Analysia ... done!


In [261]:
# Rank the names of every region from best to worst score and keep the scores
# rounded to two decimals; show the ten best names per region.
for region, name_scores in list(region_dict.items()):
    # sort on the full-precision score first; rounding happens afterwards so it
    # cannot change the ranking
    ranked = sorted(name_scores, key=lambda pair: -pair[1])
    region_dict[region] = [(name, round(score, 2)) for name, score in ranked]
    # single-argument print: same text as the old print statement on Python 2,
    # and valid on Python 3 as well
    print('%s :  %s' % (region, region_dict[region][:10]))

Europe :  [('Artie', 0.87), ('Ardelia', 0.83), ('Annice', 0.79), ('Aleda', 0.76), ('Anice', 0.75), ('Adolphus', 0.73), ('Aggie', 0.72), ('Algie', 0.66), ('Ardie', 0.65), ('Archie', 0.63)]
Northern and Western Europe :  [('Bama', 0.74), ('Augusta', 0.7), ('Ada', 0.62), ('Attie', 0.61), ('Arrie', 0.54), ('Artie', 0.54), ('Addie', 0.53), ('Arch', 0.48), ('Anner', 0.47), ('Albertine', 0.44)]
Total :  [('Adel', 0.91), ('Audrie', 0.89), ('Amiel', 0.89), ('Aubry', 0.89), ('Aldo', 0.88), ('Ashby', 0.88), ('Aurora', 0.87), ('Adelina', 0.87), ('Abraham', 0.86), ('Annalee', 0.86)]
Reported by region and/or country :  [('Audrie', 0.9), ('Aurora', 0.89), ('Amiel', 0.88), ('Adel', 0.88), ('Aubry', 0.88), ('Aldo', 0.87), ('Adelina', 0.87), ('Baker', 0.86), ('Annabel', 0.86), ('Annia', 0.86)]
Northern Europe :  [('Bama', 0.7), ('Augusta', 0.67), ('Ada', 0.59), ('Attie', 0.59), ('Artie', 0.52), ('Arrie', 0.52), ('Arch', 0.51), ('Addie', 0.5), ('Anner', 0.45), ('Adda', 0.43)]
