In [17]:
import re
import sys

import matplotlib
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from IPython.display import display
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import r2_score

%matplotlib inline

In [26]:
# Load the immigration counts and reshape them to one row per year.
# After the transpose, the first row carries the region names (the raw file's
# first column), so it is peeled off and used as the column labels.
immig_raw = pd.read_csv('Datasets/ImmigrationByRegion.csv').transpose()
immig_regions = immig_raw.iloc[0]
immig = (
    immig_raw
    .iloc[1:]                        # drop the region-name row itself
    .rename(columns=immig_regions)   # default integer column labels -> region names
    .iloc[::-1]                      # flip the row order
)
display(immig)

Unnamed: 0,Europe,Asia,Africa,Oceania,Americas
1850,2031867,1135,551,588,168484
1860,3807062,36796,526,2140,288285
1870,4941049,64565,2657,4028,551338
1880,5751823,107630,2204,6859,807359
1890,8030347,113383,2207,9353,1088245
1900,8881548,120248,2538,8820,1317380
1910,11810115,191484,3992,11450,1489231
1920,11916048,237950,16126,14626,1727017
1950,11784010,275665,18326,17343,2102209
1960,7256311,490996,35355,34730,1860809


In [27]:
# Build a Year x Name table of name counts.
# Rows sharing a (Name, Year) pair are summed into one count, then the names are
# pivoted out to columns; a name with no rows for a given year shows up as NaN.
names_long = pd.read_csv(
    'Datasets/NationalNames.csv',
    delimiter=',',
    usecols=[1, 2, 3, 4],
)
names = (
    names_long
    .groupby(['Name', 'Year'])['Count']
    .sum()
    .unstack(level=0)
)
display(names.head())

Name,Aaban,Aabha,Aabid,Aabriella,Aadam,Aadan,Aadarsh,Aaden,Aadesh,Aadhav,...,Zyshon,Zyshonne,Zytaevius,Zytaveon,Zytavion,Zytavious,Zyvion,Zyyanna,Zyyon,Zzyzx
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1880,,,,,,,,,,,...,,,,,,,,,,
1881,,,,,,,,,,,...,,,,,,,,,,
1882,,,,,,,,,,,...,,,,,,,,,,
1883,,,,,,,,,,,...,,,,,,,,,,
1884,,,,,,,,,,,...,,,,,,,,,,


In [28]:
# Collapse yearly counts into decades: every year is floored to the start of its
# decade (1887 -> 1880) and the counts inside a decade are summed per name.
# Floor division (//) is spelled out on purpose: with true division
# (y / 10) * 10 is just y again, so no years would be grouped together.
decade_of_year = (names.index.values // 10) * 10
# fillna(0): a name with no recorded count in a decade counts as zero
names_10 = names.groupby(decade_of_year).sum().fillna(0)
display(names_10)

Name,Aaban,Aabha,Aabid,Aabriella,Aadam,Aadan,Aadarsh,Aaden,Aadesh,Aadhav,...,Zyshon,Zyshonne,Zytaevius,Zytaveon,Zytavion,Zytavious,Zyvion,Zyyanna,Zyyon,Zzyzx
1880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# get some useful tools out of the way now
# Pattern for a count such as "1135" or "1,135": digits, optionally followed by
# one comma-separated digit group. Meant for checking which immigration cells
# hold a real number, e.g. s[s.str.match(numeric.pattern)] on a string Series s.
# Written as a raw string because "\d" in a plain literal is an invalid escape
# sequence.
numeric = re.compile(r"\d+(?:,\d+)?")
# to normalize series
def normalize(series):
    """Min-max scale ``series``: its smallest value maps to 0, its largest to 1.

    For a numeric pandas Series whose values are all equal, the range is zero
    and the result is NaN in every position rather than an exception.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    return (series - lowest) / value_range
def standardize(series):
    """Centre ``series`` on its mean and divide by its ``.std()`` (z-scores)."""
    centered = series - series.mean()
    spread = series.std()
    return centered / spread




In [35]:
# Score how well each name's popularity curve tracks each region's immigration
# curve. Both curves are min-max scaled over the decades present in both tables
# and then compared with r2_score.
# Result: region_dict[region] is a list of (name, score) tuples, in names_10
# column order.
PROGRESS_EVERY = 5000   # write a progress marker every this many names
UNSCORABLE = -10        # score recorded when a name's scaled curve contains NaN

# The decades shared by the two tables depend only on their indexes, not on the
# region, so the name-side preparation is done once instead of once per region.
# immig is indexed by year strings, names_10 by integer decades.
common_years = names_10.index.intersection(immig.index.map(int))
common_year_labels = [str(year) for year in common_years]
# normalize() works column-wise on a DataFrame, so every name is scaled at once
names_scaled = normalize(names_10.loc[common_years])

region_dict = {}
# short circuit example:
# for region in immig.columns[:5]:
for region in immig.columns:
    # this takes a while, so just to track progress--
    sys.stdout.write('Calculating  %s  :' % region)
    # NOTE(review): missing cells become '(NA)', which the int() conversion below
    # rejects with ValueError -- filter such years out first if the data can
    # contain gaps.
    region_vals = immig[region].fillna('(NA)').loc[common_year_labels]
    # counts may be written with thousands separators ("1,135"); str() keeps the
    # conversion working if a cell is already numeric
    region_vals = region_vals.apply(lambda x: int(str(x).replace(',', '')))
    # and normalize
    region_vals = normalize(region_vals)

    # registered before the inner loop so partial results survive an interrupt
    region_scores = []
    region_dict[region] = region_scores
    # short circuit example:
    # for name in names_10.columns[:10000]:
    for name_num, name in enumerate(names_10.columns):
        # more progress tracking
        if name_num % PROGRESS_EVERY == 0:
            sys.stdout.write(' %s ...' % name)
        name_vals = names_scaled[name]
        # a name whose counts are identical across the shared decades (e.g. all
        # zero) has zero range, so normalize() turned it into NaN
        if name_vals.isnull().any():
            score = UNSCORABLE
        else:
            # first value is true--actual immigration data
            # second value is predict--name data we're trying to use as model
            score = r2_score(region_vals, name_vals)
        region_scores.append((name, score))
    sys.stdout.write(' done!\n')

Calculating  Europe  : Aaban ... Analysia ... Barnard ... Cesear ... Daralyn ... Dvir ... Gabryelle ... Isbel ... Jeraldo ... Kaston ... Kynsleigh ... Luceli ... Meera ... Nicholaos ... Raimee ... Saraelizabeth ... Solangel ... Tiffiani ... Willabell ... done!
Calculating  Asia  : Aaban ... Analysia ... Barnard ... Cesear ... Daralyn ... Dvir ... Gabryelle ... Isbel ... Jeraldo ... Kaston ... Kynsleigh ... Luceli ... Meera ... Nicholaos ... Raimee ... Saraelizabeth ... Solangel ... Tiffiani ... Willabell ... done!
Calculating  Africa  : Aaban ... Analysia ... Barnard ... Cesear ... Daralyn ... Dvir ... Gabryelle ... Isbel ... Jeraldo ... Kaston ... Kynsleigh ... Luceli ... Meera ... Nicholaos ... Raimee ... Saraelizabeth ... Solangel ... Tiffiani ... Willabell ... done!
Calculating  Oceania  : Aaban ... Analysia ... Barnard ... Cesear ... Daralyn ... Dvir ... Gabryelle ... Isbel ... Jeraldo ... Kaston ... Kynsleigh ... Luceli ... Meera ... Nicholaos ... Raimee ... Saraelizabeth ... Sol

In [36]:
# Rank each region's names from best to worst score, round the scores to two
# decimals, store the ranked list back into region_dict, and show the top 100.
for region in region_dict:
    # negated key = descending order; ties keep their existing relative order
    ranked = sorted(region_dict[region], key=lambda pair: -pair[1])
    name_scores = [(pair[0], round(pair[1], 2)) for pair in ranked]
    region_dict[region] = name_scores
    # one pre-formatted argument, so the line prints the same whether print is a
    # statement or a function
    print('%s :  %s' % (region, name_scores[:100]))

Europe :  [('Roxie', 0.98), ('Leeta', 0.93), ('Lona', 0.93), ('Susie', 0.93), ('Eugenie', 0.93), ('Parthenia', 0.92), ('Sylvania', 0.92), ('Dagmar', 0.92), ('Olinda', 0.91), ('Lesta', 0.89), ('Leita', 0.89), ('Artie', 0.89), ('Monnie', 0.88), ('Lennie', 0.88), ('Leta', 0.88), ('Nona', 0.88), ('Lauretta', 0.87), ('Louvenia', 0.87), ('Leatha', 0.87), ('Retta', 0.86), ('Roe', 0.85), ('Manuelita', 0.85), ('Leda', 0.85), ('Ardelia', 0.85), ('Netta', 0.85), ('Tempie', 0.84), ('Nonie', 0.84), ('Rosena', 0.84), ('Rollie', 0.83), ('Jannie', 0.83), ('Lethia', 0.83), ('Vinnie', 0.83), ('Annice', 0.83), ('Clemmie', 0.83), ('Della', 0.83), ('Fleming', 0.83), ('Belvia', 0.83), ('Rose', 0.82), ('Eppie', 0.82), ('Estella', 0.82), ('Margaretta', 0.82), ('Melvina', 0.82), ('Veta', 0.81), ('Pinkey', 0.81), ('Woodie', 0.81), ('Catharine', 0.81), ('Barney', 0.81), ('Martha', 0.81), ('Lawyer', 0.8), ('Neta', 0.8), ('Anice', 0.8), ('Helene', 0.8), ('Hamp', 0.79), ('Mada', 0.79), ('Adolphus', 0.79), ('Freda',

In [64]:
# Names of the (up to) 100 first entries stored for Europe in region_dict.
scores_Europe = region_dict['Europe'][:100]
# Iterate over the slice itself rather than indexing 0..99, so a region with
# fewer than 100 entries does not raise IndexError.
names_Europe = [entry[0] for entry in scores_Europe]
print(names_Europe)

['Roxie', 'Leeta', 'Lona', 'Susie', 'Eugenie', 'Parthenia', 'Sylvania', 'Dagmar', 'Olinda', 'Lesta', 'Leita', 'Artie', 'Monnie', 'Lennie', 'Leta', 'Nona', 'Lauretta', 'Louvenia', 'Leatha', 'Retta', 'Roe', 'Manuelita', 'Leda', 'Ardelia', 'Netta', 'Tempie', 'Nonie', 'Rosena', 'Rollie', 'Jannie', 'Lethia', 'Vinnie', 'Annice', 'Clemmie', 'Della', 'Fleming', 'Belvia', 'Rose', 'Eppie', 'Estella', 'Margaretta', 'Melvina', 'Veta', 'Pinkey', 'Woodie', 'Catharine', 'Barney', 'Martha', 'Lawyer', 'Neta', 'Anice', 'Helene', 'Hamp', 'Mada', 'Adolphus', 'Freda', 'Mose', 'Orelia', 'Irma', 'Aleda', 'Collie', 'Lugenia', 'Zollie', 'Ike', 'Dora', 'Josepha', 'Letha', 'Thyra', 'Luvenia', 'Mary', 'Percy', 'Sallye', 'Margart', 'Reta', 'Trudie', 'Willie', 'Berta', 'Cornelia', 'Electa', 'Marquerite', 'Robley', 'Burney', 'Ebbie', 'Hosie', 'Enola', 'Zona', 'Claude', 'Nonnie', 'Rella', 'Jule', 'Pearlie', 'Rosetta', 'Theodosia', 'Norine', 'Aggie', 'Pinkie', 'Leonia', 'Floretta', 'Rowena', 'Freida']


In [67]:
# Names of the (up to) 100 first entries stored for Oceania in region_dict.
scores_Oceania = region_dict['Oceania'][:100]
# Iterate over the slice itself rather than indexing 0..99, so a region with
# fewer than 100 entries does not raise IndexError.
names_Oceania = [entry[0] for entry in scores_Oceania]
print(names_Oceania)

['Sloan', 'Weston', 'Solange', 'Boone', 'Kiyomi', 'Arisha', 'Thorin', 'Uriah', 'Farida', 'Raylon', 'Levi', 'Jabir', 'Roselynn', 'Arian', 'Capri', 'Grey', 'Remy', 'Kiam', 'Linley', 'Jema', 'Eyad', 'Miri', 'Zeke', 'Graeme', 'Lowen', 'Lesieli', 'Yekusiel', 'Aubrey', 'Charli', 'Emi', 'Gedalia', 'Sura', 'Titus', 'Saam', 'Adrielle', 'Dariel', 'Zac', 'Efrayim', 'Adreena', 'Othniel', 'Alysson', 'Hendrik', 'Fallyn', 'Aston', 'Mysha', 'Beauregard', 'Rozlyn', 'Anabell', 'Remi', 'Colt', 'Caralynn', 'Callin', 'Abdulaziz', 'Verity', 'Akemi', 'Daiana', 'Yerachmiel', 'Raelle', 'Kato', 'Alistair', 'Saud', 'Leia', 'Nikolai', 'Zehava', 'Romina', 'Davi', 'Fitzpatrick', 'Kamilla', 'Analee', 'Adeena', 'Antonella', 'Ariella', 'Mera', 'Jenson', 'Crimson', 'Nael', 'Beaumont', 'Zinnia', 'Hosanna', 'Taleen', 'Narayan', 'Ameila', 'Afaf', 'Manoah', 'Jeremias', 'Menashe', 'Radley', 'Muhammad', 'Darina', 'Salahuddin', 'Banning', 'Aira', 'Dahlia', 'Shavy', 'Beaux', 'Elif', 'Langston', 'Quinn', 'Valyn', 'Leahmarie']


In [69]:
# Names of the (up to) 100 first entries stored for Africa in region_dict.
scores_Africa = region_dict['Africa'][:100]
# Iterate over the slice itself rather than indexing 0..99, so a region with
# fewer than 100 entries does not raise IndexError.
names_Africa = [entry[0] for entry in scores_Africa]
print(names_Africa)

['Charlee', 'Raelyn', 'Emberly', 'Kathalina', 'Jader', 'Raelynne', 'Sloane', 'Suria', 'Brielle', 'Leyland', 'Wren', 'Paxton', 'Zuri', 'Sutton', 'Colbie', 'Kornelius', 'Rajon', 'Kace', 'Ghazal', 'Bow', 'Persephone', 'Ember', 'Grayson', 'Milanie', 'Eira', 'Freya', 'Khalifa', 'Case', 'Bay', 'Auri', 'Maci', 'Kenley', 'Benaiah', 'Flynn', 'Kase', 'Maverick', 'Jayce', 'Briggs', 'Navy', 'Rosali', 'Renad', 'Samil', 'Lennon', 'Gunnison', 'Niome', 'Aisleen', 'Seydi', 'Liam', 'Lamere', 'Ammanuel', 'Kiralyn', 'Raelynn', 'Cairo', 'Gemma', 'Henrik', 'Annalynn', 'Ariadne', 'Auset', 'Legacy', 'Krystell', 'Dimari', 'Kyuss', 'Lynlee', 'Keyston', 'Rechy', 'Axl', 'Vidur', 'Zecharia', 'Seydina', 'Naamah', 'Eloni', 'Zanovia', 'Aubri', 'Aubrei', 'Annaleigh', 'Rynn', 'Annaleah', 'Jordynn', 'Chapel', 'Raea', 'Adalina', 'Alister', 'Pristine', 'Nihaal', 'Fatiha', 'Lian', 'Kyrus', 'Cye', 'Jannatul', 'Aalaiyah', 'Tameem', 'Daxx', 'Osiah', 'Adilen', 'Lui', 'Naveena', 'Juliete', 'Azalea', 'Valentina', 'Demoni']


In [70]:
# Names of the (up to) 100 first entries stored for Asia in region_dict.
scores_Asia = region_dict['Asia'][:100]
# Iterate over the slice itself rather than indexing 0..99, so a region with
# fewer than 100 entries does not raise IndexError.
names_Asia = [entry[0] for entry in scores_Asia]
print(names_Asia)

['Sura', 'Ariella', 'Yerachmiel', 'Elif', 'Adrielle', 'Jenson', 'Raelle', 'Saud', 'Adreena', 'Kiam', 'Zehava', 'Suleiman', 'Aston', 'Remington', 'Charli', 'Jema', 'Viren', 'Beaux', 'Manoah', 'Kallen', 'Nikolai', 'Shavy', 'Remi', 'Othniel', 'England', 'Kilani', 'Raylon', 'Farida', 'Honour', 'Lesieli', 'Weston', 'Lian', 'Audriana', 'Kenzi', 'Alistair', 'Aira', 'Nishika', 'Arisha', 'Yekusiel', 'Dariel', 'Grey', 'Westin', 'Nash', 'Abdulaziz', 'Kember', 'Raylyn', 'Cartier', 'Baylen', 'Benaiah', 'Nosson', 'Valyn', 'Emmeline', 'Alysson', 'Saam', 'Callan', 'Daiana', 'Remy', 'Bowen', 'Karam', 'Kayler', 'Crimson', 'Ivanna', 'Efrayim', 'Raylynn', 'Narayan', 'Kato', 'Jeancarlos', 'Keaston', 'Callin', 'Atom', 'Hesston', 'Arian', 'Roniel', 'Cali', 'Kolt', 'Kora', 'Alasdair', 'Jovi', 'Chatham', 'Sloan', 'Westen', 'Darina', 'Kamilla', 'Rosali', 'Yohannes', 'Alister', 'Boone', 'Ariadne', 'Liem', 'Graeme', 'Sapphira', 'Zuri', 'Mysha', 'Batsheva', 'Aubri', 'Jubilee', 'Kenzie', 'Leevi', 'Setareh', 'Quinne

In [71]:
# Names of the (up to) 100 first entries stored for Americas in region_dict.
scores_Americas = region_dict['Americas'][:100]
# Iterate over the slice itself rather than indexing 0..99, so a region with
# fewer than 100 entries does not raise IndexError.
names_Americas = [entry[0] for entry in scores_Americas]
print(names_Americas)

['Grey', 'Nikolai', 'Sura', 'Remi', 'Jenson', 'Charli', 'Elif', 'Zehava', 'Yerachmiel', 'Ariella', 'Beaux', 'Nash', 'Kiam', 'Othniel', 'Suleiman', 'Aubrey', 'Dariel', 'Raelle', 'Alistair', 'Manoah', 'Aston', 'Shavy', 'Kallen', 'Saud', 'Daiana', 'Aira', 'Lian', 'Adreena', 'Adrielle', 'Raylyn', 'Farida', 'Viren', 'Remington', 'Darina', 'Emmeline', 'Kamilla', 'Arisha', 'Kenzi', 'Bowen', 'Alysson', 'Raylynn', 'Dahlia', 'Westin', 'Atom', 'Nishika', 'Cali', 'Sapphira', 'Kato', 'Crimson', 'Jema', 'England', 'Karam', 'Nosson', 'Roniel', 'Benaiah', 'Cartier', 'Matias', 'Kora', 'Jovi', 'Gideon', 'Boone', 'Weston', 'Dezi', 'Eli', 'Avni', 'Valyn', 'Remy', 'Ivanna', 'Honour', 'Bennett', 'Ezra', 'Leighton', 'Jubilee', 'Roselynn', 'Yekusiel', 'Callin', 'Ariadne', 'Rhys', 'Raylon', 'Aubri', 'Kenzie', 'Keaston', 'Leevi', 'Lesieli', 'Westen', 'Yousif', 'Samier', 'Alister', 'Emilyn', 'Sloan', 'Adriel', 'Narayan', 'Chevy', 'Liem', 'Alasdair', 'Lev', 'Setareh', 'Hadassah', 'Kilani', 'Rosali']


In [65]:
# Load the per-state name counts; the file's first column (position 0) is not read.
state_names = pd.read_csv(
    'Datasets/StateNames.csv',
    delimiter=',',
    usecols=[1, 2, 3, 4, 5],
)

In [66]:
# Preview the first rows to check which columns were loaded.
state_names.head()

Unnamed: 0,Name,Year,Gender,State,Count
0,Mary,1910,F,AK,14
1,Annie,1910,F,AK,12
2,Anna,1910,F,AK,10
3,Margaret,1910,F,AK,8
4,Helen,1910,F,AK,7
