In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
from IPython.display import display

In [30]:
# get our state names again
state_names = pd.read_csv('Datasets/StateNames.csv', delimiter = ',', usecols = [1, 2, 3, 4, 5])
# slow, but less annoying than typing in manually...
state_list = np.unique(state_names['State'].values)

In [6]:
# get our most representative names
region_names = pd.read_csv('Datasets/repres_names/repres_names.csv', delimiter = ',')
display(region_names.head())

Unnamed: 0,Africa,Americas,Asia,Europe,Oceania
0,Charlee,Grey,Sura,Roxie,Sloan
1,Raelyn,Nikolai,Ariella,Leeta,Weston
2,Emberly,Sura,Yerachmiel,Lona,Solange
3,Kathalina,Remi,Elif,Susie,Boone
4,Jader,Jenson,Adrielle,Eugenie,Kiyomi


In [50]:
# requires a state_list and region_names global
def cross_best_names(region, year, byDecade=False):
    # build a dictionary of counts by state and then name
    # count by decade as an option
    years = range(year, year + 11 if byDecade else year + 1)
    state_names_by_year = state_names[state_names['Year'].isin(years)]
    names_dict = {}
    names = state_names_by_year.iloc[:,0].values
    counts = state_names_by_year.iloc[:,-1].values
    states = state_names_by_year.iloc[:,-2].values
    for i in range(len(state_names_by_year.index)):
        if states[i] not in names_dict:
            names_dict[states[i]] = {}
        if names[i] not in names_dict[states[i]]:
            names_dict[states[i]][names[i]] = counts[i]
        else:
            names_dict[states[i]][names[i]] += counts[i]
    # build our matrix of names by states
    df = pd.DataFrame(index=state_list)
    for n in region_names[region]:
        state_name_counts = []
        for s in state_list:
            if s in names_dict and n in names_dict[s]:
                state_name_counts.append(names_dict[s][n])
            else:
                state_name_counts.append(0)    
        df[n] = pd.Series(state_name_counts, index=state_list)
    return df

In [52]:
# examples
display(cross_best_names('Europe', 1940, byDecade=True).head())
display(cross_best_names('Europe', 1940).head())
display(cross_best_names('Asia', 1990).head())
display(cross_best_names('Europe', 1890).head())

Unnamed: 0,Roxie,Leeta,Lona,Susie,Eugenie,Parthenia,Sylvania,Dagmar,Olinda,Lesta,...,Pearlie,Rosetta,Theodosia,Norine,Aggie,Pinkie,Leonia,Floretta,Rowena,Freida
AK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL,93,0,0,727,0,0,0,0,0,0,...,430,382,0,0,0,83,0,11,0,115
AR,29,0,0,255,0,0,0,0,0,0,...,306,210,0,0,0,0,0,0,6,50
AZ,0,0,0,175,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CA,101,0,100,452,6,0,0,6,0,0,...,0,95,0,44,0,0,0,5,64,48


Unnamed: 0,Roxie,Leeta,Lona,Susie,Eugenie,Parthenia,Sylvania,Dagmar,Olinda,Lesta,...,Pearlie,Rosetta,Theodosia,Norine,Aggie,Pinkie,Leonia,Floretta,Rowena,Freida
AK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL,8,0,0,73,0,0,0,0,0,0,...,36,23,0,0,0,12,0,0,0,0
AR,9,0,0,21,0,0,0,0,0,0,...,34,22,0,0,0,0,0,0,0,7
AZ,0,0,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CA,5,0,14,15,0,0,0,0,0,0,...,0,12,0,5,0,0,0,0,8,5


Unnamed: 0,Sura,Ariella,Yerachmiel,Elif,Adrielle,Jenson,Raelle,Saud,Adreena,Kiam,...,Sapphira,Zuri,Mysha,Batsheva,Aubri,Jubilee,Kenzie,Leevi,Setareh,Quinne
AK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CA,0,25,0,0,0,0,0,0,0,0,...,0,7,0,0,0,0,7,0,0,0


Unnamed: 0,Roxie,Leeta,Lona,Susie,Eugenie,Parthenia,Sylvania,Dagmar,Olinda,Lesta,...,Pearlie,Rosetta,Theodosia,Norine,Aggie,Pinkie,Leonia,Floretta,Rowena,Freida
AK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
