### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from __future__ import division
%matplotlib inline
from IPython.display import display
from sklearn.cluster import KMeans

### Get 100 most representative names for each region

In [None]:
# Load the immigration table and flip it so that the original columns become
# the rows (their labels are parsed as years further down the notebook).
immig = pd.read_csv('Datasets/ImmigrationByRegion.csv').transpose()
# After the transpose the first row holds the region labels: set it aside,
# drop it from the data and use it to relabel the columns.
immig_regions = immig.iloc[0]
immig = (
    immig.iloc[1:]
    .rename(columns=immig_regions)
    .iloc[::-1]  # reverse the row order
)
display(immig)

In [None]:
# Build a Year x Name table of counts: read four columns of the national
# names file, total the counts for each (name, year) pair, then move the
# names from the row index out to the columns.
names = (
    pd.read_csv('Datasets/NationalNames.csv', delimiter=',', usecols=[1, 2, 3, 4])
    .groupby(['Name', 'Year'])['Count']
    .sum()
    .unstack(level=0)
)
display(names.head())

In [None]:
# Collapse the yearly counts into decades: floor every year to the first year
# of its decade (1987 -> 1980) and sum the counts that share a decade.
# Floor division (//) is essential here. The notebook's imports enable true
# division (`from __future__ import division`, also the default on Python 3),
# under which `(y / 10) * 10` simply gives the year back as a float and no
# grouping happens at all.
names_10 = names.groupby(names.index.values // 10 * 10).sum().fillna(0)
display(names_10)

In [None]:
# get some useful tools out of the way now
# to check for empty values in immigration
import re
# Matches a run of digits optionally followed by one ",digits" group
# (e.g. "1,234"). Written as a raw string so the backslashes reach the regex
# engine untouched; "\d" in a plain string literal is an invalid escape
# sequence that newer Python versions warn about.
numeric = re.compile(r"\d+(?:,\d+)?")
# display(s[s.str.match(r"\d+(?:,\d+)?")])
# to normalize series
def normalize(series):
    """Min-max scale `series` so its smallest value maps to 0 and its largest to 1.

    A series whose values are all equal has a zero range, so every element
    comes back as NaN (0 / 0).
    """
    lowest, highest = series.min(), series.max()
    value_range = highest - lowest
    return (series - lowest) / value_range
def standardize(series):
    """Return `series` centred on its mean and divided by its standard deviation."""
    centred = series - series.mean()
    return centred / series.std()

In [None]:
region_dict = {}
# short circuit example:
# for region in immig.columns[:5]:
for region in immig.columns:
    # this takes a while, so just to track progress--
    print 'Calculating ', region, ' :',
    region_vals = immig[region].fillna('(NA)')
    # filter for only immigration years with present values
    #region_vals = region_vals[region_vals.str.match("\d+(?:,\d+)?")]
    # find the years both names and immigration have
    # handle string nature of immigration years
    common_years = names_10.index.intersection(region_vals.index.map(int))
    # filter immigration years by common years, handling int/str switch
    region_vals = region_vals[map(str, common_years)]
    # then map back to int, handling commas
    region_vals = region_vals.apply(lambda x: int(x.replace(',', '')))
    # and normalize
    region_vals = normalize(region_vals)
    region_dict[region] = []
    name_num = 0
    # short circuit example:
    # for name in names_10.columns[:10000]:
    for name in names_10.columns:
        # more progress tracking
        if name_num % 5000 == 0:
            print name, '...',
        name_vals = names_10[name]
        name_vals = name_vals[common_years]
        name_vals = normalize(name_vals)
        # handle some empty arrays because 2010 exclusion
        if name_vals.isnull().any():
            score = -10
        else:
            # first value is true--actual immigration data
            # second value is predict--name data we're trying to use as model
            score = r2_score(region_vals, name_vals)
        region_dict[region].append((name, score))
        name_num += 1
    print 'done!' 

In [None]:
for region in region_dict:
    name_scores = region_dict[region]
    name_scores = sorted(name_scores, key=lambda x: -x[1])
    name_scores = [(x[0], round(x[1], 2)) for x in name_scores]
    region_dict[region] = name_scores
    print region, ': ', name_scores[:100]

### Save names in csv files
You can skip this section: the individual per-region files it writes were later compiled into a single file, "repres_names.csv" (stored under `Datasets/repres_names/`), which is what the next section loads.

In [None]:
def save_top_names(region, top_n=100):
    """Write the first `top_n` names of `region_dict[region]` to a CSV file.

    Reads the module-level `region_dict`, whose lists are expected to have
    been ranked best-first by the ranking cell earlier in the notebook.

    Parameters
    ----------
    region : key of `region_dict`; also used in the output file name,
        'names_<region>.csv', written to the current directory.
    top_n : how many leading entries to keep. If fewer are available, all of
        them are written rather than raising IndexError.

    Returns
    -------
    The single-column ('Names') DataFrame that was written.
    """
    top_names = [name for name, _score in region_dict[region][:top_n]]
    names_frame = pd.DataFrame({'Names': top_names})
    names_frame.to_csv('names_%s.csv' % region, sep=',', columns=['Names'],
                       header=['Names'], index=False)
    return names_frame


# One file per region, replacing six copy-pasted cells that differed only in
# the region name.
for saved_region in ['Total', 'Europe', 'Oceania', 'Africa', 'Asia', 'Americas']:
    save_top_names(saved_region)

### Most representative names by state

In [None]:
# get our state names again
state_names = pd.read_csv(
    'Datasets/StateNames.csv',
    delimiter=',',
    usecols=[1, 2, 3, 4, 5],
)
# Sorted array of the distinct values in the 'State' column
# (slow, but less annoying than typing them in manually...)
state_list = np.unique(state_names['State'].values)

In [None]:
# Load the compiled table of most representative names; it is indexed by
# region label further down (region_names[region]).
region_names = pd.read_csv(
    'Datasets/repres_names/repres_names.csv',
    delimiter=',',
)
display(region_names.head())

In [None]:
# requires a state_list and region_names global
def cross_best_names(region, year, byDecade=False):
    """Build a state-by-name matrix of summed baby-name counts.

    Reads the module-level `state_names`, `state_list` and `region_names`.

    Parameters
    ----------
    region : column label of `region_names`; the values of that column are
        the names that become the columns of the result.
    year : int, the single year to count, or the first year of the decade
        when `byDecade` is true.
    byDecade : bool, sum over the ten years `year` .. `year + 9` instead of
        over `year` alone.

    Returns
    -------
    pandas.DataFrame indexed by `state_list` with one column per name, holding
    the summed count for that state and name (0 where there is no record).
    """
    # range() excludes its end point, so a decade is year .. year + 9.
    # (An end of year + 11 would also pull in the first year of the following
    # decade, counting it in two consecutive decades.)
    span = 10 if byDecade else 1
    years = list(range(year, year + span))
    state_names_by_year = state_names[state_names['Year'].isin(years)]

    # Columns are taken by position: the first is treated as the name, the
    # second-to-last as the state and the last as the count.
    names = state_names_by_year.iloc[:, 0].values
    states = state_names_by_year.iloc[:, -2].values
    counts = state_names_by_year.iloc[:, -1].values

    # totals[state][name] -> count summed over the selected years
    totals = {}
    for state, name, count in zip(states, names, counts):
        per_state = totals.setdefault(state, {})
        per_state[name] = per_state.get(name, 0) + count

    # build our matrix of names by states
    df = pd.DataFrame(index=state_list)
    no_records = {}
    for n in region_names[region]:
        df[n] = [totals.get(s, no_records).get(n, 0) for s in state_list]
    return df

### PCA Visual Analysis

In [None]:
# Single-column table read from position 2 of the header-less regions file.
# Its values are compared against 'South', 'West', 'Midwest' and 'Northeast'
# when plotting; it is joined to the PCA output by row position, so it
# presumably lists the states in the same order as state_list -- TODO confirm.
regions = pd.read_csv(
    'Datasets/regions.csv',
    delimiter=',',
    usecols=[2],
    header=None,
)

In [None]:
#Function to plot first two principal components
def pca_per_decaderegion(region, decade, ax):
    """Scatter the states on the first two principal components of their name counts.

    Reads the module-level `regions` table and calls `cross_best_names`.

    Parameters
    ----------
    region : label passed to `cross_best_names` to pick the list of names.
    decade : first year of the decade to analyse.
    ax : matplotlib Axes to draw on.

    Returns
    -------
    The same `ax`, with one scatter series per US region, axis labels, a
    title and a legend.
    """
    x = cross_best_names(region, decade, byDecade=True)
    # z-score every name column; a column with zero spread gives NaN, which is
    # replaced by 0
    df_norm = ((x - x.mean()) / x.std()).fillna(0)

    #Apply PCA to data and get the top 2 axes of maximum variation
    pca = PCA(n_components=2)
    pca.fit(df_norm.values)

    #Project to the data onto the two axes
    x_reduced = pca.transform(df_norm.values)

    region_x = pd.DataFrame(x_reduced, columns=['PC-1', 'PC-2'])
    # joined on the default 0..n-1 row index
    us_region = region_x.join(regions)
    # the joined region label is the third column
    region_labels = us_region.iloc[:, 2]

    #Visualized our reduced data
    # Explicit column labels replace the `.ix` indexer, which no longer exists
    # in pandas >= 1.0.
    for us_name, colour in (('South', 'b'), ('West', 'g'),
                            ('Midwest', 'r'), ('Northeast', 'y')):
        members = us_region[region_labels == us_name]
        ax.scatter(members['PC-1'], members['PC-2'], color=colour, label=us_name)

    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    # spaces added: the title used to render as e.g. "Europe1910data projected ..."
    ax.set_title('%s %s data projected onto the first 2 PCA components' % (region, decade))
    ax.legend()

    return ax

In [None]:
#plotting first two principal components for Europe per decade
fig, ax = plt.subplots(5, 2, figsize = (20,25))
# One panel per decade from 1910 to 2000, filled left to right, top to bottom.
for panel, decade_start in enumerate(range(1910, 2010, 10)):
    row, col = divmod(panel, 2)
    ax[row, col] = pca_per_decaderegion('Europe', decade_start, ax[row, col])

#plt.tight_layout()
plt.show()

In [None]:
# Write the Europe PCA figure (`fig` from the plotting cell) to a PNG in the
# current working directory.
fig.savefig('pca_europe.png', bbox_inches='tight')

### Names with most variance in principal components

In [None]:
#function to print names that account for most variance in each principal component
def babyname_variance_explained(region, decade):
    """Print the name with the largest absolute loading on each of the first two PCs.

    Parameters
    ----------
    region : label passed to `cross_best_names` to pick the list of names.
    decade : first year of the decade to analyse.

    Returns
    -------
    None; the result is printed.
    """
    x = cross_best_names(region, decade, byDecade=True)
    # z-score every name column; a column with zero spread gives NaN, which is
    # replaced by 0
    df_norm = ((x - x.mean()) / x.std()).fillna(0)
    pca = PCA(n_components=2)
    pca.fit(df_norm.values)
    # Only the fitted loadings are needed, so the data is not projected here
    # (the projection used to be computed and then thrown away).

    # transpose the components so that there is one row per name
    identify = pd.DataFrame(pca.components_.T, columns=['PC-1', 'PC-2'], index=x.columns)
    identify_abs = identify.abs()

    # Single-argument print: produces the same text whether `print` is the
    # Python 2 statement or the Python 3 function.
    print("Most represented names in %s %s: %s %s" % (
        region, decade, identify_abs['PC-1'].idxmax(), identify_abs['PC-2'].idxmax()))

### Most unique name by state

In [None]:
def unique_name_bystate(region, decade):
    """For each state, return the name with the highest standardised count.

    Counts come from `cross_best_names(region, decade, byDecade=True)`. Each
    name column is z-scored across the states (undefined values become 0) and
    the column holding the row maximum is reported per state.

    Returns a Series indexed like the count matrix, holding one name per row.
    """
    counts = cross_best_names(region, decade, byDecade=True)
    z_scores = (counts - counts.mean()) / counts.std()
    return z_scores.fillna(0).idxmax(axis=1)

In [None]:
# Decades covered: 1910, 1920, ..., 2010. Defined here because the DataFrame
# construction at the end of this cell needs `decades` and no cell above
# defines it.
decades = list(range(1910, 2020, 10))

# Result of unique_name_bystate for the Europe name list, one Series (indexed
# by state) per decade.
eu_names_by_decade = {}
for decade_start in decades:
    eu_names_by_decade[decade_start] = unique_name_bystate('Europe', decade_start)

# row labels, taken from the first decade's result
states_list = list(eu_names_by_decade[decades[0]].index.values)

#make into a dataframe
d = {}
for decade_start in decades:
    d[decade_start] = pd.Series(eu_names_by_decade[decade_start].values, index=states_list)

df = pd.DataFrame(d, index=states_list, columns=decades)

In [None]:
# Write the state-by-decade table of names (`df` from the previous cell) to CSV.
df.to_csv('Datasets/unique_names_bystate.csv')

### K-means Clustering

In [None]:
# Cluster the states in PCA space for a single region/decade and show the
# cluster assignment next to each state's region label.
region = 'Europe'
decade = 1920
x = cross_best_names(region, decade, byDecade=True)
# z-score every name column; undefined values (zero-spread columns) become 0
df_norm = ((x - x.mean()) / x.std()).fillna(0)
pca = PCA(n_components=2)
pca.fit(df_norm.values)
x_reduced = pca.transform(df_norm.values)
region_x = pd.DataFrame(x_reduced, columns=['PC-1', 'PC-2'])
us_region = region_x.join(regions)
# four clusters, fitted on the two principal-component columns only;
# random_state is fixed at 0
kmeans = KMeans(n_clusters=4, random_state=0).fit(us_region.iloc[:, [0, 1]])
# work on a copy so us_region itself stays untouched
classes = us_region.copy()
classes['guess'] = pd.Series(kmeans.labels_, index=us_region.index)
display(classes)