In [None]:
# import library
import pandas as pd

# load the baby-names CSV into a pandas DataFrame called `names`
# NOTE(review): this is an absolute, machine-specific path
csv_path = "/Users/Nicole/Documents/Names/baby-names.csv"
names = pd.read_csv(csv_path)
# preview the first five rows of the DataFrame
names.head(5)

In [None]:
# preview the final five rows of the DataFrame
names.tail(5)

In [None]:
# show the rows around the point where the girl names begin
# (positional row slice covering rows 128998 through 129002)
names.iloc[128998:129003]

In [None]:
# tally how many rows each name has, most frequent first;
# cap the display at 10 rows so the output stays short
pd.set_option('display.max_rows', 10)
name_counts = names["name"].value_counts()
name_counts

In [None]:
# number of distinct names present in the df
name_column = names["name"]
name_column.nunique()

In [None]:
# number of distinct years present in the df
year_column = names["year"]
year_column.nunique()

In [None]:
# return names that appear exactly once in the df (a count below 2)
# the counts are computed a single time and reused for both the values and the mask
name_counts = names['name'].value_counts()
name_counts[name_counts < 2]

In [None]:
# rows whose name occurs exactly one time in the whole df
# (the groupby-filter is evaluated once and reused below)
u = names.groupby("name").filter(lambda x: len(x) == 1)
print(u)

# for each year, count how many of these one-off names it contains
u['year'].value_counts()

In [None]:
# What boy names are popular for all years?

# keep only the rows for boys
b_names = names[names.sex == 'boy']
# the df covers 129 years, so a name listed in every year occurs 129 times
b_counts = b_names['name'].value_counts()
b_every_year = b_counts.index[b_counts == 129]
b_names129 = b_names[b_names['name'].isin(b_every_year)]
# report how many such boy names exist, then list them alphabetically
b_name_col = b_names129['name']
print(b_name_col.nunique())
print(b_name_col.value_counts().sort_index())

In [None]:
# What girl names are popular for all years?

# keep only the rows for girls
g_names = names[names.sex == 'girl']
# a name listed in every one of the 129 years occurs 129 times
g_counts = g_names['name'].value_counts()
g_every_year = g_counts.index[g_counts == 129]
g_names129 = g_names[g_names['name'].isin(g_every_year)]
# report how many such girl names exist, then list them alphabetically
g_name_col = g_names129['name']
print(g_name_col.nunique())
print(g_name_col.value_counts().sort_index())

In [None]:
# What names are popular for both boys and girls?

# split names df into df with boy names and df with girl names
b_names = names[names.sex == 'boy']
g_names = names[names.sex == 'girl']

# list unique boy names in order of first appearance
# (.unique() does not depend on hard-coded row labels and avoids a
# Python-level loop with repeated list membership tests)
b_list = b_names['name'].unique().tolist()

# list unique girl names in order of first appearance
g_list = g_names['name'].unique().tolist()

# find intersection of two lists - boy names and girl names
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    Parameters:
        lst1: sequence whose items are tested; their order and any
            duplicates are kept in the result.
        lst2: sequence the items are looked up in.

    Returns:
        A ``(count, items)`` tuple where ``items`` is the list of matching
        values from ``lst1`` and ``count`` is its length.
    """
    try:
        # a set gives constant-time membership instead of scanning lst2 per item
        lookup = set(lst2)
        lst3 = [value for value in lst1 if value in lookup]
    except TypeError:
        # unhashable items cannot go in a set; fall back to list membership
        lst3 = [value for value in lst1 if value in lst2]
    return len(lst3), lst3

# show how many names occur for both sexes, followed by the names themselves
shared_names = intersection(b_list, g_list)
print(shared_names)

In [None]:
# What names are popular for all years for both boys and girls?
# The earlier value counts show Jessie as the only name with 258 appearances.
# 258 = 129 * 2, so Jessie must be listed for both boys and girls in every year.

# separate the df by sex
b_names = names[names.sex == 'boy']
g_names = names[names.sex == 'girl']
# within each sex, keep the rows of names that occur 129 times
b_counts = b_names['name'].value_counts()
g_counts = g_names['name'].value_counts()
b_names129 = b_names[b_names['name'].isin(b_counts.index[b_counts == 129])]
g_names129 = g_names[g_names['name'].isin(g_counts.index[g_counts == 129])]

# find intersection of two lists - boy names for all years and girl names for all years
def intersection(lst1, lst2):
    """Return the distinct items of ``lst1`` that also occur in ``lst2``.

    Parameters:
        lst1: sequence whose items are tested (duplicates collapse).
        lst2: sequence the items are looked up in.

    Returns:
        A ``(count, items)`` tuple where ``items`` is a set of the matching
        values taken from ``lst1`` and ``count`` is the size of that set.
    """
    try:
        # a set gives constant-time membership instead of scanning lst2 per item
        lookup = set(lst2)
    except TypeError:
        # unhashable items in lst2: keep plain list membership
        lookup = lst2
    common = {value for value in lst1 if value in lookup}
    return len(common), common

# compare the all-years boy names against the all-years girl names
b_all_years = b_names129['name'].tolist()
g_all_years = g_names129['name'].tolist()
print(intersection(b_all_years, g_all_years))

In [None]:
# For each year, what percent of boys born had a name in the top 1000 names?

b_names = names[names.sex == 'boy']
for y in range(1880, 2009):
    # add up the percent column over this year's boy rows
    is_year = b_names.year == y
    y_sum = b_names.loc[is_year, 'percent'].sum()
    print(y, y_sum)

In [None]:
# For each year, what percent of girls born had a name in the top 1000 names?

g_names = names[names.sex == 'girl']
for y in range(1880, 2009):
    # add up the percent column over this year's girl rows
    is_year = g_names.year == y
    y_sum = g_names.loc[is_year, 'percent'].sum()
    print(y, y_sum)

In [None]:
# For each year, how many names made up the top 50% for boys?
# NOTE(review): assumes each year's rows are ordered from most to least
# popular, so the leading rows are the "top" names - confirm against the data

b_names = names[names.sex == 'boy']
for y in range(1880, 2009):
    y_df = b_names[b_names.year == y]
    # running total of percent over the year's rows, computed in one pass
    # instead of re-summing a growing prefix on every iteration
    running = y_df['percent'].cumsum()
    reached = (running >= 0.50).to_numpy()
    # years whose running total never reaches 50% (or with no rows) print nothing
    if reached.any():
        # argmax returns the position of the first True; +1 turns it into a count
        x = int(reached.argmax()) + 1
        y_sum = running.iloc[x - 1]
        print(y, x, y_sum)

In [None]:
# For each year, how many names made up the top 50% for girls?
# NOTE(review): assumes each year's rows are ordered from most to least
# popular, so the leading rows are the "top" names - confirm against the data

g_names = names[names.sex == 'girl']
for y in range(1880, 2009):
    y_df = g_names[g_names.year == y]
    # running total of percent over the year's rows, computed in one pass
    # instead of re-summing a growing prefix on every iteration
    running = y_df['percent'].cumsum()
    reached = (running >= 0.50).to_numpy()
    # years whose running total never reaches 50% (or with no rows) print nothing
    if reached.any():
        # argmax returns the position of the first True; +1 turns it into a count
        x = int(reached.argmax()) + 1
        y_sum = running.iloc[x - 1]
        print(y, x, y_sum)

In [None]:
# Except for 1880, what percent of boy names in each year were also popular in the previous year?

b_names = names[names.sex == 'boy']
for y in range(1881, 2009):
    y_next = b_names[b_names.year == y]
    y_last = b_names[b_names.year == y-1]
    # build the previous year's names once, as a set, rather than
    # re-creating and scanning a list for every name of the current year
    last_names = set(y_last['name'].tolist())
    overlap = [value for value in y_next['name'].tolist() if value in last_names]
    # NOTE(review): divides by 1000, i.e. assumes 1000 names per year - confirm
    print(y, len(overlap)/1000)

In [None]:
# Except for 1880, what percent of girl names in each year were also popular in the previous year?

g_names = names[names.sex == 'girl']
for y in range(1881, 2009):
    y_next = g_names[g_names.year == y]
    y_last = g_names[g_names.year == y-1]
    # build the previous year's names once, as a set, rather than
    # re-creating and scanning a list for every name of the current year
    last_names = set(y_last['name'].tolist())
    overlap = [value for value in y_next['name'].tolist() if value in last_names]
    # NOTE(review): divides by 1000, i.e. assumes 1000 names per year - confirm
    print(y, len(overlap)/1000)

In [None]:
# Plot popularity of the name Jessie by year for boys

import matplotlib.pyplot as plt
import seaborn as sns

b_names = names[names.sex == 'boy']
# reset_index() returns a new frame whose 'index' column holds the original
# row labels; assigning into that new frame is not a write to a filtered
# slice, so the chained-assignment warning no longer needs to be switched off
b_Jessie = b_names[b_names.name == 'Jessie'].reset_index()
# NOTE(review): row label modulo 1000 is presumably the name's zero-based
# position within its year - confirm against the data layout
b_Jessie['index'] = b_Jessie['index'] % 1000

plot = sns.jointplot(data=b_Jessie, x='year', y='index', color='b')
plt.show()

In [None]:
# Plot popularity of the name Jessie by year for girls

import matplotlib.pyplot as plt
import seaborn as sns

g_names = names[names.sex == 'girl']
# reset_index() returns a new frame whose 'index' column holds the original
# row labels; assigning into that new frame is not a write to a filtered
# slice, so the chained-assignment warning no longer needs to be switched off
g_Jessie = g_names[g_names.name == 'Jessie'].reset_index()
# NOTE(review): row label modulo 1000 is presumably the name's zero-based
# position within its year - confirm against the data layout
g_Jessie['index'] = g_Jessie['index'] % 1000

plot = sns.jointplot(data=g_Jessie, x='year', y='index', color='b')
plt.show()

In [367]:
# Build a table of (year, name, rank) for girls covering 1880 and 1881

# reset_index() returns a new frame whose 'index' column holds the original row labels
g_names = names[names.sex == 'girl'].reset_index()
# NOTE(review): treats (row label modulo 1000) + 1 as the name's rank within
# its year - confirm against the data layout
g_names['index'] = (g_names['index'] % 1000) + 1

list_gnames = g_names.name.unique()
# collect plain tuples and build the frame once at the end; element-wise
# writes such as g_ranks[0][i] = y are chained assignments, which do not
# update the frame when pandas Copy-on-Write is active
rank_rows = []
for y in range(1880, 1882):
    g_year = g_names[g_names.year == y]
    # first rank listed for each name in this year
    rank_by_name = {}
    for n, rank in zip(g_year['name'].tolist(), g_year['index'].tolist()):
        rank_by_name.setdefault(n, rank)
    for n in list_gnames:
        # names absent from this year get the placeholder rank 1001
        rank_rows.append((y, n, rank_by_name.get(n, 1001)))

# columns keep the integer labels 0 (year), 1 (name), 2 (rank)
g_ranks = pd.DataFrame(rank_rows, columns=range(3))

         0          1     2
0     1880       Mary     1
1     1880       Anna     2
2     1880       Emma     3
3     1880  Elizabeth     4
4     1880     Minnie     5
...    ...        ...   ...
8031  1881     Laylah  1001
8032  1881   Carleigh  1001
8033  1881     Kenley  1001
8034  1881     Sloane  1001
8035  1881    Elianna  1001

[8036 rows x 3 columns]


In [402]:
# Which name changed the most in popularity between two years
g_ranks_y1 = g_ranks[g_ranks[0] == 1880]
g_ranks_y2 = g_ranks[g_ranks[0] == 1881]


def _first_rank_by_name(year_ranks):
    """Map each name (column 1) to the first rank (column 2) listed for it."""
    ranks = {}
    for n, rank in zip(year_ranks[1].tolist(), year_ranks[2].tolist()):
        ranks.setdefault(n, rank)
    return ranks


# one dictionary lookup per name instead of filtering both frames for every name
ranks_y1 = _first_rank_by_name(g_ranks_y1)
ranks_y2 = _first_rank_by_name(g_ranks_y2)

y1_y2 = []
for n in list_gnames:
    # rank in the first year minus rank in the second year
    r_change = ranks_y1[n] - ranks_y2[n]
    # keep only names whose rank moved by at least 450 places
    if abs(r_change) >= 450:
        y1_y2.append([n, r_change])
print(y1_y2)

[['Peggy', -514], ['Celestine', -478], ['Edmonia', -479], ['Gena', -473], ['Wilda', -455], ['Isa', 464], ['Adell', 496], ['Celeste', 515], ['Lonie', 476], ['Zadie', 462]]
