## Setup

In [1]:
from lets_plot import *
import numpy as np
import pandas as pd

LetsPlot.setup_html(isolated_frame=True)

In [2]:
df = pd.read_csv("https://github.com/byuidatascience/data4names/raw/master/data-raw/names_year/names_year.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393384 entries, 0 to 393383
Data columns (total 54 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   name    393384 non-null  object 
 1   year    393384 non-null  int64  
 2   AK      393384 non-null  float64
 3   AL      393384 non-null  float64
 4   AR      393384 non-null  float64
 5   AZ      393384 non-null  float64
 6   CA      393384 non-null  float64
 7   CO      393384 non-null  float64
 8   CT      393384 non-null  float64
 9   DC      393384 non-null  float64
 10  DE      393384 non-null  float64
 11  FL      393384 non-null  float64
 12  GA      393384 non-null  float64
 13  HI      393384 non-null  float64
 14  IA      393384 non-null  float64
 15  ID      393384 non-null  float64
 16  IL      393384 non-null  float64
 17  IN      393384 non-null  float64
 18  KS      393384 non-null  float64
 19  KY      393384 non-null  float64
 20  LA      393384 non-null  float64
 21  MA      39

## Q1

In [68]:
tyler = df[df['name'] == 'Tyler']

In [69]:
tyler

Unnamed: 0,name,year,AK,AL,AR,AZ,CA,CO,CT,DC,...,TX,UT,VA,VT,WA,WI,WV,WY,Total,name_lower
371389,Tyler,1916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,tyler
371390,Tyler,1925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,tyler
371391,Tyler,1939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,tyler
371392,Tyler,1940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,tyler
371393,Tyler,1944,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,tyler
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371460,Tyler,2011,26.0,65.5,89.0,193.0,429.0,143.0,129.0,16.0,...,274.0,126.0,250.0,15.0,79.0,152.0,88.0,16.0,6339.0,tyler
371461,Tyler,2012,34.0,126.0,70.0,125.0,384.0,60.5,131.0,20.0,...,253.0,100.0,201.0,10.0,77.0,117.0,52.0,14.0,5526.0,tyler
371462,Tyler,2013,20.0,115.0,63.0,124.0,313.5,94.0,90.0,26.0,...,214.5,75.0,195.0,9.0,68.0,97.0,44.0,12.0,4866.5,tyler
371463,Tyler,2014,15.0,51.5,57.0,82.0,295.5,98.0,82.0,12.0,...,224.0,80.0,159.0,6.0,105.0,76.0,35.0,0.0,4187.0,tyler


In [None]:
def plot_name_trends(df: pd.DataFrame, target_names: list[str], ref_mark: any = None, years: tuple[int] = (1910, 2015)):
    """
    Plots historical trends for one or more baby names.

    Parameters:
        df (pd.DataFrame): DataFrame containing columns: year, name, states..., total
        target_names (list[str]): List of names to visualize
        ref_mark (any): Reference mark to add to the plot (optional)
        years (tuple[int]): The minimum and maximum years to which to limit the X axis
    """
    
    if isinstance(target_names, str):
        target_names = [target_names]
    df['name_lower'] = df['name'].str.lower()
    name_map = {name.lower(): name for name in target_names}
    filtered = df[df['name_lower'].isin(name_map.keys())].copy()
    filtered['name_display'] = filtered['name_lower'].map(name_map)

    if filtered.empty:
        print("No data found for the provided names.")
        return None

    # Group by year and display name
    grouped = filtered.groupby(['year', 'name_display'])['Total'].sum().reset_index()

    # Create the plot
    p = (ggplot(grouped, aes(x='year', y='Total', color='name_display')) +
         geom_line(size=1.2) +
         ggtitle(f"Historical Popularity of Names: {', '.join(target_names)}") +
         xlab("Year") +
         ylab("Number of Babies Named") +
         scale_x_continuous(limits=years, format='d') +
         theme_minimal())
    
    if ref_mark:
        p += ref_mark

    p.show()

In [None]:
rm_t = (geom_vline(xintercept=2004, color='gray', linetype='dashed') +
         geom_text(x=2004,
                   y=0,
                   label='The year I was born',
                   angle=90,
                   hjust='left',
                   vjust='bottom',
                   nudge_x=-1,
                   color='#555555',
                   size=8))
plot_name_trends(df, 'Tyler', ref_mark=rm_t)

## Q2

In [71]:
brittany = df[df['name'] == 'Brittany']

In [None]:
brittany

In [116]:
import numpy as np

# Extract year and total columns
years = brittany['year']
counts = brittany['Total']

# Weighted stats
weighted_mean = np.average(years, weights=counts)
weighted_std = np.sqrt(np.average((years - weighted_mean)**2, weights=counts))

# Cumulative distribution to find middle 50%
brittany_sorted = brittany.sort_values('year')
brittany_sorted['cum_total'] = brittany_sorted['Total'].cumsum()
total_births = brittany_sorted['Total'].sum()

lower_bound = brittany_sorted[brittany_sorted['cum_total'] >= total_births * 0.25]['year'].iloc[0]
upper_bound = brittany_sorted[brittany_sorted['cum_total'] >= total_births * 0.75]['year'].iloc[0]

low_five = brittany_sorted[brittany_sorted['cum_total'] <= total_births * 0.05]['year'].iloc[0]
high_five = brittany_sorted[brittany_sorted['cum_total'] >= total_births * 0.95]['year'].iloc[0]

# Optional: peak year
peak_year = brittany_sorted.loc[brittany_sorted['Total'].idxmax(), 'year']


In [117]:
print(f'''
The name Brittany peaked in popularity around {peak_year}, with a distribution resembling a classic bell curve centered on the late '80s and early '90s.
Statistically, the weighted average birth year for Brittany is {weighted_mean:.1f}, with a standard deviation of about {weighted_std:.1f} years, suggesting that most of the name’s usage is tightly clustered.
About 50% of all babies named Brittany were born between {lower_bound} and {upper_bound}, highlighting how the name dominated a relatively narrow cultural window before falling sharply afterward.
''')
print(f'< {low_five}; > {high_five}')


The name Brittany peaked in popularity around 1990, with a distribution resembling a classic bell curve centered on the late '80s and early '90s.
Statistically, the weighted average birth year for Brittany is 1991.4, with a standard deviation of about 5.6 years, suggesting that most of the name’s usage is tightly clustered.
About 50% of all babies named Brittany were born between 1988 and 1994, highlighting how the name dominated a relatively narrow cultural window before falling sharply afterward.

< 1968; > 2000


In [110]:
rm_b = geom_rect(
    xmin=lower_bound, xmax=upper_bound,
    ymin=0, ymax=brittany['Total'].max(),
    fill='rgba(128, 128, 128, 0.5)',
    color='rgba(128, 128, 128, 0.25)')
plot_name_trends(df, 'Brittany', ref_mark=rm_b, years=(1970, 2015))

## Q3

In [119]:
plot_name_trends(df, ['Mary', 'Martha', 'Paul', 'Peter'], years=(1920, 2000))

## Q4

In [118]:
ariel = df[df['name'] == 'Ariel']
ariel

Unnamed: 0,name,year,AK,AL,AR,AZ,CA,CO,CT,DC,...,TX,UT,VA,VT,WA,WI,WV,WY,Total,name_lower
28934,Ariel,1918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,ariel
28935,Ariel,1956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,ariel
28936,Ariel,1957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,ariel
28937,Ariel,1959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,ariel
28938,Ariel,1960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,ariel
28939,Ariel,1961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,ariel
28940,Ariel,1962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,ariel
28941,Ariel,1963,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,ariel
28942,Ariel,1964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,ariel
28943,Ariel,1965,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,ariel


In [56]:
rm_a = (geom_vline(xintercept=1989, color='gray', linetype='dashed') +
         geom_text(x=1989,
                   y=4000,
                   label='The Little Mermaid Released',
                   angle=90,
                   hjust='right',
                   vjust='bottom',
                   nudge_x=-1,
                   color='#555555',
                   size=7))
plot_name_trends(df, ['Ariel'], ref_mark=rm_a)

## Stretch

In [64]:
def plot_elliot_style(df):
    # Filter and group
    data = df[df['name'] == 'Elliot']

    grouped = data.groupby('year')['Total'].sum().reset_index()
    grouped['name'] = 'Elliot'

    # Create base plot
    p = (
        ggplot(grouped, aes(x='year', y='Total', color='name')) +
        geom_line(size=1.5, alpha=0.8) +
        scale_color_manual(values={'Elliot': '#6e79f9'}) +
        ggtitle(f"Elliot... What?") +
        xlab("year") + ylab("Total") +
        scale_x_continuous(breaks=list(range(1950, 2021, 10)), limits=(1950,2025), format='d', expand=[0, 0]) +
        scale_y_continuous(breaks=list(range(0, int(grouped['Total'].max()) + 1, 200)),) +
        theme_light() +
        theme(
            panel_background=element_rect(fill='#e5ecf6'),
            panel_grid_major=element_line(color='white', size=0.5),
            panel_grid_minor=element_blank(),
            legend_position='right',
            plot_title=element_text(size=16, face='bold'),
            axis_title=element_text(size=12),
            axis_text=element_text(size=10)) +
        ggsize(900, 400))

    # Add vertical dashed red lines with text annotations
    event_years = [1982, 1985, 2002]
    event_labels = ['E.T Released', 'Second Release', 'Third Release']

    for x, label in zip(event_years, event_labels):
        p += geom_vline(xintercept=x, color='red', linetype='dashed', size=1.2)
        p += geom_text(
            x=x, y=max(grouped['Total']),
            label=label,
            angle=0,
            hjust='right' if label == 'E.T Released' else 'left',
            nudge_x=-0.5 if label == 'E.T Released' else 0.5,
            vjust='bottom',
            size=6, color='black')

    p.show()

In [65]:
plot_elliot_style(df)

## Other Info - Quiz

In [126]:
oliver_ut = df[df['name'] == 'Oliver'][['UT']]
oliver_ut

Unnamed: 0,UT
295866,0.0
295867,0.0
295868,0.0
295869,0.0
295870,0.0
...,...
295967,145.0
295968,144.0
295969,174.0
295970,224.0


In [127]:
oliver_ut.sum()

UT    1704.0
dtype: float64

In [128]:
felisha = df[df['name'] == 'Felisha']
felisha

Unnamed: 0,name,year,AK,AL,AR,AZ,CA,CO,CT,DC,...,TX,UT,VA,VT,WA,WI,WV,WY,Total,name_lower
135125,Felisha,1964,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,felisha
135126,Felisha,1965,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,felisha
135127,Felisha,1966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,felisha
135128,Felisha,1967,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,felisha
135129,Felisha,1968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,felisha
135130,Felisha,1969,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,felisha
135131,Felisha,1970,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,felisha
135132,Felisha,1971,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,felisha
135133,Felisha,1972,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,...,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,81.0,felisha
135134,Felisha,1973,0.0,6.0,8.0,0.0,5.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76.0,felisha


In [130]:
test1 = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
test1

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [131]:
test2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['a', 'b', 'c'])
test2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
test3 = pd.DataFrame({'col1' = [1, 2], 'col2' = [3, 4]}) # yes I know this doesn't work
test3

SyntaxError: cannot assign to literal here. Maybe you meant '==' instead of '='? (2892204452.py, line 1)