# Spencergallardo.com Users

I have collected data on users who visit my website. I want to know where they are located by city, and if city is unavailable then by the state they live in.

I Want to know:

1. What city has the most female users.
2. What city has the most male users.
3. What state has the largest number of users over the age of 30.
4. What state has the largest number of users under the age of 30.

In [6]:
import numpy as np
import pandas as pd
#set some pandas options controling output format
pd.set_option('display.notebook_repr_html',True) # output as flat text and not HTML
pd.set_option('display.max_rows', None) # this is the maximum number of rows we will display
pd.set_option('display.max_columns', None) # this is the maximum number of rows we will display

# read the csv into a dataframe, and force postal_code to be interpreted as a string.
users = pd.read_csv('data/sg_users.csv', dtype={'zip': str})

# Display the columns in the dataframe, and get a peak at how it's formatted
users.head()

Unnamed: 0,first,last,gender,email,address,city,state,zip,age
0,Mathilda,Barton,Male,hitesre@nabju.bd,822 Nagaj Lane,Ijijne,TX,46333,19
1,Teresa,Foster,Female,ekzarmes@pefalez.om,1761 Kano Way,Lidkegimi,VA,75671,36
2,Keith,Norman,Female,enoufi@mufpa.hk,1734 Nadco Center,Nadferi,KS,19335,50
3,Vera,French,Female,si@genemhu.pr,1994 Juuvu Terrace,Gefopobob,ME,62221,61
4,Nannie,Baker,Female,kipaweg@uto.et,809 Kegadu Plaza,Kemuhla,AR,79723,56


In [28]:
# get rid of rows without a city, state, zip, and age
users.dropna(subset=['city', 'state', 'zip', 'age'], how='all', inplace=True)

# create a dataframe with the number users younger than 30 by State. Dropping NaN state values.
younger_users = users.dropna(subset=['state'], how='any') \
                [(users.dropna(subset=['state'], how='any').age <= 30)] \
                .groupby('state') \
                .state.count() \
                .reset_index(name='younger_30') \
                .sort_values(['state'], ascending=True)

# create a dataframe with the number users older than 30 by State. Dropping NaN state values.
older_users = users.dropna(subset=['state'], how='any') \
                [(users.dropna(subset=['state'], how='any').age >= 30)] \
                .groupby('state') \
                .state.count() \
                .reset_index(name='older_30') \
                .sort_values(['state'], ascending=True)

# join the two series into one dataframe
younger_users = younger_users.join(older_users.older_30)

# return the state with the users under than 30
younger_users.sort_values('younger_30', ascending=False).head(1)

Unnamed: 0,state,younger_30,older_30
4,FL,3,1


In [29]:
# return the state with the most male users
younger_users.sort_values('older_30', ascending=False).head(1)

Unnamed: 0,state,younger_30,older_30
1,AZ,3,4


In [43]:
# create a dataframe with the number of female users by city
city_users = users.dropna(subset=['city'], how='any') \
                [(users.dropna(subset=['city'], how='any').gender == 'Female')] \
                .groupby(['city', 'state']) \
                .city.count() \
                .reset_index(name='female_count') \
                .sort_values(['city','state'], ascending=[True,True])

# create a dataframe with the number of male users by city
male_users = users.dropna(subset=['city'], how='any') \
                [(users.dropna(subset=['city'], how='any').gender == 'Male')] \
                .groupby(['city','state']) \
                .city.count() \
                .reset_index(name='male_count') \
                .sort_values(['city', 'state'], ascending=[True,True])
                
# join the two series into one dataframe
city_users = city_users.join(male_users.male_count)
city_users.male_count = male_users.male_count.astype(int)

# return the city with the most female users
city_users.sort_values('female_count', ascending=False).head(1)

Unnamed: 0,city,state,female_count,male_count
0,Amilavuho,NH,1,1.0


In [54]:
# return the city with the most male users
city_users.sort_values('male_count', ascending=False).head(1)

Unnamed: 0,city,state,female_count,male_count
0,Amilavuho,NH,1,1.0
