In [3]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get Random States

In [125]:
# Contiguous states only: Hawaii and Alaska are left out.
# The north/south split follows the map image shown below.
# Multi-word names are hyphenated (e.g. "New-York").
_north_names = [
    "Washington", "Oregon", "Idaho", "Montana",
    "Wyoming", "North-Dakota", "South-Dakota", "Nebraska",
    "Iowa", "Minnesota", "Wisconsin", "Illinois",
    "Indiana", "Michigan", "Ohio", "Pennsylvania",
    "New-York", "Vermont", "New-Hampshire", "Maine",
    "Massachusetts", "Rhode-Island", "Connecticut", "New-Jersey",
]
_south_names = [
    "California", "Nevada", "Utah", "Arizona",
    "Colorado", "New-Mexico", "Kansas", "Oklahoma",
    "Texas", "Missouri", "Arkansas", "Louisiana",
    "Kentucky", "Tennessee", "Mississippi", "Alabama",
    "Georgia", "Florida", "West-Virginia", "Virginia",
    "North-Carolina", "South-Carolina", "Delaware", "Maryland",
]

northern_states = pd.Series(_north_names)
southern_states = pd.Series(_south_names)

24

![US Boundaries](https://upload.wikimedia.org/wikipedia/commons/8/86/Northern_and_Southern_States_on_United_States_of_America_Map.png)

In [83]:
# Draw 100 states per region, with replacement, using a fixed seed so the draw is repeatable.
# NOTE(review): both draws reuse the same seed; because the two Series have the same
# length this likely picks the same positions from each list - confirm that is intended.
_state_sample_options = dict(n=100, random_state=500, replace=True)
north_sample = northern_states.sample(**_state_sample_options)
south_sample = southern_states.sample(**_state_sample_options)

## Get Random Cities

In [84]:
# Load the city table and keep only the columns used in this notebook.
# The explicit .copy() makes `cities` an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
cities = pd.read_csv("uscities.csv")
cities = cities[["city", "state_name", "lat", "lng"]].copy()

# Hyphenate state names so they match the spelling used in the state lists ("New-York").
cities["state_name"] = cities["state_name"].str.replace(" ", "-", regex=False)
cities.head()

Unnamed: 0,city,state_name,lat,lng
0,New York,New-York,40.6943,-73.9249
1,Los Angeles,California,34.1139,-118.4068
2,Chicago,Illinois,41.8373,-87.6862
3,Miami,Florida,25.7839,-80.2102
4,Dallas,Texas,32.7936,-96.7662


In [85]:
def get_random_city(state, random_state=None):
    """Return one randomly chosen city from ``state`` as a single-row DataFrame.

    Parameters
    ----------
    state : str
        State name as it appears in ``cities["state_name"]`` (hyphenated,
        e.g. "New-York").
    random_state : int, numpy.random.RandomState or None, optional
        Passed straight to ``DataFrame.sample``. Supply a value to make the
        pick reproducible; the default keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row taken from the module-level ``cities`` table, with the index
        reset to 0.

    Raises
    ------
    ValueError
        If ``cities`` has no rows for ``state``.
    """
    # A boolean mask is enough here; sorting the whole table first added cost
    # without changing which rows qualify.
    state_cities = cities.loc[cities["state_name"] == state]
    if state_cities.empty:
        raise ValueError(f"No cities found for state {state!r}")

    return state_cities.sample(1, random_state=random_state).reset_index(drop=True)

In [86]:
# Spot-check the helper on a single state; the result is a one-row DataFrame.
get_random_city("Iowa")

Unnamed: 0,city,state_name,lat,lng
0,Bettendorf,Iowa,41.5656,-90.4764


In [97]:
# Pick one random city for every sampled state. The one-row frames are collected
# first and concatenated once: DataFrame.append was removed in pandas 2.0, and
# growing a frame row by row is quadratic.
random_north_cities = pd.concat(
    [get_random_city(state) for state in north_sample],
    ignore_index=True,
)
random_south_cities = pd.concat(
    [get_random_city(state) for state in south_sample],
    ignore_index=True,
)

# Hyphenate multi-word city names (presumably to match the URL format used
# when scraping later on - confirm).
random_north_cities["city"] = random_north_cities["city"].str.replace(" ", "-", regex=False)
random_south_cities["city"] = random_south_cities["city"].str.replace(" ", "-", regex=False)

random_north_cities[0:10]

Unnamed: 0,city,state_name,lat,lng
0,South-Plainfield,New-Jersey,40.5748,-74.4153
1,Ontario,Oregon,44.0259,-116.9759
2,Newport,Vermont,44.9375,-72.2088
3,Chardon,Ohio,41.5803,-81.2082
4,Knoxville,Iowa,41.3188,-93.1024
5,Burr,Nebraska,40.536,-96.2997
6,Huntington-Woods,Michigan,42.4816,-83.1685
7,Kalamazoo,Michigan,42.2749,-85.5882
8,Lyndonville,Vermont,44.5352,-72.0016
9,Old-Bennington,Vermont,42.8848,-73.2143


## Scrape Data from https://city-data.com

In [106]:
def _incomes_to_numeric(raw_incomes):
    """Strip "$", "," and a leading "over " from income strings and convert to numbers.

    Entries that still cannot be parsed (including missing values) become NaN.
    """
    cleaned = raw_incomes.str.replace("$", "", regex=False)
    cleaned = cleaned.str.replace(",", "", regex=False)
    cleaned = cleaned.str.replace("over ", "", regex=False)
    return pd.to_numeric(cleaned, errors="coerce")


def get_income(city_df, timeout=30):
    """Scrape the income figure shown on city-data.com for every row of ``city_df``.

    Parameters
    ----------
    city_df : pandas.DataFrame
        Must have ``city`` and ``state_name`` columns; both are inserted into
        the URL as-is, so they should already be hyphenated.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises,
        so a stalled connection cannot hang the notebook.

    Returns
    -------
    pandas.Series
        One entry per input row, in order, with a fresh 0..n-1 index. Entries
        are the raw scraped strings (e.g. "$105,972"). When a page has no
        income figure, the entry is the truncated mean of all earlier entries
        as a plain digit string, or NaN if no earlier entry is usable.

    Notes
    -----
    Network errors raised by ``requests`` are not caught. Imputed entries
    take part in later running means, as in the original implementation.
    """
    incomes = []
    for city, state in zip(city_df["city"], city_df["state_name"]):
        url = f"http://www.city-data.com/city/{city}-{state}.html"
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.text, "html.parser")

        try:
            # Second cell of the first table row inside the "median-income" graph.
            median_income = (
                soup.find_all(class_="median-income")[0]
                .find_all(class_="hgraph")[0]
                .find("table")
                .find("tr")
                .find_all("td")[1]
                .get_text()
            )
        except (IndexError, AttributeError):
            # The page lacks the expected markup: fall back to the mean of what
            # has been collected so far. Catching only lookup failures keeps
            # KeyboardInterrupt and genuine bugs visible.
            known = _incomes_to_numeric(pd.Series(incomes, dtype="object")).dropna()
            if known.empty:
                # Nothing to average yet; int(NaN) would raise, so record a gap.
                median_income = np.nan
            else:
                median_income = str(int(known.mean()))

        incomes.append(median_income)

    # Build the Series once; Series.append was removed in pandas 2.0.
    return pd.Series(incomes, dtype="object")

In [107]:
# Scrape an income figure for every sampled city and attach it as an "income" column.
# The index is reset first so the returned Series (indexed 0..n-1) lines up row for row.
random_north_cities = random_north_cities.reset_index(drop=True)
random_north_cities = random_north_cities.assign(
    income=get_income(random_north_cities)
)

random_south_cities = random_south_cities.reset_index(drop=True)
random_south_cities = random_south_cities.assign(
    income=get_income(random_south_cities)
)

random_north_cities.head(10)

Unnamed: 0,city,state_name,lat,lng,income,color
0,South-Plainfield,New-Jersey,40.5748,-74.4153,"$105,972",green
1,Ontario,Oregon,44.0259,-116.9759,"$39,414",green
2,Newport,Vermont,44.9375,-72.2088,"$38,122",green
3,Chardon,Ohio,41.5803,-81.2082,"$61,367",green
4,Knoxville,Iowa,41.3188,-93.1024,61218,green
5,Burr,Nebraska,40.536,-96.2997,"$35,468",green
6,Huntington-Woods,Michigan,42.4816,-83.1685,"$138,883",green
7,Kalamazoo,Michigan,42.2749,-85.5882,"$39,494",green
8,Lyndonville,Vermont,44.5352,-72.0016,"$34,107",green
9,Old-Bennington,Vermont,42.8848,-73.2143,"$127,285",green


## Calculate Means and Standard Deviations

In [110]:
def clean_income(income):
    """Convert a column of scraped income strings to numbers.

    Removes "$", "," and "over " (in that order) and parses the remainder;
    anything that still cannot be parsed becomes NaN. A column that is
    already numeric is returned unchanged, so re-running the cell is safe
    (the ``.str`` accessor would otherwise raise on numeric data).
    """
    if pd.api.types.is_numeric_dtype(income):
        return income

    cleaned = income
    for token in ("$", ",", "over "):
        cleaned = cleaned.str.replace(token, "", regex=False)
    return pd.to_numeric(cleaned, errors="coerce")


# Same cleaning for both regions via the shared helper.
random_north_cities["income"] = clean_income(random_north_cities["income"])
random_south_cities["income"] = clean_income(random_south_cities["income"])

In [112]:
# ddof=1 gets the sample's standard deviation instead of the population's

random_north_cities_mean = np.mean(random_north_cities.income)
random_north_cities_stdev = np.std(random_north_cities.income, ddof=1)

random_south_cities_mean = np.mean(random_south_cities.income)
random_south_cities_stdev = np.std(random_south_cities.income, ddof=1)

print("North Mean:", random_north_cities_mean)
print("North Standard Deviation:", random_north_cities_stdev)
print("South Mean:", random_south_cities_mean)
# This line was previously mislabelled "North Standard Deviation".
print("South Standard Deviation:", random_south_cities_stdev)

North Mean: 64674.95
North Standard Deviation: 25811.285856836606
South Mean: 59262.31
North Standard Deviation: 35648.10162276484


## Visualization

In [None]:
import plotly.graph_objects as go

# Tag each region with a marker colour before combining them for the map.
random_north_cities["color"] = "green"
random_south_cities["color"] = "red"

df = pd.concat([random_north_cities, random_south_cities])

# One marker per sampled city; the hover text reads "City, State".
fig = go.Figure(data=go.Scattergeo(
        lon = df['lng'],
        lat = df['lat'],
        text = df['city'] + ", " + df["state_name"],
        mode = 'markers',
        marker_color=df["color"]
        ))

fig.update_layout(
        # Removed the stray unmatched ")" that ended the original title.
        title = '100 random cities from both The North and The South',
        geo_scope='usa',
    )
fig.show()

## Download

In [123]:
# Write each region to its own CSV, with the columns in a fixed order and no index column.
EXPORT_COLUMNS = ["city", "state_name", "income", "lat", "lng"]

north_export = random_north_cities[EXPORT_COLUMNS]
north_export.to_csv("north.csv", index=False)

south_export = random_south_cities[EXPORT_COLUMNS]
south_export.to_csv("south.csv", index=False)