### Exploring the US Cities Demographics

Source: [Opensoft](https://public.opendatasoft.com/explore/dataset/us-cities-demographics/export/)

File: us-cities-demographics.csv

This data comes from OpenSoft.
        

In [40]:
import pandas as pd
import numpy as np

pd.set_option("max_columns", 500)

In [2]:
fname = 'us-cities-demographics.csv'
df = pd.read_csv(fname, sep=';')
df.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [3]:
## Finding the unique races

df.Race.value_counts()

Hispanic or Latino                   596
White                                589
Black or African-American            584
Asian                                583
American Indian and Alaska Native    539
Name: Race, dtype: int64

In [4]:
df.dtypes

City                       object
State                      object
Median Age                float64
Male Population           float64
Female Population         float64
Total Population            int64
Number of Veterans        float64
Foreign-born              float64
Average Household Size    float64
State Code                 object
Race                       object
Count                       int64
dtype: object

In [5]:
len(df)

2891

#### Using Pyspark

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as F

In [7]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [86]:
df = spark.read.options(delimiter=";", header="true", inferSchema='true')\
            .csv("us-cities-demographics.csv")

In [87]:
df.limit(5).toPandas()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601,41862,82463,1562,30908,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129,49500,93629,4147,32935,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040,46799,84839,4819,8229,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127,87105,175232,5821,33878,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040,143873,281913,5829,86253,2.73,NJ,White,76402


In [88]:
df = df.drop("Race_percentage_by_city")

In [89]:
df = df.withColumn("Race_percent_by_city", F.col("Count")*100/F.col("Total Population"))

In [90]:
df.limit(5).toPandas()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count,Race_percent_by_city
0,Silver Spring,Maryland,33.8,40601,41862,82463,1562,30908,2.6,MD,Hispanic or Latino,25924,31.437129
1,Quincy,Massachusetts,41.0,44129,49500,93629,4147,32935,2.39,MA,White,58723,62.718816
2,Hoover,Alabama,38.5,38040,46799,84839,4819,8229,2.58,AL,Asian,4759,5.609448
3,Rancho Cucamonga,California,34.5,88127,87105,175232,5821,33878,3.18,CA,Black or African-American,24437,13.945512
4,Newark,New Jersey,34.6,138040,143873,281913,5829,86253,2.73,NJ,White,76402,27.101269


In [91]:
df =  df.withColumn("Race_rank_by_city", \
                    F.dense_rank().over(Window.partitionBy("Race").orderBy(F.desc("Race_percent_by_city"))))

In [92]:
df.limit(10).toPandas()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count,Race_percent_by_city,Race_rank_by_city
0,East Orange,New Jersey,35.5,29995,34967,64962,1870,15164,2.85,NJ,Black or African-American,57859,89.065915,1
1,Jackson,Mississippi,31.7,79039,91772,170811,8146,1789,2.65,MS,Black or African-American,139567,81.708438,2
2,Detroit,Michigan,34.8,319265,357859,677124,29511,39861,2.6,MI,Black or African-American,545988,80.633385,3
3,Lauderhill,Florida,35.7,32813,38761,71574,1965,25471,3.02,FL,Black or African-American,56948,79.565205,4
4,Gary,Indiana,38.1,35876,41478,77354,3952,1884,2.35,IN,Black or African-American,61416,79.396023,5
5,Miami Gardens,Florida,34.9,50719,62480,113199,2327,33394,3.75,FL,Black or African-American,85300,75.354023,6
6,Albany,Georgia,33.3,31695,39414,71109,5409,861,2.38,GA,Black or African-American,53440,75.152231,7
7,Pine Hills,Florida,29.2,35344,40737,76081,2594,20735,3.3,FL,Black or African-American,57089,75.037131,8
8,Southfield,Michigan,41.6,31369,41808,73177,4035,4011,2.27,MI,Black or African-American,54200,74.066988,9
9,Birmingham,Alabama,35.6,102122,112789,214911,13212,8258,2.21,AL,Black or African-American,157985,73.511826,10


In [93]:
df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Median Age: double (nullable = true)
 |-- Male Population: integer (nullable = true)
 |-- Female Population: integer (nullable = true)
 |-- Total Population: integer (nullable = true)
 |-- Number of Veterans: integer (nullable = true)
 |-- Foreign-born: integer (nullable = true)
 |-- Average Household Size: double (nullable = true)
 |-- State Code: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- Race_percent_by_city: double (nullable = true)
 |-- Race_rank_by_city: integer (nullable = true)



In [94]:
# df.write.parquet('CitiesDemog',partitionBy=['state'])

In [95]:
cities = df.select(["City", "State"]).distinct().toPandas()

In [96]:
len(cities)

596

In [97]:
cities.head()

Unnamed: 0,City,State
0,Lynchburg,Virginia
1,Cincinnati,Ohio
2,Kansas City,Kansas
3,Dayton,Ohio
4,Auburn,Washington


In [98]:
cities.to_csv("Cities_in_demog.csv",index=False)

In [99]:
df.groupby("Race").agg({"*":"count"}).show()

+--------------------+--------+
|                Race|count(1)|
+--------------------+--------+
|Black or African-...|     584|
|  Hispanic or Latino|     596|
|               White|     589|
|               Asian|     583|
|American Indian a...|     539|
+--------------------+--------+



In [100]:
df.groupby(["City","State code", "Race"]).agg({"*":"count"}).filter(F.col("count(1)")>1).show()

+----+----------+----+--------+
|City|State code|Race|count(1)|
+----+----------+----+--------+
+----+----------+----+--------+



In [101]:
df.where(F.col("City")=="Springfield").show()

+-----------+-------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+--------------------+-----------------+
|       City|        State|Median Age|Male Population|Female Population|Total Population|Number of Veterans|Foreign-born|Average Household Size|State Code|                Race| Count|Race_percent_by_city|Race_rank_by_city|
+-----------+-------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+--------------------+-----------------+
|Springfield|Massachusetts|      31.8|          74744|            79592|          154336|              5723|       16226|                  2.81|        MA|Black or African-...| 35284|   22.86180800331744|              154|
|Springfield|     Illinois|      38.8|          55639|            62170|          117809|              7525|

### Decisions:

All the cities of a state are not available hence the computation of the % of race in a state is not done.

In [102]:
# df = df.withColumn("Race_percent_by_state", \
#                    F.sum("Count").over(Window.partitionBy(["State Code", "Race"]))*100/F.sum("Count").over(Window.partitionBy("State Code")))

In [103]:
# df.limit(20).toPandas()

In [104]:
# df =  df.withColumn("Race_rank_by_state", \
#                     F.dense_rank().over(Window.partitionBy("Race").orderBy(F.desc("Race_percent_by_state"))))

#### Loading the Airport codes data

In [118]:
df_apc =  spark.read.options(delimiter=",", header=True) \
                    .csv("Cleaned Data/I94_ports.csv")

In [119]:
df_apc.limit(5).toPandas()

Unnamed: 0,code,port,locality,province,territory
0,CLG,,Calgary,Alberta,Canada
1,EDA,,Edmonton,Alberta,Canada
2,YHC,,Hakai pass,British Columbia,Canada
3,HAL,,Halifax,Nova Scotia,Canada
4,MON,,Montreal,Quebec,Canada


    Merging both the dataframes by the city. state, and country

In [120]:
cond = [df.City==df_apc.locality, df.State==df_apc.province]
df_merged = df.join(F.broadcast(df_apc), cond, "inner")

In [121]:
df_merged.limit(5).toPandas()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count,Race_percent_by_city,Race_rank_by_city,code,port,locality,province,territory
0,Jackson,Mississippi,31.7,79039,91772,170811,8146,1789,2.65,MS,Black or African-American,139567,81.708438,2,JAN,Thompson Field - Jackson Airport,Jackson,Mississippi,Unites States
1,Detroit,Michigan,34.8,319265,357859,677124,29511,39861,2.6,MI,Black or African-American,545988,80.633385,3,DET,,Detroit,Michigan,Unites States
2,Gary,Indiana,38.1,35876,41478,77354,3952,1884,2.35,IN,Black or African-American,61416,79.396023,5,GAR,,Gary,Indiana,Unites States
3,Birmingham,Alabama,35.6,102122,112789,214911,13212,8258,2.21,AL,Black or African-American,157985,73.511826,10,BHX,,Birmingham,Alabama,Unites States
4,Memphis,Tennessee,34.1,312237,343523,655760,31189,43318,2.55,TN,Black or African-American,420983,64.197725,12,MEM,,Memphis,Tennessee,Unites States


In [122]:
df_merged.count()

749

In [123]:
df.count()

2891

In [124]:
df_merged.columns

['City',
 'State',
 'Median Age',
 'Male Population',
 'Female Population',
 'Total Population',
 'Number of Veterans',
 'Foreign-born',
 'Average Household Size',
 'State Code',
 'Race',
 'Count',
 'Race_percent_by_city',
 'Race_rank_by_city',
 'code',
 'port',
 'locality',
 'province',
 'territory']

In [127]:
df_merged.select("City", "State", "territory", "Median Age", "Male Population", "Female Population", "Total Population", \
         "Number of Veterans", "Foreign-born", "Average Household Size", "Race", "Count", "Race_percent_by_city",\
          "Race_rank_by_city", "code")

DataFrame[City: string, State: string, territory: string, Median Age: double, Male Population: int, Female Population: int, Total Population: int, Number of Veterans: int, Foreign-born: int, Average Household Size: double, Race: string, Count: int, Race_percent_by_city: double, Race_rank_by_city: int, code: string]

In [128]:
df_merged.limit(5).toPandas()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count,Race_percent_by_city,Race_rank_by_city,code,port,locality,province,territory
0,Jackson,Mississippi,31.7,79039,91772,170811,8146,1789,2.65,MS,Black or African-American,139567,81.708438,2,JAN,Thompson Field - Jackson Airport,Jackson,Mississippi,Unites States
1,Detroit,Michigan,34.8,319265,357859,677124,29511,39861,2.6,MI,Black or African-American,545988,80.633385,3,DET,,Detroit,Michigan,Unites States
2,Gary,Indiana,38.1,35876,41478,77354,3952,1884,2.35,IN,Black or African-American,61416,79.396023,5,GAR,,Gary,Indiana,Unites States
3,Birmingham,Alabama,35.6,102122,112789,214911,13212,8258,2.21,AL,Black or African-American,157985,73.511826,10,BHX,,Birmingham,Alabama,Unites States
4,Memphis,Tennessee,34.1,312237,343523,655760,31189,43318,2.55,TN,Black or African-American,420983,64.197725,12,MEM,,Memphis,Tennessee,Unites States
