### DSCI 510 - Extra Credit
#### Analysis of California Community Colleges
- Feature/Regional Analysis
- Descriptive Map and Plot Visualizations

#### _Scrape Community College Admission Data From Wikipedia_: https://en.wikipedia.org/wiki/List_of_California_Community_Colleges

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
# Download the Wikipedia page listing California community colleges and parse its HTML.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_California_Community_Colleges'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(WIKI_URL, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
response.status_code

200

In [3]:
# Turn the first table on the page into a DataFrame of strings, one row per college.
table = soup.find_all('table')[0]

N_COLS = 6  # number of data cells (<td>) in each college row

# Walk the table row by row instead of slicing a flat cell list up to a
# hard-coded cell count: the number of colleges on the page changes over time.
rows = []
for tr in table.find_all('tr'):
    cells = tr.find_all('td')
    if len(cells) != N_COLS:
        continue  # header row (only <th> cells) or any other non-data row
    rows.append([cell.text.strip() for cell in cells])

# Column names come from the header cells; the first <th> is skipped.
header_names = [th.text.strip() for th in table.find_all('th')[1:]]

# Build the frame in one go (growing it row by row is quadratic), then map the
# positional column labels 0..5 onto the header names.
df = pd.DataFrame(rows, columns=range(N_COLS)).rename(columns=dict(enumerate(header_names)))

assert len(df) > 0, 'no college rows were parsed - has the page layout changed?'

#### _Feed Colleges to API for Coordinates_ - _API Source_: Google Maps Geocoding API

In [4]:
# Geocode every scraped college with the Google Maps Geocoding API and collect
# the coordinates in api_df (columns: College, Lat, Lon).
import os

# Read the key from the environment so it is never stored in the notebook.
# Set GOOGLE_MAPS_API_KEY before starting the kernel. Any key that was ever
# committed in this file should be revoked.
API_key = os.environ['GOOGLE_MAPS_API_KEY']
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

lat_ls, lon_ls, college_ls = [], [], []
for uni in df.College:
    # Passing `params` lets requests URL-encode the name (spaces, '&', accents, ...).
    response = requests.get(
        GEOCODE_URL,
        params={'address': f'{uni} California', 'key': API_key},
        timeout=30,
    )
    response.raise_for_status()
    addy_data = response.json()
    # The JSON body has its own 'status' field; anything but 'OK' means there
    # is no usable result to index into.
    if addy_data.get('status') != 'OK' or not addy_data.get('results'):
        raise RuntimeError(f"Geocoding failed for {uni!r}: status={addy_data.get('status')}")
    location = addy_data['results'][0]['geometry']['location']
    lat_ls.append(location['lat'])
    lon_ls.append(location['lng'])
    college_ls.append(uni)

# Assemble the frame once from the three parallel lists.
api_df = pd.DataFrame({'College': college_ls, 'Lat': lat_ls, 'Lon': lon_ls})

In [5]:
# Show the scraped enrollment table (pandas abbreviates long frames with "..." rows).
df

Unnamed: 0,Ranking,College,Total enrollment,Full-time enrollment,Part-time enrollment,Founded
0,1,East Los Angeles College,36606,7090,29516,1945
1,2,Santa Monica College,29999,10720,19279,1929
2,3,American River College,29701,7560,22141,1955
3,4,Santa Ana College,28598,3435,25163,1915
4,5,Mount San Antonio College,28481,10499,17982,1946
...,...,...,...,...,...,...
109,110,College of the Siskiyous,2533,1002,1531,1957
110,111,Lassen College,2494,600,1894,1925
111,112,Lake Tahoe Community College,2426,765,1661,1975
112,113,Copper Mountain College,1783,750,1033,1966


In [67]:
# Show the geocoded coordinates: one row per college with its Lat and Lon.
api_df

Unnamed: 0,College,Lat,Lon
0,East Los Angeles College,34.041435,-118.150198
1,Santa Monica College,34.016781,-118.470693
2,American River College,38.651148,-121.346733
3,Santa Ana College,33.757971,-117.888884
4,Mount San Antonio College,34.048749,-117.842132
...,...,...,...
109,College of the Siskiyous,41.412171,-122.390292
110,Lassen College,40.431527,-120.632581
111,Lake Tahoe Community College,38.926651,-119.972576
112,Copper Mountain College,34.141060,-116.216934


#### _Write to SQLite Database_

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
import sqlite3

# Open (creating it if necessary) the SQLite database file and get a cursor.
conn = sqlite3.connect('DS_ExtraCredit.db')
cur = conn.cursor()

cur.execute('PRAGMA foreign_keys = ON')

# Enrollment figures scraped from Wikipedia, keyed by college name.
admissions_ddl = 'CREATE TABLE cc_admissions \
            (Ranking INT, \
            College TEXT NOT NULL, \
            Total_Enrollment FLOAT, \
            Full_Enrollment FLOAT, \
            Part_Enrollment FLOAT, \
            Founded INT,\
            PRIMARY KEY(College))'

# Latitude/longitude per college, keyed by college name.
coords_ddl = 'CREATE TABLE cc_coords \
            (College TEXT NOT NULL,\
             Latitude FLOAT,\
             Longitude FLOAT,\
             PRIMARY KEY(College))'

# Rebuild each table from scratch: drop any earlier copy, then create it anew.
for table_name, ddl in (('cc_admissions', admissions_ddl), ('cc_coords', coords_ddl)):
    cur.execute('DROP TABLE IF EXISTS ' + table_name)
    cur.execute(ddl)

conn.commit()

In [3]:
# Insert CC admissions data: scrape the college table from Wikipedia and load
# it into cc_admissions. Also builds `colleges`, the list of college names in
# table order.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_California_Community_Colleges'
N_COLS = 6  # number of data cells (<td>) in each college row

# A timeout avoids hanging on a stalled connection; raise_for_status() stops
# the cell on an HTTP error instead of parsing an error page.
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
print(response.status_code)
table = soup.find_all('table')[0]

# Walk the table row by row rather than slicing a flat cell list up to a
# hard-coded cell count, so the load keeps working when colleges are added
# to or removed from the page.
admission_rows = []
colleges = []
for tr in table.find_all('tr'):
    cells = [cell.text.strip() for cell in tr.find_all('td')]
    if len(cells) != N_COLS:
        continue  # header row or any other non-data row
    ranking, college, total, full_time, part_time, founded = cells
    admission_rows.append((
        int(ranking),
        college,
        # Enrollment figures may carry thousands separators.
        float(total.replace(',', '')),
        float(full_time.replace(',', '')),
        float(part_time.replace(',', '')),
        int(founded),
    ))
    colleges.append(college)

# One batched insert followed by a single commit instead of a commit per row.
cur.executemany('INSERT INTO cc_admissions VALUES (?, ?, ?, ?, ?, ?)', admission_rows)
conn.commit()

200


In [4]:
# Insert CC coordinate data: geocode each college with the Google Maps
# Geocoding API and load the results into cc_coords.
import os

# Read the key from the environment so it is never stored in the notebook.
# Set GOOGLE_MAPS_API_KEY before starting the kernel. Any key that was ever
# committed in this file should be revoked.
API_key = os.environ['GOOGLE_MAPS_API_KEY']
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

# NOTE(review): in a recorded run 'Sacramento City College' came back as
# roughly (37.35, -121.96), which does not look like Sacramento - spot-check
# the stored coordinates before relying on them.
coord_rows = []
for cc in colleges:
    # Passing `params` lets requests URL-encode the name (spaces, '&', accents, ...).
    response = requests.get(
        GEOCODE_URL,
        params={'address': f'{cc} California', 'key': API_key},
        timeout=30,
    )
    response.raise_for_status()
    addy_data = response.json()
    # The JSON body has its own 'status' field; anything but 'OK' means there
    # is no usable result to index into.
    if addy_data.get('status') != 'OK' or not addy_data.get('results'):
        raise RuntimeError(f"Geocoding failed for {cc!r}: status={addy_data.get('status')}")
    location = addy_data['results'][0]['geometry']['location']
    coord_rows.append((cc, location['lat'], location['lng']))

# Insert only after every lookup succeeded, then commit once.
cur.executemany('INSERT INTO cc_coords VALUES(?, ?, ?)', coord_rows)
conn.commit()

In [5]:
# Read back and print every row stored in cc_coords.
coord_rows = cur.execute('SELECT * FROM cc_coords').fetchall()
print(coord_rows)

[('East Los Angeles College', 34.0414348, -118.1501982), ('Santa Monica College', 34.0167814, -118.4706934), ('American River College', 38.6511479, -121.3467327), ('Santa Ana College', 33.7579713, -117.888884), ('Mount San Antonio College', 34.0487487, -117.8421321), ('City College of San Francisco', 37.72569, -122.4510797), ('Pasadena City College', 34.1439322, -118.1186338), ('Palomar College', 33.1512505, -117.1821885), ('Fullerton College', 33.8703645, -117.9242123), ('San Diego Mesa College', 32.715738, -117.1610838), ('Long Beach City College', 33.8335679, -118.1345668), ('El Camino Community College District', 33.8842409, -118.3302549), ('Sacramento City College', 37.3541079, -121.9552356), ('De Anza College', 37.3192806, -122.0447919), ('Santa Rosa Junior College', 38.45573, -122.7211551), ('Orange Coast College', 33.6713265, -117.9117079), ('Cerritos College', 33.8850876, -118.095832), ('Fresno City College', 36.7669209, -119.7979063), ('Saddleback College', 33.5538297, -117.6

In [6]:
# Run this cell LAST: it prints the largest total enrollment and then closes
# the cursor and the connection, so `cur` and `conn` are unusable afterwards.
max_enrollment = cur.execute('SELECT MAX(Total_Enrollment)  FROM cc_admissions').fetchall()
print(max_enrollment)

cur.close()
conn.close()

[(36606.0,)]


#### _Relational Join Coordinate Data with Community College Admission Data_

In [45]:
# Main dataset for the analysis: each enrollment row matched to its
# coordinates by college name (inner join, so unmatched colleges are dropped).
cc_df = pd.merge(df, api_df, on='College', how='inner')
cc_df.head()

Unnamed: 0,Ranking,College,Total enrollment,Full-time enrollment,Part-time enrollment,Founded,Lat,Lon
0,1,East Los Angeles College,36606,7090,29516,1945,34.041435,-118.150198
1,2,Santa Monica College,29999,10720,19279,1929,34.016781,-118.470693
2,3,American River College,29701,7560,22141,1955,38.651148,-121.346733
3,4,Santa Ana College,28598,3435,25163,1915,33.757971,-117.888884
4,5,Mount San Antonio College,28481,10499,17982,1946,34.048749,-117.842132


In [34]:
# Geocode a single college; the resulting JSON is stored in addy_data.
import os

import requests

# Read the key from the environment so it is never stored in the notebook.
# Set GOOGLE_MAPS_API_KEY before starting the kernel. Any key that was ever
# committed in this file should be revoked.
API_key = os.environ['GOOGLE_MAPS_API_KEY']

# `city` holds a college name despite what the variable name suggests.
city = 'Santa Monica College'
# Passing `params` lets requests URL-encode the address; the timeout keeps the
# cell from hanging on a stalled connection.
response = requests.get(
    'https://maps.googleapis.com/maps/api/geocode/json',
    params={'address': city, 'key': API_key},
    timeout=30,
)
response.raise_for_status()
addy_data = response.json()

In [15]:
# Pull the coordinates of the first geocoding result out of the response.
location = addy_data['results'][0]['geometry']['location']
lat = location['lat']
lon = location['lng']

In [16]:
# Show the extracted coordinates as a (lat, lon) tuple.
lat, lon

(34.0167814, -118.4706934)

In [46]:
import folium

# Interactive map with one marker per geocoded college.
# NOTE: the name `map` shadows Python's built-in map(); it is kept so that any
# other cell referring to `map` keeps working.
map = folium.Map(location=[lat, lon], zoom_start=13)

# Iterate the columns together instead of indexing row by row.
for college, marker_lat, marker_lon in zip(api_df['College'], api_df['Lat'], api_df['Lon']):
    # The tooltip shows the college name when hovering over the marker.
    folium.Marker(location=[marker_lat, marker_lon], tooltip=college).add_to(map)

# Zoom out to cover every marker; at zoom 13 only the neighbourhood around
# (lat, lon) would be visible and most colleges would be off-screen.
if len(api_df) > 0:
    map.fit_bounds([
        [float(api_df['Lat'].min()), float(api_df['Lon'].min())],
        [float(api_df['Lat'].max()), float(api_df['Lon'].max())],
    ])

# Display the map
map