# Neighborhoods in Toronto City - Part2
--------

### A. Creating data table and data pre-processing

In [13]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fetch the Wikipedia page that lists the postal codes starting with "M".
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data_link = response.text

soup = BeautifulSoup(data_link, 'lxml')

# Locate the postal-code table and collect its rows.
toronto_table = soup.find('table', {'class': 'wikitable sortable'})
if toronto_table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
table_rows = toronto_table.find_all('tr')

# Turn the table into a pandas data frame.
# Rows without any <td> cell (presumably the header row) are skipped while building
# the list, so no empty placeholder row has to be dropped by position afterwards.
# Cell text is stripped so stray surrounding whitespace cannot break the
# 'Not assigned' comparisons below.
table = []
for tr in table_rows:
    row = [cell.text.strip() for cell in tr.find_all('td')]
    if row:
        table.append(row)
df = pd.DataFrame(table, columns=["PostalCode", "Borough", "Neighborhood"])

# Data pre-processing
df = df[df.Borough != 'Not assigned']  # Drop rows whose Borough is 'Not assigned'
df1 = df.reset_index(drop=True)
df1['Neighborhood'] = df1.Neighborhood.str.replace('\n', '', regex=False)  # Remove any remaining newlines

# Group duplicated PostalCodes and join their Neighborhood values with ", "
df1 = df1.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(", ".join).reset_index()

# Assign the Borough value where Neighborhood is 'Not assigned'
df1.Neighborhood = np.where(df1.Neighborhood == 'Not assigned', df1.Borough, df1.Neighborhood)

df1.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


------------
### A.1. Summary of the Part 1 workflow
1. I used the numpy, pandas and requests packages in the part above. I fetched the page with requests, parsed it with BeautifulSoup, and then created a pandas DataFrame from the Wikipedia table, which consists of three columns: PostalCode, Borough, and Neighborhood.
2. In the first step of data pre-processing I dropped the rows whose Borough is 'Not assigned'; then I grouped the duplicated PostalCode values and combined their Neighborhood values into a single comma-separated string.
3. I assigned Borough value where the Neighborhood is 'Not assigned'
4. Finally, you can see the head of my data above; its shape is (103, 3).

### B. Latitude & Longitude Information of Postal Codes

Geocoder caused too much trouble in my case, so I used the .csv file instead.


In [14]:
# Address of the CSV file with the latitude and longitude of the postal codes.
postalcode_url = "http://cocl.us/Geospatial_data"

# Read the coordinates into a data frame and preview its first rows.
geo_info = pd.read_csv(filepath_or_buffer=postalcode_url)
geo_info.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the Latitude and Longitude information into the df1 data table, using 'PostalCode' as the key

In [15]:
# Attach the coordinates by looking each PostalCode up in geo_info.
# The lookup is keyed on the postal code itself, so it does not depend on both
# tables having the same length or the same row order. A PostalCode that has no
# entry in geo_info gets NaN in both new columns.
# drop_duplicates keeps the lookup index unique, which Series.map requires.
geo_lookup = geo_info.drop_duplicates(subset='Postal Code').set_index('Postal Code')
df1['Latitude'] = df1['PostalCode'].map(geo_lookup['Latitude']).astype(float)
df1['Longitude'] = df1['PostalCode'].map(geo_lookup['Longitude']).astype(float)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [22]:
# Boolean mask marking the rows whose Borough name includes the text "Toronto".
Toronto = df1['Borough'].str.contains("Toronto")

# Show the first rows selected by the mask.
df1.loc[Toronto].head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
