# Segmenting and Clustering Neighborhoods in **Toronto** - *explore and cluster the neighborhoods in Toronto*

Transform the data on the Wikipedia page into a pandas dataframe.
The dataframe will consist of the postal code of each neighborhood along with the borough name and neighborhood name.

In [0]:
# Import Libraries

from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.request import urlopen

In [0]:

# Wikipedia list of Canadian postal codes beginning with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Read the page inside a context manager so the HTTP response is closed
# as soon as the body has been downloaded.
with urlopen(url) as response:
    source = response.read().decode('utf-8')

page = BeautifulSoup(source, 'html.parser')

# The first <table> in the page body is used as the data table.
# html.parser only yields a <tbody> when the markup itself contains one,
# so fall back to the <table> element instead of leaving `table` as None.
first_table = page.body.table
table = first_table.tbody if first_table.tbody is not None else first_table

In [0]:
#functions for getting cell

def table_cell(i):
    """Return the text of every ``<td>`` cell found in the row ``i``.

    Parameters
    ----------
    i : object exposing ``find_all('td')`` (e.g. a BeautifulSoup ``<tr>`` tag)
        The table row to read.

    Returns
    -------
    list of str
        One entry per cell. When the cell's first link has non-empty text,
        that link text is used as-is; otherwise the cell's own text is used
        with surrounding whitespace stripped.
    """
    cell_data = []

    for cell in i.find_all('td'):
        link = cell.a
        if link and link.text:
            cell_data.append(link.text)
            continue

        # `.string` is None when the cell is empty or has more than one child
        # (for instance an empty link next to plain text); calling .strip() on
        # it would raise AttributeError, so fall back to the concatenated text.
        text = cell.string
        if text is None:
            text = cell.get_text()
        cell_data.append(text.strip())
    return cell_data
  
#functions for getting row data
def table_row():
    """Collect the cell texts of every three-cell row of the global ``table``.

    Each ``<tr>`` is converted with ``table_cell``; rows that do not yield
    exactly three entries (such as rows without any ``<td>``) are skipped.

    Returns
    -------
    list of list of str
        The kept rows, in document order.
    """
    parsed_rows = (table_cell(tr) for tr in table.find_all('tr'))
    return [cells for cells in parsed_rows if len(cells) == 3]

In [0]:
# Build the raw dataframe from the scraped table rows
wiki_data = table_row()
column_names = ['Postcode', 'Borough', 'Neighbourhood']
wiki_data_df = pd.DataFrame(data=wiki_data, columns=column_names)
wiki_data_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# ***Cleaning the Data Frame***


>  Ignore cells with a borough that is Not assigned



In [0]:
# Keep only the rows whose borough is assigned, order them by postcode and
# then borough, and renumber the index from 0.
has_borough = wiki_data_df['Borough'] != 'Not assigned'
clean_wiki_df = (
    wiki_data_df[has_borough]
    .sort_values(by=['Postcode', 'Borough'])
    .reset_index(drop=True)
)
clean_wiki_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union




> `More than one neighborhood can exist in one postal code area. In that case the rows are combined into one row, with the neighborhoods separated by commas.`



In [0]:
# Group the neighbourhoods that share a postcode into one comma-separated row.

# Unique postcodes, kept under the original name in case a later cell uses it.
# Built without inplace=True so no column slice of clean_wiki_df is mutated.
df_postcodes = clean_wiki_df['Postcode'].drop_duplicates()

# One row per postcode, in order of first appearance (sort=False).
# Borough takes the value from the last row of each group, and the
# neighbourhood names of the group are joined with commas. This replaces the
# nested positional loops (quadratic in the number of rows) and the separate
# pass that stripped the leading comma.
df2 = (
    clean_wiki_df
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'last', 'Neighbourhood': ','.join})
)

df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae




> *`Use the .shape attribute to print the number of rows of your dataframe.`*



In [0]:
# .shape is a (rows, columns) tuple; its first element is the number of rows in the data frame
df2.shape





In [0]:
# Save the grouped data frame as a CSV file (header row, no index column) and
# download it through Colab so it can be reused in another assignment of this module.

from google.colab import files

csv_name = 'Toronto_DataFrame.csv'
df2.to_csv(csv_name, header=True, index=False)
files.download(csv_name)