# Data Wrangling

In [1]:
import pandas as pd
import numpy as np

Reading dataframe from Wikipedia

In [2]:
df = pd.read_html(io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


Drop Rows with borough value that equals not assinged

In [3]:
df = df[df['Borough']!='Not assigned']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


Combine neighborhoods that have the same postal code

In [4]:
postcodes = list(df['Postcode'].unique()) #get unique postcodes
for postcode in postcodes:
    neighborhoods = df[df['Postcode'] == postcode].loc[:,'Neighbourhood'] #Get nighborhoods' names
    indices = df[df['Postcode'] == postcode].index #get indices of the neigbhorhoods
    names = list(neighborhoods) #Generate a list of name
    finalName = names[0]
    for i in range(1, len(names)):
        finalName = finalName + ', ' + names[i] #Concatenate neighborhoods' names
    df.loc[indices, 'Neighbourhood'] = finalName #Replace name of Neighborhoods with final concatenated names
    
df.drop_duplicates(subset = 'Postcode', keep='first', inplace=True) #Drop duplicate rows with same Postcode value
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Not assigned
...,...,...,...
255,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
262,M4Y,Downtown Toronto,Church and Wellesley
265,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
266,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


Assign Borough name for neighborhoods with no name

In [5]:
indices = df[df['Neighbourhood']=='Not assigned'].index #get indices
df.loc[indices, 'Neighbourhood'] = df.loc[indices, 'Borough'] #Set borough name for neighborhood name
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


Print Shape

In [6]:
df.shape

(103, 3)

In [7]:
df.to_csv('Torono Neighborhoods.csv')