## PEI Municipality Data Cleaning

This notebook uses data scraped from the Wikipedia article [List of municipalities in Prince Edward Island](https://en.wikipedia.org/wiki/List_of_municipalities_in_Prince_Edward_Island).

Here the data is cleaned up so that city names can be compared in other analyses.

In [1]:
# Dependencies.
import pandas as pd

In [2]:
# Load the scraped municipality table and preview the first rows.
raw_csv_path = '../resources/pei_muns.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Name,County,Municipalstatus[3][6],Incorporationyear[15],Population(2016)[2][3],Population(2011)[2][16],Change,Land area(km²)[2][3],Populationdensity(ppl/km²)[2]
0,Charlottetown,Queens,City,1855[17][c],36094,34562,4.40%,44.34,814.0
1,Summerside,Prince,City,1877[d],14829,14751,0.50%,28.49,520.5
2,Alberton,Prince,Town,1913,1145,1135,0.90%,4.52,253.3
3,Borden-Carleton,Prince,Town,1995[e],724,750,−3.5%,12.99,55.7
4,Cornwall,Queens,Town,1995,5348,5162,3.60%,28.19,189.7


In [3]:
# Swap the scraped headers (which carry footnote markers) for clean names.
# The assignment is positional, so the order must match the CSV columns.
clean_column_names = [
    'Name',
    'County',
    'Municipal Status',
    'Incorporation Year',
    '2016 Population',
    '2011 Population',
    'Population Change',
    'Land Area (km2)',
    'Population Density (ppl/km2)',
]
df.columns = clean_column_names
df.head()

Unnamed: 0,Name,County,Municipal Status,Incorporation Year,2016 Population,2011 Population,Population Change,Land Area (km2),Population Density (ppl/km2)
0,Charlottetown,Queens,City,1855[17][c],36094,34562,4.40%,44.34,814.0
1,Summerside,Prince,City,1877[d],14829,14751,0.50%,28.49,520.5
2,Alberton,Prince,Town,1913,1145,1135,0.90%,4.52,253.3
3,Borden-Carleton,Prince,Town,1995[e],724,750,−3.5%,12.99,55.7
4,Cornwall,Queens,Town,1995,5348,5162,3.60%,28.19,189.7


In [4]:
# List the distinct values found in the County column.
df.loc[:, 'County'].unique()

array(['Queens', 'Prince', 'Kings'], dtype=object)

In [5]:
# List the distinct values found in the Municipal Status column.
df.loc[:, 'Municipal Status'].unique()

array(['City', 'Town', 'Town[10]', 'Town[22]', 'Rural municipality',
       'Rural municipality[10]', 'Resort municipality'], dtype=object)

In [6]:
# Map footnoted / inconsistently capitalised status labels onto clean ones.
status_fixes = {
    'Town[10]': 'Town',
    'Town[22]': 'Town',
    'Rural municipality': 'Rural Municipality',
    'Rural municipality[10]': 'Rural Municipality',
    'Resort municipality': 'Resort Municipality',
}
df['Municipal Status'] = df['Municipal Status'].replace(status_fixes)
df['Municipal Status'].unique()

array(['City', 'Town', 'Rural Municipality', 'Resort Municipality'],
      dtype=object)

In [7]:
# List the distinct values found in the Incorporation Year column.
df.loc[:, 'Incorporation Year'].unique()

array(['1855[17][c]', '1877[d]', '1913', '1995[e]', '1995', '1914',
       '1954[f]', '1951[g]', '1910', '2018', '1952[j]', '1974', '1972',
       '1975', '2014', '1977', '1983[k]', '1991', '1950', '1983', '1955',
       '1982', '1973', '1968', '1957', '1953', '1964', '1966', '1951',
       '1985', '1959', '1986', '1990'], dtype=object)

In [8]:
# Pull the 4-digit year out, dropping trailing footnote markers such as "[d]".
# A vectorised regex extract is used instead of slicing each value: it only
# keeps a leading run of exactly four digits, and it yields NaN (rather than
# raising or keeping arbitrary characters) for missing or malformed entries.
# The years are left as strings.
df['Incorporation Year'] = df['Incorporation Year'].str.extract(r'^(\d{4})', expand=False)
df['Incorporation Year'].unique()

array(['1855', '1877', '1913', '1995', '1914', '1954', '1951', '1910',
       '2018', '1952', '1974', '1972', '1975', '2014', '1977', '1983',
       '1991', '1950', '1955', '1982', '1973', '1968', '1957', '1953',
       '1964', '1966', '1985', '1959', '1986', '1990'], dtype=object)

In [9]:
# Change population columns into integers.
def _population_to_int(column):
    """Return ``column`` as integers with footnote markers and commas removed.

    The values are cast to ``str`` first so the cell can be re-run after the
    columns have already been converted to integers.
    """
    # Raw string: '\[' and '\w' are invalid escape sequences in a plain literal.
    without_footnotes = column.astype(str).str.replace(r'(\[\w*\])', '', regex=True)
    return without_footnotes.replace(',', '', regex=True).astype(int)


for population_col in ('2016 Population', '2011 Population'):
    df[population_col] = _population_to_int(df[population_col])
df.head()

Unnamed: 0,Name,County,Municipal Status,Incorporation Year,2016 Population,2011 Population,Population Change,Land Area (km2),Population Density (ppl/km2)
0,Charlottetown,Queens,City,1855,36094,34562,4.40%,44.34,814.0
1,Summerside,Prince,City,1877,14829,14751,0.50%,28.49,520.5
2,Alberton,Prince,Town,1913,1145,1135,0.90%,4.52,253.3
3,Borden-Carleton,Prince,Town,1995,724,750,−3.5%,12.99,55.7
4,Cornwall,Queens,Town,1995,5348,5162,3.60%,28.19,189.7


In [10]:
# Convert the percentage strings in Population Change to floats.
# Step 1: drop the percent sign.
change_text = df['Population Change'].replace('%', '', regex=True)
# Step 2: the scraped negatives use a different dash character; swap it for
# the ASCII hyphen-minus so the values can be converted to float.
change_text = change_text.replace('−', '-', regex=True)
df['Population Change'] = change_text.astype(float)
df.head()

Unnamed: 0,Name,County,Municipal Status,Incorporation Year,2016 Population,2011 Population,Population Change,Land Area (km2),Population Density (ppl/km2)
0,Charlottetown,Queens,City,1855,36094,34562,4.4,44.34,814.0
1,Summerside,Prince,City,1877,14829,14751,0.5,28.49,520.5
2,Alberton,Prince,Town,1913,1145,1135,0.9,4.52,253.3
3,Borden-Carleton,Prince,Town,1995,724,750,-3.5,12.99,55.7
4,Cornwall,Queens,Town,1995,5348,5162,3.6,28.19,189.7


In [11]:
# Persist the cleaned table to disk.
# Note: with the default index=True the row index is written as an unnamed
# first column.
cleaned_csv_path = '../resources/transformed_data/pei_cities_towns.csv'
df.to_csv(cleaned_csv_path)