In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
# Read the branch general-information CSV and list its columns, non-null counts and dtypes.
branch_info_source = "../data/source_files/tpl-branch-general-information-2023.csv"
branch_info_df = pd.read_csv(branch_info_source)
branch_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   _id                   112 non-null    int64  
 1   BranchCode            112 non-null    object 
 2   PhysicalBranch        112 non-null    int64  
 3   BranchName            112 non-null    object 
 4   Address               103 non-null    object 
 5   PostalCode            103 non-null    object 
 6   Website               108 non-null    object 
 7   Telephone             105 non-null    object 
 8   SquareFootage         110 non-null    object 
 9   PublicParking         100 non-null    object 
 10  KidsStop              100 non-null    float64
 11  LeadingReading        100 non-null    float64
 12  CLC                   100 non-null    float64
 13  DIH                   100 non-null    float64
 14  TeenCouncil           100 non-null    float64
 15  YouthHub              1

In [3]:
# Remove columns. Rename columns for better clarity. Remove branches with no physical location.

# Columns that are cast to int64 once the non-physical branches are gone.
int_columns = ['SquareFootage','KidsStop','LeadingReading','ComputerLearningCentre','DigitalInnovationHub','TeenCouncil','YouthHub','AdultLiteracyProgram','Workstations','NBHDNo','TPLNIA','PresentSiteYear']

# One chain, each step returning a new frame (no inplace edits and no column
# assignment on a filtered slice). The order matters: the row filter runs before
# the cast so that only rows of physical branches are converted to int64.
branch_info_df = (
    branch_info_df
    .drop(columns=['_id','Address','PostalCode','Website','Telephone','WardNo','WardName'])
    .rename(columns={'CLC':'ComputerLearningCentre', 'DIH':'DigitalInnovationHub'})
    .loc[lambda frame: frame['PhysicalBranch'] != 0]
    .astype({column: 'int64' for column in int_columns})
)

branch_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 111
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   BranchCode              100 non-null    object 
 1   PhysicalBranch          100 non-null    int64  
 2   BranchName              100 non-null    object 
 3   SquareFootage           100 non-null    int64  
 4   PublicParking           100 non-null    object 
 5   KidsStop                100 non-null    int64  
 6   LeadingReading          100 non-null    int64  
 7   ComputerLearningCentre  100 non-null    int64  
 8   DigitalInnovationHub    100 non-null    int64  
 9   TeenCouncil             100 non-null    int64  
 10  YouthHub                100 non-null    int64  
 11  AdultLiteracyProgram    100 non-null    int64  
 12  Workstations            100 non-null    int64  
 13  ServiceTier             100 non-null    object 
 14  Lat                     100 non-null    float64

In [4]:
# Preview the first rows of the cleaned branch table.
branch_info_df.head()

Unnamed: 0,BranchCode,PhysicalBranch,BranchName,SquareFootage,PublicParking,KidsStop,LeadingReading,ComputerLearningCentre,DigitalInnovationHub,TeenCouncil,YouthHub,AdultLiteracyProgram,Workstations,ServiceTier,Lat,Long,NBHDNo,NBHDName,TPLNIA,PresentSiteYear
0,AB,1,Albion,29000,59,1,1,1,1,1,1,1,38,DL,43.739826,-79.584096,2,Mount Olive-Silverstone-Jamestown,1,2017
1,ACD,1,Albert Campbell,28957,45,0,1,1,1,1,1,0,36,DL,43.708019,-79.269252,120,Clairlea-Birchmount,1,1971
2,AD,1,Alderwood,7341,shared,0,0,0,0,0,0,0,7,NL,43.601944,-79.547252,20,Alderwood,0,1999
3,AG,1,Agincourt,27000,86,0,1,1,1,0,1,0,42,DL,43.785167,-79.29343,118,Tam O'Shanter-Sullivan,0,1991
4,AH,1,Armour Heights,2988,shared,0,0,0,0,0,0,0,5,NL,43.739337,-79.421889,39,Bedford Park-Nortown,0,1982


In [5]:
# Write the cleaned branch table to CSV; index=False leaves the row index out of the file.
branch_info_df.to_csv('../data/cleaned_files/tpl_branch_general_info_clean.csv', index=False)

In [6]:
# Independent copy of the cleaned branch table; later cells merge NBHDNo/NBHDName from it.
branch_stats_df = branch_info_df.copy()

### Branch Visits

In [7]:
# Read the annual visits-by-branch CSV and list its columns, non-null counts and dtypes.
visits_source = "../data/source_files/tpl-visits-annual-by-branch-2012-2022.csv"
branch_visits = pd.read_csv(visits_source)
branch_visits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1129 entries, 0 to 1128
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   _id         1129 non-null   int64 
 1   Year        1129 non-null   int64 
 2   BranchCode  1129 non-null   object
 3   Visits      1129 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 35.4+ KB


In [8]:
# '_id' is not related to library data; 'BranchCode' already identifies each branch.
visits_without_id = branch_visits.drop(columns='_id')

# Branch codes which have no physical locations (these were in branch_info_df with PhysicalBranch=0).
non_physical_codes = ['AL', 'BKONE', 'BKTWO','DS','HLS','IL','LD','ME','OS','PR','SB','VIR']
is_physical = ~visits_without_id['BranchCode'].isin(non_physical_codes)
branch_visits = visits_without_id[is_physical]

branch_visits.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1085 entries, 0 to 1128
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        1085 non-null   int64 
 1   BranchCode  1085 non-null   object
 2   Visits      1085 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 33.9+ KB


In [9]:
# Reshape from long to wide: one row per BranchCode, one column per Year.
# NOTE(review): aggfunc='first' keeps only the first value if a (BranchCode, Year)
# pair occurs more than once -- confirm the source has no such duplicates.
branch_visits = branch_visits.pivot_table(index='BranchCode', columns='Year', values='Visits', aggfunc='first').reset_index()

# Based on data review, the blanks exist for periods where a library branch was
# closed to the public for repairs or renovations. Setting NaN to 0.
branch_visits = branch_visits.fillna(0)

# Attach each branch's neighbourhood number and name. validate='many_to_one' raises
# a MergeError if BranchCode is duplicated in branch_stats_df, instead of silently
# duplicating branch rows.
branch_visits = pd.merge(
    branch_visits,
    branch_stats_df[['BranchCode', 'NBHDNo', 'NBHDName']],
    on='BranchCode',
    how='left',
    validate='many_to_one',
)

branch_visits

Unnamed: 0,BranchCode,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,NBHDNo,NBHDName
0,AB,522309.0,432525.0,401819.0,194587.0,208251.0,260950.0,327977.0,337559.0,83718.0,58083.0,191320.0,2,Mount Olive-Silverstone-Jamestown
1,ACD,214076.0,222012.0,285428.0,277118.0,310244.0,356374.0,340522.0,229609.0,1607.0,12600.0,59881.0,120,Clairlea-Birchmount
2,AD,88065.0,96088.0,102875.0,101250.0,99865.0,97083.0,100886.0,100412.0,20644.0,19752.0,47658.0,20,Alderwood
3,AG,434320.0,451500.0,424457.0,397421.0,313157.0,326233.0,368837.0,356784.0,81923.0,81572.0,137685.0,118,Tam O'Shanter-Sullivan
4,AH,67184.0,68925.0,67675.0,67038.0,67355.0,73163.0,73829.0,72816.0,16702.0,14409.0,18416.0,39,Bedford Park-Nortown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,WP,35561.0,30575.0,37725.0,41450.0,54098.0,58297.0,58068.0,65661.0,21041.0,12922.0,28304.0,22,Humbermede
96,WS,488396.0,471800.0,439963.0,405225.0,441229.0,402862.0,373195.0,362674.0,71640.0,51362.0,163061.0,129,Agincourt North
97,WY,107079.0,114613.0,105900.0,116525.0,143043.0,133540.0,62229.0,0.0,0.0,0.0,37001.0,96,Casa Loma
98,YO,208985.0,191050.0,177013.0,157850.0,148942.0,152520.0,137585.0,134385.0,30268.0,25840.0,75390.0,95,Annex


In [10]:
# Save the visits table (one column per year, 2012-2022) to CSV without the row index.
branch_visits.to_csv('../data/cleaned_files/tpl_branch_visits_2012_2022_clean.csv', index=False)

### Branch Registrations

In [11]:
# Read the annual card-registrations-by-branch CSV and list its columns, non-null counts and dtypes.
registrations_source = "../data/source_files/tpl-card-registrations-annual-by-branch-2012-2022.csv"
branch_regs = pd.read_csv(registrations_source)
branch_regs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1169 entries, 0 to 1168
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   _id            1169 non-null   int64 
 1   Year           1169 non-null   int64 
 2   BranchCode     1169 non-null   object
 3   Registrations  1169 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 36.7+ KB


In [12]:
# '_id' is not related to library data; 'BranchCode' already identifies each branch.
regs_without_id = branch_regs.drop(columns='_id')

# Branch codes which have no physical locations (these were in branch_info_df with PhysicalBranch=0).
non_physical_codes = ['AL', 'BKONE', 'BKTWO','DS','HLS','IL','LD','ME','OS','PR','SB','VIR']
is_physical = ~regs_without_id['BranchCode'].isin(non_physical_codes)
branch_regs = regs_without_id[is_physical]

branch_regs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1095 entries, 0 to 1168
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Year           1095 non-null   int64 
 1   BranchCode     1095 non-null   object
 2   Registrations  1095 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 34.2+ KB


In [13]:
# Reshape from long to wide: one row per BranchCode, one column per Year.
# NOTE(review): aggfunc='first' keeps only the first value if a (BranchCode, Year)
# pair occurs more than once -- confirm the source has no such duplicates.
branch_regs = branch_regs.pivot_table(index='BranchCode', columns='Year', values='Registrations', aggfunc='first').reset_index()

# Based on data review, the blanks exist for periods where a library branch was
# closed to the public for repairs or renovations. Setting NaN to 0.
branch_regs = branch_regs.fillna(0)

# Attach each branch's neighbourhood number and name. validate='many_to_one' raises
# a MergeError if BranchCode is duplicated in branch_stats_df, instead of silently
# duplicating branch rows.
branch_regs = pd.merge(
    branch_regs,
    branch_stats_df[['BranchCode', 'NBHDNo', 'NBHDName']],
    on='BranchCode',
    how='left',
    validate='many_to_one',
)

branch_regs

Unnamed: 0,BranchCode,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,NBHDNo,NBHDName
0,AB,4939.0,4483.0,3810.0,4272.0,4474.0,6889.0,7788.0,7314.0,2072.0,1953.0,5093.0,2,Mount Olive-Silverstone-Jamestown
1,ACD,1695.0,1770.0,1782.0,1448.0,1592.0,1625.0,1789.0,1352.0,25.0,154.0,1577.0,120,Clairlea-Birchmount
2,AD,445.0,504.0,543.0,519.0,526.0,545.0,733.0,712.0,164.0,363.0,711.0,20,Alderwood
3,AG,3166.0,3586.0,3171.0,2919.0,2318.0,2538.0,3658.0,3404.0,1064.0,1194.0,2959.0,118,Tam O'Shanter-Sullivan
4,AH,467.0,501.0,608.0,545.0,567.0,645.0,859.0,842.0,194.0,233.0,398.0,39,Bedford Park-Nortown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,WP,515.0,647.0,495.0,426.0,480.0,506.0,573.0,784.0,252.0,190.0,413.0,22,Humbermede
96,WS,2013.0,2229.0,2149.0,2005.0,2000.0,2046.0,2387.0,2341.0,503.0,754.0,1788.0,129,Agincourt North
97,WY,845.0,1000.0,943.0,1032.0,1164.0,1101.0,579.0,17.0,4.0,18.0,1035.0,96,Casa Loma
98,YO,1042.0,1052.0,1011.0,1012.0,1003.0,1215.0,1296.0,1349.0,278.0,464.0,1081.0,95,Annex


In [14]:
# Save the registrations table (one column per year, 2012-2022) to CSV without the row index.
branch_regs.to_csv('../data/cleaned_files/tpl_branch_registrations_2012_2022_clean.csv', index=False)

### Branch Circulation

In [15]:
# Read the annual circulation-by-branch CSV and list its columns, non-null counts and dtypes.
circulation_source = "../data/source_files/tpl-circulation-annual-by-branch-2012-2022.csv"
branch_circ = pd.read_csv(circulation_source)
branch_circ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225 entries, 0 to 1224
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   _id          1225 non-null   int64 
 1   Year         1225 non-null   int64 
 2   BranchCode   1225 non-null   object
 3   Circulation  1225 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 38.4+ KB


In [16]:
# '_id' is not related to library data; 'BranchCode' already identifies each branch.
circ_without_id = branch_circ.drop(columns='_id')

# Branch codes which have no physical locations (these were in branch_info_df with PhysicalBranch=0).
non_physical_codes = ['AL', 'BKONE', 'BKTWO','DS','HLS','IL','LD','ME','OS','PR','SB','VIR']
is_physical = ~circ_without_id['BranchCode'].isin(non_physical_codes)
branch_circ = circ_without_id[is_physical]

branch_circ.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1095 entries, 0 to 1224
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Year         1095 non-null   int64 
 1   BranchCode   1095 non-null   object
 2   Circulation  1095 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 34.2+ KB


In [17]:
# Reshape from long to wide: one row per BranchCode, one column per Year.
# NOTE(review): aggfunc='first' keeps only the first value if a (BranchCode, Year)
# pair occurs more than once -- confirm the source has no such duplicates.
branch_circ = branch_circ.pivot_table(index='BranchCode', columns='Year', values='Circulation', aggfunc='first').reset_index()

# Based on data review, the blanks exist for periods where a library branch was
# closed to the public for repairs or renovations. Setting NaN to 0.
branch_circ = branch_circ.fillna(0)

# Attach each branch's neighbourhood number and name. validate='many_to_one' raises
# a MergeError if BranchCode is duplicated in branch_stats_df, instead of silently
# duplicating branch rows.
branch_circ = pd.merge(
    branch_circ,
    branch_stats_df[['BranchCode', 'NBHDNo', 'NBHDName']],
    on='BranchCode',
    how='left',
    validate='many_to_one',
)

branch_circ

Unnamed: 0,BranchCode,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,NBHDNo,NBHDName
0,AB,515235.0,466776.0,398248.0,353764.0,300607.0,309631.0,328536.0,300374.0,126586.0,155096.0,209792.0,2,Mount Olive-Silverstone-Jamestown
1,ACD,446623.0,420341.0,398024.0,380452.0,360510.0,338066.0,303855.0,219561.0,12976.0,28134.0,84732.0,120,Clairlea-Birchmount
2,AD,171050.0,178588.0,176590.0,169022.0,164877.0,158583.0,155864.0,147747.0,36256.0,48728.0,91028.0,20,Alderwood
3,AG,1061437.0,1053430.0,934448.0,843672.0,712014.0,621226.0,623148.0,562590.0,266625.0,326570.0,394133.0,118,Tam O'Shanter-Sullivan
4,AH,204490.0,207404.0,192068.0,198247.0,193515.0,202628.0,214013.0,213427.0,64139.0,105018.0,69713.0,39,Bedford Park-Nortown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,WP,90311.0,87524.0,79759.0,77281.0,69884.0,67938.0,71530.0,60941.0,28476.0,38902.0,35084.0,22,Humbermede
96,WS,542341.0,520729.0,457886.0,412778.0,387621.0,340490.0,312597.0,290873.0,78857.0,81545.0,176330.0,129,Agincourt North
97,WY,243739.0,239247.0,208312.0,235971.0,242999.0,232975.0,115280.0,14190.0,5174.0,3768.0,47953.0,96,Casa Loma
98,YO,327149.0,325299.0,324428.0,319571.0,302099.0,286724.0,255993.0,251808.0,70318.0,65723.0,142124.0,95,Annex


In [18]:
# Save the circulation table (one column per year, 2012-2022) to CSV without the row index.
branch_circ.to_csv('../data/cleaned_files/tpl_branch_circulation_2012_2022_clean.csv', index=False)

### Branch Workstation Usage

In [19]:
# Read the first workstation-usage CSV (file named 2012-2017) and list its columns and dtypes.
workstation_source_1 = "../data/source_files/tpl-workstation-usage-annual-by-branch-2012-2017.csv"
branch_wrkstn_use_1 = pd.read_csv(workstation_source_1)
branch_wrkstn_use_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   _id         598 non-null    int64 
 1   Year        598 non-null    int64 
 2   BranchCode  598 non-null    object
 3   Sessions    598 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 18.8+ KB


In [20]:
# Read the second workstation-usage CSV (file named 2018-2022) and list its columns and dtypes.
workstation_source_2 = "../data/source_files/tpl-workstation-usage-annual-by-branch-2018-2022.csv"
branch_wrkstn_use_2 = pd.read_csv(workstation_source_2)
branch_wrkstn_use_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   _id         492 non-null    int64 
 1   Year        492 non-null    int64 
 2   BranchCode  492 non-null    object
 3   Sessions    492 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 15.5+ KB


In [21]:
# Stack the two workstation-usage files into one table. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it both inputs keep their own row
# labels, so the result carries duplicate index labels.
branch_wrkstn_all = pd.concat([branch_wrkstn_use_1, branch_wrkstn_use_2], ignore_index=True)

In [22]:
# '_id' is not related to library data; 'BranchCode' already identifies each branch.
wrkstn_without_id = branch_wrkstn_all.drop(columns='_id')

# Branch codes which have no physical locations (these were in branch_info_df with PhysicalBranch=0).
non_physical_codes = ['AL', 'BKONE', 'BKTWO','DS','HLS','IL','LD','ME','OS','PR','SB','VIR']
is_physical = ~wrkstn_without_id['BranchCode'].isin(non_physical_codes)
branch_wrkstn_all = wrkstn_without_id[is_physical]

branch_wrkstn_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1081 entries, 0 to 491
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        1081 non-null   int64 
 1   BranchCode  1081 non-null   object
 2   Sessions    1081 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 33.8+ KB


In [23]:
# Reshape from long to wide: one row per BranchCode, one column per Year.
# NOTE(review): aggfunc='first' keeps only the first value if a (BranchCode, Year)
# pair occurs more than once -- confirm the source has no such duplicates.
branch_wrkstn_all = branch_wrkstn_all.pivot_table(index='BranchCode', columns='Year', values='Sessions', aggfunc='first').reset_index()

# Based on data review, the blanks exist for periods where a library branch was
# closed to the public for repairs or renovations. Setting NaN to 0.
branch_wrkstn_all = branch_wrkstn_all.fillna(0)

# Attach each branch's neighbourhood number and name. validate='many_to_one' raises
# a MergeError if BranchCode is duplicated in branch_stats_df, instead of silently
# duplicating branch rows.
branch_wrkstn_all = pd.merge(
    branch_wrkstn_all,
    branch_stats_df[['BranchCode', 'NBHDNo', 'NBHDName']],
    on='BranchCode',
    how='left',
    validate='many_to_one',
)

branch_wrkstn_all

Unnamed: 0,BranchCode,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,NBHDNo,NBHDName
0,AB,132024.0,139402.0,142159.0,142187.0,140606.0,137999.0,111939.0,105936.0,26076.0,14154.0,44967.0,2,Mount Olive-Silverstone-Jamestown
1,ACD,131353.0,133591.0,149898.0,145984.0,149712.0,150308.0,82784.0,54592.0,0.0,0.0,14030.0,120,Clairlea-Birchmount
2,AD,16279.0,16238.0,16500.0,16700.0,16338.0,15213.0,12162.0,10518.0,1892.0,504.0,2587.0,20,Alderwood
3,AG,140407.0,180672.0,166023.0,158536.0,103406.0,134675.0,112149.0,110006.0,25441.0,10508.0,35212.0,118,Tam O'Shanter-Sullivan
4,AH,11895.0,11250.0,11800.0,11013.0,10988.0,11225.0,9753.0,9414.0,1811.0,1044.0,1852.0,39,Bedford Park-Nortown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,WP,19629.0,16738.0,19200.0,21425.0,24763.0,23938.0,13555.0,14657.0,3980.0,1409.0,4115.0,22,Humbermede
96,WS,105207.0,114088.0,104775.0,106963.0,101650.0,97263.0,75752.0,72288.0,13434.0,5878.0,21028.0,129,Agincourt North
97,WY,44020.0,43200.0,37747.0,37263.0,36875.0,36463.0,16140.0,0.0,0.0,0.0,3836.0,96,Casa Loma
98,YO,32464.0,32475.0,32850.0,32313.0,29425.0,29238.0,22390.0,21130.0,4700.0,1929.0,8755.0,95,Annex


In [24]:
# Save the workstation-sessions table (one column per year, 2012-2022) to CSV without the row index.
branch_wrkstn_all.to_csv('../data/cleaned_files/tpl_branch_workstation_use_2012_2022_clean.csv', index=False)

### Neighbourhood Crime Rates

In [25]:
# Read the neighbourhood crime-rates CSV and summarise its shape and dtypes.
crime_source = "../data/source_files/neighbourhood-crime-rates - 2952.csv"
nbhd_crime = pd.read_csv(crime_source)
nbhd_crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Columns: 185 entries, _id to geometry
dtypes: float64(127), int64(56), object(2)
memory usage: 228.5+ KB


In [26]:
# Keep rows whose '_id' is at most 158.
# NOTE(review): 158 is presumably the number of neighbourhoods; the info() output
# above this cell already reports 158 rows, so this filter may currently be a
# no-op -- confirm it is still needed.
nbhd_crime = nbhd_crime.loc[nbhd_crime['_id'] <= 158]

# Yearly count columns (<CRIME>_<YEAR>, 2014-2023) to discard. The *_RATE_* columns
# are not in this list and therefore stay in the table.
crime_types = ['ASSAULT', 'AUTOTHEFT', 'BIKETHEFT', 'BREAKENTER', 'HOMICIDE', 'ROBBERY', 'SHOOTING', 'THEFTFROMMV', 'THEFTOVER']
count_columns = [f'{crime}_{year}' for crime in crime_types for year in range(2014, 2024)]

nbhd_crime = (
    nbhd_crime
    # '_id' and 'geometry' are not used in the cleaned table.
    .drop(columns=['_id', 'geometry'])
    # Use the same neighbourhood key names as the branch tables.
    .rename(columns={'AREA_NAME':'NBHDName', 'HOOD_ID':'NBHDNo'})
    .drop(columns=count_columns)
    # Replace every remaining missing value with 0.
    .fillna(0)
)

nbhd_crime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 158 entries, 0 to 157
Data columns (total 93 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   NBHDName               158 non-null    object 
 1   NBHDNo                 158 non-null    int64  
 2   POPULATION_2023        158 non-null    int64  
 3   ASSAULT_RATE_2014      158 non-null    float64
 4   ASSAULT_RATE_2015      158 non-null    float64
 5   ASSAULT_RATE_2016      158 non-null    float64
 6   ASSAULT_RATE_2017      158 non-null    float64
 7   ASSAULT_RATE_2018      158 non-null    float64
 8   ASSAULT_RATE_2019      158 non-null    float64
 9   ASSAULT_RATE_2020      158 non-null    float64
 10  ASSAULT_RATE_2021      158 non-null    float64
 11  ASSAULT_RATE_2022      158 non-null    float64
 12  ASSAULT_RATE_2023      158 non-null    float64
 13  AUTOTHEFT_RATE_2014    158 non-null    float64
 14  AUTOTHEFT_RATE_2015    158 non-null    float64
 15  AUTOTHEFT_R

In [27]:
# Save the cleaned neighbourhood crime table (2014-2023 columns) to CSV without the row index.
nbhd_crime.to_csv('../data/cleaned_files/neighbourhood_crime_2014_2023_clean.csv', index=False)

**Note:** the following manual step is no longer needed if the `geometry` column is removed. Inspection of the CSV had revealed the need for further manual deletion in Microsoft Excel: there were 165 rows (header excluded). The manually edited file was named 'neighbourhood_crime_2014_2023_clean_manual.csv'.

### Neighbourhood Median Income

In [28]:
# Read the neighbourhood median-income CSV and list its columns, non-null counts and dtypes.
median_income_source = "../data/source_files/neighbourhood-profiles-2021-158-model-median-inc-2020.csv"
nbhd_median_income = pd.read_csv(median_income_source)
nbhd_median_income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 5 columns):
 #   Column                                                              Non-Null Count  Dtype 
---  ------                                                              --------------  ----- 
 0   Neighbourhood Name                                                  158 non-null    object
 1   Neighbourhood Number                                                158 non-null    int64 
 2   TSNS 2020 Designation                                               158 non-null    object
 3   Total - Income statistics for private households - 25% sample data  158 non-null    int64 
 4     Median total income of household in 2020 ($)                      158 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 6.3+ KB


In [29]:
# Give the neighbourhood key columns the NBHDName / NBHDNo names used by the other tables.
income_key_names = {'Neighbourhood Name':'NBHDName', 'Neighbourhood Number':'NBHDNo'}
nbhd_median_income = nbhd_median_income.rename(columns=income_key_names)

nbhd_median_income

Unnamed: 0,NBHDName,NBHDNo,TSNS 2020 Designation,Total - Income statistics for private households - 25% sample data,Median total income of household in 2020 ($)
0,West Humber-Clairville,1,Not an NIA or Emerging Neighbourhood,10700,92000
1,Mount Olive-Silverstone-Jamestown,2,Neighbourhood Improvement Area,9740,76500
2,Thistletown-Beaumond Heights,3,Neighbourhood Improvement Area,3245,86000
3,Rexdale-Kipling,4,Not an NIA or Emerging Neighbourhood,3945,77000
4,Elms-Old Rexdale,5,Neighbourhood Improvement Area,3190,82000
...,...,...,...,...,...
153,Yonge-Bay Corridor,170,Not an NIA or Emerging Neighbourhood,7465,68500
154,Junction-Wallace Emerson,171,Not an NIA or Emerging Neighbourhood,10185,84000
155,Dovercourt Village,172,Not an NIA or Emerging Neighbourhood,5310,87000
156,North Toronto,173,Not an NIA or Emerging Neighbourhood,9430,70000


In [30]:
# Save the neighbourhood median-income table (2020 incomes) to CSV without the row index.
nbhd_median_income.to_csv('../data/cleaned_files/neighbourhood_median_income_2020_clean.csv', index=False)

### Neighbourhood Overview

In [None]:
# Start the neighbourhood overview from an independent copy of the three
# neighbourhood-related columns of the cleaned branch table.
NBHD_Overview = branch_info_df[['NBHDNo', 'NBHDName', 'TPLNIA']].copy()



 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   BranchCode              100 non-null    object 
 1   PhysicalBranch          100 non-null    int64  
 2   BranchName              100 non-null    object 
 3   SquareFootage           100 non-null    int64  
 4   PublicParking           100 non-null    object 
 5   KidsStop                100 non-null    int64  
 6   LeadingReading          100 non-null    int64  
 7   ComputerLearningCentre  100 non-null    int64  
 8   DigitalInnovationHub    100 non-null    int64  
 9   TeenCouncil             100 non-null    int64  
 10  YouthHub                100 non-null    int64  
 11  AdultLiteracyProgram    100 non-null    int64  
 12  Workstations            100 non-null    int64  
 13  ServiceTier             100 non-null    object 
 14  Lat                     100 non-null    float64
 15  Long                    100 non-null    float64
 16  NBHDNo                  100 non-null    int64  
 17  NBHDName                100 non-null    object 
 18  TPLNIA                  100 non-null    int64  
 19  PresentSiteYear         100 non-null    int64 