## Import Packages

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import geopandas as gpd
from scipy.stats import chi2_contingency, norm, skew
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Set Functions

In [2]:
#Function for displaying details around the numerical data that need to be converted to bins

def binning_details(column, dataframe=None):
    """Plot a histogram of one column and print its summary statistics.

    Parameters
    ----------
    column : str
        Name of the column to inspect.
    dataframe : pandas.DataFrame or geopandas.GeoDataFrame, optional
        Frame that holds ``column``. When omitted, the notebook-level ``gdf``
        is used, so existing one-argument calls behave as before.

    Returns
    -------
    None
        The histogram is drawn on a new 10x6 figure and the result of
        ``describe()`` is printed.
    """
    # Resolve the column first so a bad name fails before an empty figure is created.
    values = (gdf if dataframe is None else dataframe)[column]

    fig, ax = plt.subplots(figsize=[10,6])
    # Draw on the axes created above instead of relying on the implicit "current axes".
    values.plot(kind='hist', ax=ax)
    ax.set_title(column)
    print(values.describe())
     
    
#Function to perform a chi-squared test of independence

def chi2_test_of_independence(dataframe,col1,col2,alpha=0.05):
    """Run a chi-squared test of independence between two columns and report it.

    A contingency table of ``col1`` against ``col2`` is built with
    ``pd.crosstab`` and passed to ``scipy.stats.chi2_contingency``. The
    outcome is printed, and the key numbers are also returned so callers can
    collect them instead of parsing the printed text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing both columns.
    col1, col2 : str
        Names of the two columns to cross-tabulate.
    alpha : float, default 0.05
        Significance level. H0 (independence) is rejected when ``p <= alpha``.

    Returns
    -------
    tuple
        ``(statistic, p_value, dof)`` as returned by ``chi2_contingency``.
    """
    contingency = pd.crosstab(dataframe[col1], dataframe[col2]) #Converts columns to contingency table

    # The fourth element (expected frequencies) is not needed here.
    chi2_stat, p_value, dof, _ = chi2_contingency(contingency) # Chi-square test of independence.

    print()

    # NOTE(review): the value reported here is the chi-squared test statistic,
    # not a critical value. The wording is kept verbatim so previously recorded
    # notebook output stays valid; consider rewording when outputs are regenerated.
    print(f"The chi-squared test resulted in a critical value of {chi2_stat} with a p value of {p_value}.")

    if p_value <= alpha:
        print(f"It is likely that {col1} and {col2} are associated, and should be included in the prediction model (reject H0)")
    else:
        print(f"It is likely that {col1} and {col2} are independent, and should be removed from the prediction model (fail to reject H0)")

    return chi2_stat, p_value, dof

## Read In File and Convert To Geopandas Dataframe

In [3]:
# First read the file in as a plain pandas DataFrame; the geometry is built from it below.
geocode_output = pd.read_excel('data/combined_files_output.xlsx', index_col=0)

# Coordinate reference system to attach to the points built from Longitude/Latitude.
CRS = "EPSG:4326"

#Create a geodataframe from the output using the points_from_xy command

gdf = gpd.GeoDataFrame(
    geocode_output,
    geometry=gpd.points_from_xy(geocode_output.Longitude, geocode_output.Latitude),
    crs=CRS,  # CRS was previously defined but never applied, leaving gdf without a CRS
)

gdf.head()

Unnamed: 0,Status,Number_of_Properties,Consent,Minor Variance,Official Plan Rezoning,Site Plan Application,Latitude,Longitude,geometry,Council,Neighbourhood,Population_Density,Households_In_Core_Housing_Need,Average_Income,Secondary_Plan,Business_Improvement_Area,Site_and_Area_Specific_Policy,Zoning_Category
0,Approved,1,0,0,1,0,43.626854,-79.499656,POINT (-79.49966 43.62685),Etobicoke York Community Council,Stonegate-Queensway,3199,1715,64140,0,1,0,Commercial Residential
1,Approved,1,0,0,0,1,43.674647,-79.400973,POINT (-79.40097 43.67465),Toronto and East York Community Council,Annex,10863,2420,112766,0,0,0,Commercial Residential
2,Approved,1,1,0,0,0,43.652096,-79.547083,POINT (-79.54708 43.65210),Etobicoke York Community Council,Islington-City Centre West,2712,3470,52787,0,0,0,Residential
3,Approved,1,0,1,0,0,43.753666,-79.338064,POINT (-79.33806 43.75367),North York Community Council,Parkwoods-Donalda,4691,3255,42516,0,0,0,Residential
4,Approved,1,0,0,0,1,43.735174,-79.342838,POINT (-79.34284 43.73517),North York Community Council,Banbury-Don Mills,2775,2115,67757,1,0,0,Residential


## 1. Visualizing Applications On A Map of Toronto

## 2. Visualizing The Various Features

## 3. Statistical Tests

In [4]:
#Logistic regression test to see whether the numerical features would make for good features in predicting application status

In [5]:
# Chi-square test of association between categorical variables and application status

# Categorical / indicator features to test against 'Status', one per line.
cols_to_test = [
    'Consent',
    'Minor Variance',
    'Official Plan Rezoning',
    'Site Plan Application',
    'Council',
    'Secondary_Plan',
    'Business_Improvement_Area',
    'Site_and_Area_Specific_Policy',
    'Zoning_Category',
]

for col in cols_to_test:
    # Header for this feature's result: the column name, then a divider line.
    print(col, ' — — — — — — — — ', sep='\n')
    chi2_test_of_independence(gdf, col, 'Status')
    print()

Consent
 — — — — — — — — 

The chi-squared test resulted in a critical value of 49.47257806662523 with a p value of 2.011639977210845e-12.
It is likely that Consent and Status are associated, and should be included in the prediction model (reject H0)

Minor Variance
 — — — — — — — — 

The chi-squared test resulted in a critical value of 9.451353094282993 with a p value of 0.0021099339000612708.
It is likely that Minor Variance and Status are associated, and should be included in the prediction model (reject H0)

Official Plan Rezoning
 — — — — — — — — 

The chi-squared test resulted in a critical value of 110.82553094720825 with a p value of 6.461418263678712e-26.
It is likely that Official Plan Rezoning and Status are associated, and should be included in the prediction model (reject H0)

Site Plan Application
 — — — — — — — — 

The chi-squared test resulted in a critical value of 24.13596761042332 with a p value of 8.976789343706977e-07.
It is likely that Site Plan Application and St

## 4. Create The Final Dataframe For Modeling

In [None]:
#df_final = 