In [25]:
#import all the required libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

# Read the permit dataset from a CSV file in the working directory.
# Later cells expect it to contain, among others, the PERMIT_TYPE,
# REVIEW_TYPE and WARD columns and the IS_PERMIT_VALID target column.
raw_filename = 'data_final.csv'
data = pd.read_csv(raw_filename)

In [26]:
# Report how many missing/null values each column has before cleaning
print("Number of missing values in each column:")
print(data.isnull().sum())

# Drop every row that contains at least one missing value.
# Reassign instead of using inplace=True: the result is the same for this
# notebook, but the step reads as an explicit "new frame from old frame"
# transformation and avoids the in-place mutation pandas discourages.
data = data.dropna()

# Verify that no missing values remain
print("Number of missing values after removing:")
print(data.isnull().sum())

# Persist the cleaned frame (without the index column) for later reuse
cleaned_filename = 'Cleaned_Dataset_after_removing_nulls.csv'
data.to_csv(cleaned_filename, index=False)


Number of missing values in each column:
PERMIT_TYPE            0
REVIEW_TYPE            0
PROCESSING_TIME        0
BUILDING_FEE_PAID      0
ZONING_FEE_PAID        0
OTHER_FEE_PAID         0
SUBTOTAL_PAID          0
BUILDING_FEE_UNPAID    0
ZONING_FEE_UNPAID      0
OTHER_FEE_UNPAID       0
SUBTOTAL_UNPAID        0
BUILDING_FEE_WAIVED    0
ZONING_FEE_WAIVED      0
OTHER_FEE_WAIVED       0
SUBTOTAL_WAIVED        0
TOTAL_FEE              0
REPORTED_COST          0
COMMUNITY_AREA         0
CENSUS_TRACT           0
WARD                   0
IS_PERMIT_VALID        0
dtype: int64
Number of missing values after removing:
PERMIT_TYPE            0
REVIEW_TYPE            0
PROCESSING_TIME        0
BUILDING_FEE_PAID      0
ZONING_FEE_PAID        0
OTHER_FEE_PAID         0
SUBTOTAL_PAID          0
BUILDING_FEE_UNPAID    0
ZONING_FEE_UNPAID      0
OTHER_FEE_UNPAID       0
SUBTOTAL_UNPAID        0
BUILDING_FEE_WAIVED    0
ZONING_FEE_WAIVED      0
OTHER_FEE_WAIVED       0
SUBTOTAL_WAIVED        0
TOTAL

In [27]:
# Exploratory data analysis: first rows, dimensions, dtypes and summary stats
print(data.head())
print(data.shape)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
data.info()
print(data.describe())

                      PERMIT_TYPE           REVIEW_TYPE  PROCESSING_TIME  \
0  PERMIT - RENOVATION/ALTERATION  STANDARD PLAN REVIEW               79   
1                  PERMIT - SIGNS           SIGN PERMIT               70   
2  PERMIT - RENOVATION/ALTERATION  STANDARD PLAN REVIEW               51   
3  PERMIT - RENOVATION/ALTERATION  STANDARD PLAN REVIEW               74   
4       PERMIT - NEW CONSTRUCTION  STANDARD PLAN REVIEW               20   

   BUILDING_FEE_PAID  ZONING_FEE_PAID  OTHER_FEE_PAID  SUBTOTAL_PAID  \
0             450.00             50.0             0.0         500.00   
1             100.00            200.0             0.0         300.00   
2            1101.66             50.0             0.0        1151.66   
3             864.00             75.0           626.0        1565.00   
4            3000.00             75.0             0.0        3075.00   

   BUILDING_FEE_UNPAID  ZONING_FEE_UNPAID  OTHER_FEE_UNPAID  ...  \
0                  0.0                  0 

In [28]:

# Exclude the WARD column from the modelling data (the original author's
# stated motivation: to avoid overfitting).
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so
# on a second execution WARD is already gone and a plain drop would raise
# KeyError. `columns=` alone is sufficient; no `axis` argument is needed.
data = data.drop(columns=['WARD'], errors='ignore')




In [29]:
# Preview the first five rows of the frame as it stands at this point
data.head(n=5)

Unnamed: 0,PERMIT_TYPE,REVIEW_TYPE,PROCESSING_TIME,BUILDING_FEE_PAID,ZONING_FEE_PAID,OTHER_FEE_PAID,SUBTOTAL_PAID,BUILDING_FEE_UNPAID,ZONING_FEE_UNPAID,OTHER_FEE_UNPAID,SUBTOTAL_UNPAID,BUILDING_FEE_WAIVED,ZONING_FEE_WAIVED,OTHER_FEE_WAIVED,SUBTOTAL_WAIVED,TOTAL_FEE,REPORTED_COST,COMMUNITY_AREA,CENSUS_TRACT,IS_PERMIT_VALID
0,PERMIT - RENOVATION/ALTERATION,STANDARD PLAN REVIEW,79,450.0,50.0,0.0,500.0,0.0,0,0,0.0,0.0,0,0,0.0,500.0,35000.0,7,70700,1
1,PERMIT - SIGNS,SIGN PERMIT,70,100.0,200.0,0.0,300.0,0.0,0,0,0.0,0.0,0,0,0.0,300.0,1500.0,3,830700,1
2,PERMIT - RENOVATION/ALTERATION,STANDARD PLAN REVIEW,51,1101.66,50.0,0.0,1151.66,0.0,0,0,0.0,0.0,0,0,0.0,1151.66,261200.0,65,650200,1
3,PERMIT - RENOVATION/ALTERATION,STANDARD PLAN REVIEW,74,864.0,75.0,626.0,1565.0,0.0,0,0,0.0,0.0,0,0,0.0,1565.0,160000.0,5,50500,1
4,PERMIT - NEW CONSTRUCTION,STANDARD PLAN REVIEW,20,3000.0,75.0,0.0,3075.0,0.0,0,0,0.0,0.0,0,0,0.0,3075.0,350000.0,16,161200,1


In [30]:
# One-hot encode the two text-valued columns; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
target_col = 'IS_PERMIT_VALID'
data_encoded = pd.get_dummies(
    data,
    columns=['PERMIT_TYPE', 'REVIEW_TYPE'],
    drop_first=True,
)

# Target vector (y) and feature matrix (X = every column except the target)
y = data_encoded[target_col]
X = data_encoded.drop(columns=[target_col])


In [31]:
# Split the dataset into training and testing sets FIRST.
# The scaler and PCA below are fitted on the training portion only; fitting
# them on the full dataset before splitting would let test-set statistics
# (feature means/variances, principal axes) leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise features using statistics learned from the training split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance (of the training data)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Full-dataset projections, kept under their original names for any later
# cell that still refers to them (e.g. for plotting). They reuse the
# train-fitted transformers and must not be used to fit a model.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)



In [32]:
# Build a logistic regression classifier with default settings and fit it on
# the training split (fit() returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Predicted IS_PERMIT_VALID labels for the held-out test split
y_pred = model.predict(X_test)

# Evaluation metrics on the test split
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# NOTE(review): the saved output of this cell shows 2 vs 3201 test samples
# per class, i.e. heavy class imbalance -- read the accuracy together with
# the confusion matrix rather than on its own.
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 1.00
Confusion Matrix:
[[   2    0]
 [   0 3201]]
