In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import os

In [None]:

# Root folder of the dataset; show what it contains.
data_path = r'transport/accessibility_indicators_gb'
files = os.listdir(data_path)
print("Files in main folder:", files)

# The per-service sub-folders read by later cells sit under "accessibility".
accessibility_path = data_path + '/accessibility'
accessibility_files = os.listdir(accessibility_path)
print("\nFiles in accessibility folder:", accessibility_files)

In [None]:

# os.listdir() returns entries in arbitrary order, so sort them: the preview
# file chosen below (and therefore `df`) is then the same on every run.
employment_files = sorted(os.listdir(f'{accessibility_path}/employment'))
print("Employment files:", employment_files)

# Preview the first CSV rather than blindly taking entry [0], which could be
# a non-CSV entry (e.g. a README or a hidden system file).
employment_csvs = [name for name in employment_files if name.lower().endswith('.csv')]
if not employment_csvs:
    raise FileNotFoundError(f"No CSV files found in {accessibility_path}/employment")

df = pd.read_csv(f'{accessibility_path}/employment/{employment_csvs[0]}')
print(df.head())
print(df.shape)
print(df.columns)

In [None]:

# Remaining service categories; print the files available for each one so the
# exact CSV names can be read off before loading them.
folders = ['hospitals', 'schools', 'supermarket', 'gp', 'urban_centre']

for folder in folders:
    folder_path = accessibility_path + '/' + folder
    files = os.listdir(folder_path)
    print(f"\n{folder} files:", files)

In [None]:

def _read_access_csv(folder, filename):
    """Read one CSV from the given sub-folder of ``accessibility_path``."""
    return pd.read_csv(f'{accessibility_path}/{folder}/{filename}')


# File names are not derived from the folder names in a uniform way, so each
# one is spelled out explicitly.
employment_df = _read_access_csv('employment', 'access_employment_pt.csv')
hospitals_df = _read_access_csv('hospitals', 'access_hospital_pt.csv')  # singular
schools_df = _read_access_csv('schools', 'access_school_pt.csv')  # singular
supermarket_df = _read_access_csv('supermarket', 'access_supermarkets_pt.csv')  # plural
gp_df = _read_access_csv('gp', 'access_gp_pt.csv')
urban_df = _read_access_csv('urban_centre', 'access_cities_pt.csv')  # cities not urban_centre

# Column listings for the first three tables.
employment_cols = employment_df.columns.tolist()
hospitals_cols = hospitals_df.columns.tolist()
schools_cols = schools_df.columns.tolist()
print("Employment cols:", employment_cols)
print("Hospitals cols:", hospitals_cols)
print("Schools cols:", schools_cols)

In [None]:

# Structural summary, descriptive statistics and null counts for `df`.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())
print(df.isnull().sum())

Merge Datasets

In [None]:

# Base table: employment indicators plus the zone code and label.
df = employment_df[['geo_code', 'geo_label', 'employment_30', 'employment_45', 'employment_60']].copy()

# Each remaining table contributes the listed indicator columns, left-joined
# onto the base table by geo_code, in this order.
indicator_tables = [
    (hospitals_df, ['hospitals_30', 'hospitals_45']),
    (schools_df, ['school_primary_30', 'school_secondary_30']),
    (supermarket_df, ['supermarket_30', 'supermarket_45']),
    (gp_df, ['gp_number_30', 'gp_number_45']),
]
for table, indicator_cols in indicator_tables:
    df = df.merge(table[['geo_code'] + indicator_cols], on='geo_code', how='left')

# Sanity check of the merged frame: sample rows, size, and nulls left by the joins.
print(df.head())
print(df.shape)
print(df.isnull().sum())

In [None]:
#K-Means Clustering (No Neural Network)

from sklearn.cluster import KMeans

# Indicator columns used as clustering features.
feature_cols = ['employment_30', 'employment_45', 'employment_60',
                'hospitals_30', 'hospitals_45',
                'school_primary_30', 'school_secondary_30',
                'supermarket_30', 'supermarket_45',
                'gp_number_30', 'gp_number_45']

# Fill gaps with the per-column median so every zone can be clustered.
X = df[feature_cols].copy()
X = X.fillna(X.median())

# Standardize so no single indicator dominates the Euclidean distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

df['cluster'] = cluster_labels

# Per-cluster feature means in original units, for eyeballing the clusters.
for i in range(3):
    print(f"\nCluster {i}:")
    print(X[cluster_labels == i].mean())

# Rank clusters by their mean *standardized* indicator value. Averaging the
# raw columns would let the largest-magnitude column decide the ranking alone.
cluster_means = [X_scaled[cluster_labels == i].mean() for i in range(3)]
sorted_idx = np.argsort(cluster_means)  # ascending: lowest-scoring cluster first

# NOTE(review): assumes higher indicator values mean better access (e.g. more
# opportunities reachable within the time threshold) -- confirm against the
# dataset documentation. Under that reading the lowest-scoring cluster is the
# *least* accessible one, so it must be labelled 'Low', not 'High'.
label_map = {int(sorted_idx[0]): 'Low Accessibility',
             int(sorted_idx[1]): 'Medium Accessibility',
             int(sorted_idx[2]): 'High Accessibility'}

df['accessibility'] = df['cluster'].map(label_map)
print("\nFinal Classification:")
print(df['accessibility'].value_counts())

In [None]:
pip install geopandas folium

In [None]:
import geopandas as gpd
import folium
from folium import Choropleth

# Cap on the number of zones drawn on the map.
MAX_MARKERS = 5000
coord_cols = ['latitude', 'longitude']

# Only zones with real coordinates can be placed. Falling back to a fixed
# point would stack every marker on the same spot and make the map misleading.
missing_coords = [col for col in coord_cols if col not in df.columns]
if missing_coords:
    print(f"Warning: df has no {missing_coords} column(s); no zone markers will be drawn. "
          "Join zone coordinates onto df (by geo_code) before running this cell.")
    df_mappable = df.iloc[0:0]
else:
    df_mappable = df.dropna(subset=coord_cols)

# DataFrame.sample() raises ValueError when asked for more rows than exist,
# so only sub-sample when there are more zones than the cap.
if len(df_mappable) > MAX_MARKERS:
    df_sample = df_mappable.sample(MAX_MARKERS, random_state=42)
else:
    df_sample = df_mappable

m = folium.Map(location=[54.5, -3.5], zoom_start=6)

colors = {'High Accessibility': 'green', 
          'Medium Accessibility': 'orange', 
          'Low Accessibility': 'red'}

for idx, row in df_sample.iterrows():
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=3,
        color=colors[row['accessibility']],
        fill=True,
        popup=f"{row['geo_label']}<br>{row['accessibility']}"
    ).add_to(m)

m.save('accessibility_map.html')
print(f"Markers drawn: {len(df_sample)}")
print("Map saved!")

In [None]:

# Fixed level order with one colour per level. value_counts() orders by count,
# so passing a bare colour list would attach green/orange/red to whichever
# levels happen to be most frequent rather than to High/Medium/Low.
level_order = ['High Accessibility', 'Medium Accessibility', 'Low Accessibility']
level_colors = ['green', 'orange', 'red']

plt.figure(figsize=(10, 6))
level_counts = df['accessibility'].value_counts().reindex(level_order, fill_value=0)
level_counts.plot(kind='bar', color=level_colors)
plt.title('Zone Distribution by Accessibility')
plt.xlabel('Accessibility Level')
plt.ylabel('Number of Zones')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('accessibility_distribution.png')
plt.show()


# One box plot per 30-minute indicator, split by accessibility level.
features = ['employment_30', 'hospitals_30', 'school_primary_30', 
            'supermarket_30', 'gp_number_30']

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
panels = axes.ravel()  # row-major, same panel order as axes[idx//3, idx%3]
for idx, feat in enumerate(features):
    ax = panels[idx]
    df.boxplot(column=feat, by='accessibility', ax=ax)
    ax.set_title(feat)

# The grid has more panels than features; hide the unused ones.
for unused_ax in panels[len(features):]:
    unused_ax.set_visible(False)

# DataFrame.boxplot(by=...) adds an automatic "Boxplot grouped by ..." figure
# title that overlaps the panel titles; clear it.
fig.suptitle('')

plt.tight_layout()
plt.savefig('feature_comparison.png')
plt.show()

In [None]:

# Zones classified as low accessibility.
low_access = df[df['accessibility'] == 'Low Accessibility']

# Guard the percentage against an empty frame.
low_share = len(low_access) / len(df) * 100 if len(df) else 0.0
print(f"Low Accessibility Zones: {len(low_access)} ({low_share:.1f}%)")
print("\nTop 10 areas needing intervention:")
# A bare head(10) only shows the first ten rows in file order, which is not a
# ranking. Order the zones worst-first on the displayed indicators instead.
# NOTE(review): assumes lower indicator values mean worse access, and that
# employment_30 is the primary criterion -- confirm the intended ranking.
rank_cols = ['employment_30', 'hospitals_30', 'supermarket_30']
worst_first = low_access.sort_values(rank_cols, ascending=True)
print(worst_first[['geo_label'] + rank_cols].head(10))

# The export keeps the original row order of low_access.
low_access[['geo_code', 'geo_label', 'accessibility', 
            'employment_30', 'hospitals_30', 'school_primary_30']].to_csv('priority_zones.csv', index=False)
print("\nPriority zones exported to priority_zones.csv")

In [None]:
pip install xgboost

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# The classifiers learn to reproduce the k-means cluster ids from the scaled
# features. Stratify on the labels so train and test keep the same cluster
# proportions; an unstratified split can under-represent a small cluster in
# the test set and distort the accuracy figures.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, cluster_labels, test_size=0.2, random_state=42,
    stratify=cluster_labels,
)


rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_acc = accuracy_score(y_test, rf.predict(X_test))

xgb = XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)
xgb_acc = accuracy_score(y_test, xgb.predict(X_test))

print(f"Random Forest: {rf_acc:.4f}")
print(f"XGBoost: {xgb_acc:.4f}")


# Random-forest feature importances, most important first.
importances = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop Features:")
print(importances)

In [None]:
# Save model
import pickle
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open() passed to dump() is never explicitly closed.
with open('accessibility_model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

# Save final dataset
df[['geo_code', 'geo_label', 'accessibility', 'cluster']].to_csv('classified_zones.csv', index=False)

print("✓ Model saved")
print("✓ Results saved") 
print("\nProject deliverables:")
print("- accessibility_distribution.png")
print("- feature_comparison.png")
print("- priority_zones.csv")
print("- classified_zones.csv")
print("- accessibility_model.pkl")