<a href="https://colab.research.google.com/github/Tar-ive/txst_open_datathon_2025/blob/main/machine_learning_datathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load files

In [None]:
import pandas as pd
import numpy as np

In [None]:
students = pd.read_csv('students.csv')

In [None]:
students.dtypes

Unnamed: 0,0
Campus,object
Class,object
Department,object
Gender,object
Housing,object
Level,object
Semester,object
Year,float64


In [None]:
students['Semester'].value_counts()

Unnamed: 0_level_0,count
Semester,Unnamed: 1_level_1
Fall,111630
Spring,79579
Summer,26621


# Processing the data

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from datetime import datetime

class ParkingDataProcessor:
    def __init__(self):
        self.students_df = None
        self.dorms_df = None
        self.green_lots_df = None
        self.parking_proximity = None

    def load_data(self):
        """Load all required datasets"""
        try:
            self.students_df = pd.read_csv('students.csv')
            self.dorms_df = pd.read_csv('dorm.csv')
            self.green_lots_df = pd.read_csv('green_lots.csv')

            print("Data loaded successfully")
            self._print_data_info()

        except Exception as e:
            print(f"Error loading data: {str(e)}")

    def _print_data_info(self):
        """Print information about loaded datasets"""
        if self.students_df is not None:
            print("\n=== Students Data ===")
            print("Shape:", self.students_df.shape)
            print("Columns:", self.students_df.columns.tolist())
            print("Latest year:", self.students_df['Year'].max())
            print("Semesters:", self.students_df['Semester'].unique())

        if self.dorms_df is not None:
            print("\n=== Dorms Data ===")
            print("Shape:", self.dorms_df.shape)
            print("Columns:", self.dorms_df.columns.tolist())

        if self.green_lots_df is not None:
            print("\n=== Green Lots Data ===")
            print("Shape:", self.green_lots_df.shape)
            print("Columns:", self.green_lots_df.columns.tolist())

    def filter_latest_semesters(self):
        """Filter students data for the latest two semesters"""
        if self.students_df is None:
            print("Please load data first")
            return

        # Get the latest year
        latest_year = self.students_df['Year'].max()

        # Filter for the latest year
        latest_data = self.students_df[self.students_df['Year'] == latest_year]

        # Get the two most recent semesters
        semesters = latest_data['Semester'].unique()
        if len(semesters) >= 2:
            latest_semesters = sorted(semesters)[-2:]
        else:
            latest_semesters = semesters

        self.students_df = latest_data[latest_data['Semester'].isin(latest_semesters)]
        print(f"Filtered data for semesters: {latest_semesters} in year {latest_year}")
        print(f"Remaining records: {len(self.students_df)}")

    def calculate_dorm_density(self):
        """Calculate student density for each dorm"""
        if self.students_df is None or self.dorms_df is None:
            print("Please load all required data first")
            return

        # Print unique housing values to debug
        print("\nUnique Housing values in students_df:")
        print(self.students_df['Housing'].value_counts().head(10))

        print("\nUnique dorm names in dorms_df:")
        print(self.dorms_df['dorm_name'].tolist())

        # Count students per dorm with case-insensitive matching
        dorm_counts = (self.students_df['Housing']
                      .str.strip()
                      .str.lower()
                      .value_counts()
                      .reset_index())
        dorm_counts.columns = ['dorm_name_lower', 'student_count']

        # Add lowercase version of dorm names for matching
        self.dorms_df['dorm_name_lower'] = (self.dorms_df['dorm_name']
                                           .str.strip()
                                           .str.lower())

        # Merge with dorm information using lowercase names
        self.dorm_density = pd.merge(
            self.dorms_df,
            dorm_counts,
            left_on='dorm_name_lower',
            right_on='dorm_name_lower',
            how='left'
        )

        # Print merge results
        print("\nMerge results:")
        print("Dorms with zero students:")
        print(self.dorm_density[self.dorm_density['student_count'] == 0]['dorm_name'].tolist())
        print("\nDorms with students:")
        print(self.dorm_density[self.dorm_density['student_count'] > 0][['dorm_name', 'student_count']].to_string())

        # Fill missing values with 0
        self.dorm_density['student_count'] = self.dorm_density['student_count'].fillna(0)

        # Calculate density (students/capacity)
        self.dorm_density['density'] = self.dorm_density['student_count'] / self.dorm_density['capacity']

        print("\n=== Dorm Density Statistics ===")
        print(self.dorm_density[['dorm_name', 'student_count', 'density']].describe())

        return self.dorm_density

    def parse_parking_proximity(self, proximity_data):
        """Parse parking proximity data and calculate parking penalty"""
        self.parking_proximity = proximity_data
        # TODO: Implement parking proximity parsing and penalty calculation

    def run_kmeans_clustering(self, n_clusters=5):
        """Run KMeans clustering on dorm locations"""
        if self.dorm_density is None:
            print("Please calculate dorm density first")
            return

        # Prepare data for clustering
        X = self.dorm_density[['latitude', 'longitude']].values

        # Run KMeans
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        self.dorm_density['cluster'] = kmeans.fit_predict(X)

        # Calculate cluster centers
        self.cluster_centers = pd.DataFrame(
            kmeans.cluster_centers_,
            columns=['latitude', 'longitude']
        )

        print("\n=== Clustering Results ===")
        print("Cluster sizes:")
        print(self.dorm_density['cluster'].value_counts())

        return self.dorm_density, self.cluster_centers


processor = ParkingDataProcessor()
processor.load_data()
processor.filter_latest_semesters()
processor.calculate_dorm_density()
processor.run_kmeans_clustering()

  self.students_df = pd.read_csv('students.csv')


Data loaded successfully

=== Students Data ===
Shape: (980540, 8)
Columns: ['Campus', 'Class', 'Department', 'Gender', 'Housing', 'Level', 'Semester', 'Year']
Latest year: 2024
Semesters: ['Summer' 'Fall' 'Spring']

=== Dorms Data ===
Shape: (25, 6)
Columns: ['dorm_name', 'capacity', 'latitude', 'longitude', 'formatted_address', 'place_id']

=== Green Lots Data ===
Shape: (21, 4)
Columns: ['Location', 'Latitude', 'Longitude', 'Spaces']
Filtered data for semesters: ['Spring', 'Summer'] in year 2024
Remaining records: 47327

Unique Housing values in students_df:
Housing
Off-Campus                   39440
Blanco Hall                    746
Bobcat Village Apartments      628
Vistas Apartments              541
San Jacinto Hall               458
San Marcos Hall                410
Sanctuary Lofts Apartment      406
Jackson Hall                   402
Tower Hall                     394
Sterry Hall                    363
Name: count, dtype: int64

Unique dorm names in dorms_df:
['Alamito', 'Bal

(                    dorm_name  capacity   latitude  longitude  \
 0                     Alamito       505  29.890707 -97.942142   
 1                    Balcones         0  29.888411 -97.938351   
 2                       Bexar       202  29.886377 -97.948098   
 3                      Blanco       594  29.887135 -97.951914   
 4   Bobcat Village Apartments         0  29.893137 -97.922257   
 5                     Brogdon       115  29.888299 -97.939900   
 6                      Butler       236  29.887039 -97.938969   
 7                  Chautauqua       306  29.890471 -97.946585   
 8                      Cibilo       501  29.890119 -97.942420   
 9                 College Inn       280  29.889331 -97.946390   
 10                    Cypress         0  29.888411 -97.938351   
 11        Elena Zamora O'Shea       318  29.890032 -97.953863   
 12                      Falls       286  29.886373 -97.950926   
 13         First Five Freedom       280  29.889315 -97.953551   
 14       

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# First create the data processor to get our initial data
class DataProcessor:
    def __init__(self):
        self.students_df = None
        self.dorms_df = None
        self.green_lots_df = None

    def load_data(self):
        """Load all required datasets"""
        try:
            self.students_df = pd.read_csv('students.csv', low_memory=False)
            self.dorms_df = pd.read_csv('dorm.csv')
            self.green_lots_df = pd.read_csv('green_lots.csv')
            return True
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            return False

    def process_data(self):
        """Process and combine all data"""
        if self.students_df is None or self.dorms_df is None:
            print("Please load data first")
            return None

        # Get latest year data
        latest_year = self.students_df['Year'].max()
        latest_data = self.students_df[self.students_df['Year'] == latest_year]

        # Clean housing names
        def clean_name(name):
            if pd.isna(name): return name
            return str(name).lower().replace(' hall', '').replace(' apartments', '').replace(' apartment', '').strip()

        # Get student counts
        latest_data['Housing_clean'] = latest_data['Housing'].apply(clean_name)
        housing_counts = latest_data['Housing_clean'].value_counts().reset_index()
        housing_counts.columns = ['dorm_name_clean', 'student_count']

        # Prepare dorms data
        dorm_data = self.dorms_df.copy()
        dorm_data['dorm_name_clean'] = dorm_data['dorm_name'].apply(clean_name)

        # Merge with student counts
        dorm_data = pd.merge(
            dorm_data,
            housing_counts,
            left_on='dorm_name_clean',
            right_on='dorm_name_clean',
            how='left'
        )

        # Fill NaN student counts with 0
        dorm_data['student_count'] = dorm_data['student_count'].fillna(0)

        # Perform clustering
        coords = dorm_data[['latitude', 'longitude']].values
        kmeans = KMeans(n_clusters=5, random_state=42)
        dorm_data['cluster'] = kmeans.fit_predict(coords)

        return dorm_data

# Then use the organizer to separate the data
class DormDataOrganizer:
    def __init__(self):
        self.matched_dorms = pd.DataFrame()
        self.unmatched_dorms = pd.DataFrame()
        self.clusters = pd.DataFrame()

    def organize_data(self, dorm_data):
        """Organize dorm data into different categories"""
        # Filter matched dorms (those with students)
        self.matched_dorms = dorm_data[dorm_data['student_count'] > 0].copy()

        # Filter unmatched dorms
        self.unmatched_dorms = dorm_data[dorm_data['student_count'] == 0].copy()

        # Create cluster centers dataframe
        self.clusters = dorm_data.groupby('cluster').agg({
            'latitude': 'mean',
            'longitude': 'mean',
            'student_count': 'sum'
        }).reset_index()

        # Print summary statistics
        self._print_summary()

    def _print_summary(self):
        """Print summary of the organized data"""
        print("\n=== Matched Dorms (With Students) ===")
        if not self.matched_dorms.empty:
            print(self.matched_dorms[['dorm_name', 'student_count', 'cluster']].to_string())

        print("\n=== Unmatched Dorms (No Students) ===")
        if not self.unmatched_dorms.empty:
            print(self.unmatched_dorms[['dorm_name', 'cluster']].to_string())

        print("\n=== Cluster Statistics ===")
        if not self.clusters.empty:
            print(self.clusters.to_string())

    def get_visualization_data(self):
        """Prepare data for visualization"""
        return {
            'matched_dorms': self.matched_dorms.to_dict('records'),
            'unmatched_dorms': self.unmatched_dorms.to_dict('records'),
            'clusters': self.clusters.to_dict('records')
        }

# Usage example:
processor = DataProcessor()
if processor.load_data():
    dorm_data = processor.process_data()
    if dorm_data is not None:
        organizer = DormDataOrganizer()
        organizer.organize_data(dorm_data)
        viz_data = organizer.get_visualization_data()


=== Matched Dorms (With Students) ===
                    dorm_name  student_count  cluster
0                     Alamito          505.0        4
1                    Balcones          430.0        1
2                       Bexar          393.0        3
3                      Blanco         1556.0        3
4   Bobcat Village Apartments         1262.0        2
5                     Brogdon          227.0        1
6                      Butler          476.0        1
7                  Chautauqua          609.0        0
9                 College Inn          564.0        0
10                    Cypress          555.0        1
12                      Falls          582.0        3
15                    Jackson          819.0        0
16                    Lantana          482.0        1
17                     Laurel          279.0        1
18                   Mesquite          167.0        1
19                     Retama          259.0        4
20                San Jacinto          925.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest_data['Housing_clean'] = latest_data['Housing'].apply(clean_name)


# Visualize clusters

In [None]:
green_lots_df = pd.read_csv('green_lots.csv')

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

def save_processed_data(dorm_data):
    """
    Split and save the processed data into separate dataframes
    """
    # Split into matched and unmatched dorms
    matched_dorms = dorm_data[dorm_data['student_count'] > 0].copy()
    unmatched_dorms = dorm_data[dorm_data['student_count'] == 0].copy()

    # Calculate cluster centers
    clusters = dorm_data.groupby('cluster').agg({
        'latitude': 'mean',
        'longitude': 'mean',
        'student_count': 'sum'
    }).reset_index()

    # Calculate student density (students per dorm)
    matched_dorms['density'] = matched_dorms['student_count'] / matched_dorms['capacity']
    matched_dorms['density'] = matched_dorms['density'].replace([np.inf, -np.inf], 1.0)

    return matched_dorms, unmatched_dorms, clusters

def calculate_parking_proximity(dorms_df, parking_df):
    """
    Calculate parking proximity for each dorm
    """
    def haversine_distance(lat1, lon1, lat2, lon2):
        """Calculate the distance between two points in kilometers"""
        R = 6371  # Earth's radius in kilometers

        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1

        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))
        return R * c

    # Create a DataFrame to store proximity information
    proximity_data = []

    # Calculate distances to all parking lots for each dorm
    for _, dorm in dorms_df.iterrows():
        distances = []
        for _, lot in parking_df.iterrows():
            distance = haversine_distance(
                dorm['latitude'], dorm['longitude'],
                lot['Latitude'], lot['Longitude']
            )
            distances.append({
                'dorm_name': dorm['dorm_name'],
                'parking_lot': lot['Location'],
                'distance_km': distance,
                'spaces': lot['Spaces']
            })

        # Sort distances and get the closest lots
        distances.sort(key=lambda x: x['distance_km'])
        closest_lots = distances[:3]  # Get top 3 closest lots

        # Calculate proximity score (weighted by number of spaces and distance)
        proximity_score = sum(lot['spaces'] / (lot['distance_km'] + 0.1) for lot in closest_lots)

        proximity_data.append({
            'dorm_name': dorm['dorm_name'],
            'closest_lots': [lot['parking_lot'] for lot in closest_lots],
            'closest_distances': [lot['distance_km'] for lot in closest_lots],
            'total_nearby_spaces': sum(lot['spaces'] for lot in closest_lots),
            'proximity_score': proximity_score
        })

    return pd.DataFrame(proximity_data)

def calculate_desirability(dorms_df, proximity_df):
    """
    Calculate desirability score for each dorm
    Components:
    1. Student density (normalized)
    2. Proximity score (normalized)
    3. Parking penalty (based on existing spaces)
    """
    # Merge dorm data with proximity data
    desirability_df = pd.merge(dorms_df, proximity_df, on='dorm_name', how='left')

    # Normalize student density
    desirability_df['density_norm'] = (desirability_df['density'] - desirability_df['density'].min()) / \
                                    (desirability_df['density'].max() - desirability_df['density'].min())

    # Normalize proximity score
    desirability_df['proximity_norm'] = (desirability_df['proximity_score'] - desirability_df['proximity_score'].min()) / \
                                      (desirability_df['proximity_score'].max() - desirability_df['proximity_score'].min())

    # Calculate parking penalty (inverse of total nearby spaces, normalized)
    max_spaces = desirability_df['total_nearby_spaces'].max()
    desirability_df['parking_penalty'] = 1 - (desirability_df['total_nearby_spaces'] / max_spaces)

    # Calculate final desirability score
    # Higher score = more desirable for new parking
    desirability_df['desirability_score'] = (
        (0.4 * desirability_df['density_norm']) +  # Weight for student density
        (0.4 * desirability_df['proximity_norm']) +  # Weight for location proximity
        (0.2 * desirability_df['parking_penalty'])   # Weight for parking penalty
    )

    return desirability_df

# Main execution
def process_and_save_all(dorm_data, green_lots_df):
    """
    Process all data and save results
    """
    # Step 1: Split and save initial processed data
    matched_dorms, unmatched_dorms, clusters = save_processed_data(dorm_data)
    print("\nInitial data processing complete")
    print(f"Matched dorms: {len(matched_dorms)}")
    print(f"Unmatched dorms: {len(unmatched_dorms)}")
    print(f"Clusters: {len(clusters)}")

    # Step 2: Calculate parking proximity
    proximity_df = calculate_parking_proximity(matched_dorms, green_lots_df)
    print("\nParking proximity calculated")
    print("Sample proximity scores:")
    print(proximity_df[['dorm_name', 'proximity_score']].head())

    # Step 3: Calculate desirability
    desirability_df = calculate_desirability(matched_dorms, proximity_df)
    print("\nDesirability scores calculated")
    print("\nTop 5 dorms by desirability:")
    print(desirability_df.nlargest(5, 'desirability_score')[
        ['dorm_name', 'density_norm', 'proximity_norm', 'parking_penalty', 'desirability_score']
    ])

    return {
        'matched_dorms': matched_dorms,
        'unmatched_dorms': unmatched_dorms,
        'clusters': clusters,
        'proximity': proximity_df,
        'desirability': desirability_df
    }

# Usage
results = process_and_save_all(dorm_data, green_lots_df)


Initial data processing complete
Matched dorms: 21
Unmatched dorms: 4
Clusters: 5

Parking proximity calculated
Sample proximity scores:
                   dorm_name  proximity_score
0                    Alamito      2729.125201
1                   Balcones      1614.213376
2                      Bexar      3683.949491
3                     Blanco      1709.215036
4  Bobcat Village Apartments       251.577795

Desirability scores calculated

Top 5 dorms by desirability:
      dorm_name  density_norm  proximity_norm  parking_penalty  \
16  San Jacinto      0.600348        1.000000         0.230137   
3        Blanco      1.000000        0.365328         0.332877   
19  Sterry Hall      0.609142        0.730285         0.293151   
2         Bexar      0.583839        0.860256         0.071233   
20        Tower      0.554864        0.679042         0.300000   

    desirability_score  
16            0.686167  
3             0.612706  
19            0.594401  
2             0.591885  
20

In [None]:
results

{'matched_dorms':                     dorm_name  capacity   latitude  longitude  \
 0                     Alamito       505  29.890707 -97.942142   
 1                    Balcones         0  29.888411 -97.938351   
 2                       Bexar       202  29.886377 -97.948098   
 3                      Blanco       594  29.887135 -97.951914   
 4   Bobcat Village Apartments         0  29.893137 -97.922257   
 5                     Brogdon       115  29.888299 -97.939900   
 6                      Butler       236  29.887039 -97.938969   
 7                  Chautauqua       306  29.890471 -97.946585   
 9                 College Inn       280  29.889331 -97.946390   
 10                    Cypress         0  29.888411 -97.938351   
 12                      Falls       286  29.886373 -97.950926   
 15                    Jackson       423  29.889918 -97.944548   
 16                    Lantana       239  29.887011 -97.939732   
 17                     Laurel       141  29.887615 -97.940

In [None]:
matched_dorms = results['matched_dorms']
unmatched_dorms = results['unmatched_dorms']
clusters = results['clusters']
proximity_df = results['proximity']
desirability_df = results['desirability']

In [None]:
print(matched_dorms.columns)


Index(['dorm_name', 'capacity', 'latitude', 'longitude', 'formatted_address',
       'place_id', 'dorm_name_clean', 'student_count', 'cluster', 'density'],
      dtype='object')


In [None]:
desirability_df.columns

Index(['dorm_name', 'capacity', 'latitude', 'longitude', 'formatted_address',
       'place_id', 'dorm_name_clean', 'student_count', 'cluster', 'density',
       'closest_lots', 'closest_distances', 'total_nearby_spaces',
       'proximity_score', 'density_norm', 'proximity_norm', 'parking_penalty',
       'desirability_score'],
      dtype='object')

In [None]:
import pandas as pd

# Assuming matched_dorms and desirability_df are already loaded
# Merge the two DataFrames based on the 'dorm_name' column
merged_df = pd.merge(matched_dorms, desirability_df[['dorm_name', 'desirability_score']], on='dorm_name', how='left')

# Create color based on desirability_score
merged_df['color'] = merged_df['desirability_score'].apply(
    lambda score: f'{int(255 * (1 - score)):02x}{int(255 * score):02x}00' if pd.notna(score) else '808080'
)

# Save the final DataFrame to a CSV file
merged_df.to_csv('final_dorm_data.csv', index=False)

# Optional: Verify the first few rows of the final DataFrame
print(merged_df.head())


                   dorm_name  capacity   latitude  longitude  \
0                    Alamito       505  29.890707 -97.942142   
1                   Balcones         0  29.888411 -97.938351   
2                      Bexar       202  29.886377 -97.948098   
3                     Blanco       594  29.887135 -97.951914   
4  Bobcat Village Apartments         0  29.893137 -97.922257   

                                   formatted_address  \
0         102 Russell Cir, San Marcos, TX 78666, USA   
1       601 University Dr, San Marcos, TX 78666, USA   
2           100 Llano Cir, San Marcos, TX 78666, USA   
3            701 Moore St, San Marcos, TX 78666, USA   
4  1301 Aquarena Springs Dr, San Marcos, TX 78666...   

                      place_id dorm_name_clean  student_count  cluster  \
0  ChIJXdq6QwCpXIYR3Llz2hvdYSY         alamito          505.0        4   
1  ChIJ3yStNnCoXIYRkulAp54VF3E        balcones          430.0        1   
2  ChIJ7Xcbw2-oXIYR1YezPAGFypY           bexar          

In [None]:
import folium
import pandas as pd

# Assuming 'merged_df' contains the necessary columns: ['dorm_name', 'latitude', 'longitude', 'color']

# Initialize the map centered on a campus location
m = folium.Map(location=[29.889, -97.942], zoom_start=15, tiles='cartodbpositron')

# Add dorm locations to the map
for _, row in merged_df.iterrows():
    # Extract necessary data from the row
    dorm_name = row['dorm_name']
    lat = row['latitude']
    lon = row['longitude']
    color = row['color']

    # Add a marker for each dorm
    folium.CircleMarker(
        location=[lat, lon],
        radius=8,  # Size of the marker
        color='#' + color,  # Color from desirability_score
        fill=True,
        fill_color='#' + color,
        fill_opacity=0.7,
        popup=dorm_name
    ).add_to(m)

# Save map as HTML
m.save('campus_dorms_map.html')

# Optionally, if using Jupyter, you can display the map directly
m


In [None]:
print(merged_df.columns)


Index(['dorm_name', 'capacity', 'latitude', 'longitude', 'formatted_address',
       'place_id', 'dorm_name_clean', 'student_count', 'cluster', 'density',
       'desirability_score', 'color'],
      dtype='object')


In [None]:
import folium
from folium import plugins
import pandas as pd
import numpy as np

def create_enhanced_campus_map(merged_df, clusters, green_lots_df):
    """
    Create an enhanced interactive map showing dorms, clusters, and parking lots
    with improved visual elements
    """
    # Initialize map
    m = folium.Map(
        location=[29.889, -97.942],
        zoom_start=15,
        tiles='cartodbpositron'
    )

    # Define color scheme
    cluster_colors = {
        0: '#FF6B6B',  # Red
        1: '#4ECDC4',  # Teal
        2: '#45B7D1',  # Blue
        3: '#96CEB4',  # Green
        4: '#FFEEAD'   # Yellow
    }

    # Create feature groups for different layers
    cluster_group = folium.FeatureGroup(name='Clusters')
    dorm_group = folium.FeatureGroup(name='Dorms')
    parking_group = folium.FeatureGroup(name='Parking Lots')

    # Add cluster areas
    for idx, cluster in clusters.iterrows():
        folium.Circle(
            location=[cluster['latitude'], cluster['longitude']],
            radius=200,  # meters
            color=cluster_colors[cluster['cluster']],
            fill=True,
            fill_opacity=0.2,
            weight=2,
            dash_array='5',
            popup=f'Cluster {cluster["cluster"]}<br>Total Students: {cluster["student_count"]}',
        ).add_to(cluster_group)

    # Add dorms with enhanced markers
    for idx, row in merged_df.iterrows():
        # Calculate marker size based on student count
        student_count = row['student_count']
        radius = np.sqrt(student_count) / 3 if student_count > 0 else 5

        # Calculate color based on desirability score
        desirability = row['desirability_score']
        color = f'#{int(255 * (1-desirability)):02x}{int(255 * desirability):02x}00'

        # Create detailed popup content
        popup_html = f"""
            <div style="font-family: Arial; min-width: 200px;">
                <h4 style="margin: 0;">{row['dorm_name']}</h4>
                <hr style="margin: 5px 0;">
                <p style="margin: 5px 0;">
                    <b>Students:</b> {int(student_count)}<br>
                    <b>Cluster:</b> {row['cluster']}<br>
                    <b>Desirability Score:</b> {desirability:.3f}<br>
                    <b>Density:</b> {row.get('density', 'N/A')}<br>
                    <b>Proximity Score:</b> {row.get('proximity_score', 'N/A')}
                </p>
            </div>
        """

        # Add dorm marker
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=radius,
            color='black',
            weight=2,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            popup=folium.Popup(popup_html, max_width=300),
            tooltip=f"{row['dorm_name']} ({int(student_count)} students)"
        ).add_to(dorm_group)

    # Add parking lots
    for idx, lot in green_lots_df.iterrows():
        folium.CircleMarker(
            location=[lot['Latitude'], lot['Longitude']],
            radius=8,
            color='blue',
            fill=True,
            fill_opacity=0.6,
            weight=2,
            popup=f"Parking Lot: {lot['Location']}<br>Spaces: {lot['Spaces']}",
            tooltip=lot['Location']
        ).add_to(parking_group)

    # Add all feature groups to map
    cluster_group.add_to(m)
    parking_group.add_to(m)
    dorm_group.add_to(m)

    # Add layer control
    folium.LayerControl().add_to(m)

    # Add legend
    legend_html = """
        <div style="position: fixed; bottom: 50px; right: 50px; width: 250px;
                    z-index:9999; background-color: white; padding: 10px; border: 2px solid gray;">
            <h4 style="margin-top: 0;">Legend</h4>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: #00ff00; margin-right: 5px;"></span>
                High Desirability
            </div>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: #ff0000; margin-right: 5px;"></span>
                Low Desirability
            </div>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: blue; margin-right: 5px;"></span>
                Parking Lot
            </div>
            <hr style="margin: 5px 0;">
            <div style="font-size: 12px;">
                • Circle size indicates student population<br>
                • Color indicates desirability score<br>
                • Colored areas show cluster regions<br>
                • Use layers control to toggle visibility
            </div>
        </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

# Usage
def visualize_campus_data(merged_df, clusters, green_lots_df):
    """
    Create and save the visualization
    """
    # Create the map
    campus_map = create_enhanced_campus_map(merged_df, clusters, green_lots_df)

    # Save the map
    campus_map.save('campus_analysis.html')

    # For Colab display
    from IPython.display import IFrame
    return IFrame(src='campus_analysis.html', width=800, height=600)

# Example usage:
visualization = visualize_campus_data(merged_df, clusters, green_lots_df)
display(visualization)

# generating recommendations


In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

def recommend_parking_locations(merged_df, green_lots_df, clusters, n_recommendations=3):
    """
    Generate recommended parking locations based on:
    1. High desirability areas
    2. Distance from existing parking
    3. Cluster centroids
    """

    def haversine_distance(lat1, lon1, lat2, lon2):
        """Calculate distance between points in km"""
        R = 6371  # Earth's radius in kilometers

        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1

        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))
        return R * c

    # Get existing parking locations
    existing_parking = green_lots_df[['Latitude', 'Longitude']].values

    # Get high desirability dorms
    high_desirability = merged_df.nlargest(5, 'desirability_score')

    # Generate potential spots around high desirability dorms
    potential_spots = []
    for _, dorm in high_desirability.iterrows():
        # Generate points in a grid around the dorm
        lat_offsets = np.linspace(-0.001, 0.001, 5)
        lon_offsets = np.linspace(-0.001, 0.001, 5)

        for lat_offset in lat_offsets:
            for lon_offset in lon_offsets:
                spot = {
                    'latitude': dorm['latitude'] + lat_offset,
                    'longitude': dorm['longitude'] + lon_offset,
                    'base_score': dorm['desirability_score'],
                    'nearest_dorm': dorm['dorm_name']
                }

                # Calculate minimum distance to existing parking
                min_dist = float('inf')
                for _, lot in green_lots_df.iterrows():
                    dist = haversine_distance(
                        spot['latitude'], spot['longitude'],
                        lot['Latitude'], lot['Longitude']
                    )
                    min_dist = min(min_dist, dist)

                spot['distance_to_parking'] = min_dist

                # Only consider spots that are at least 0.2 km from existing parking
                if min_dist > 0.2:
                    potential_spots.append(spot)

    # Score potential spots
    for spot in potential_spots:
        # Calculate average distance to high desirability dorms
        dorm_distances = []
        for _, dorm in high_desirability.iterrows():
            dist = haversine_distance(
                spot['latitude'], spot['longitude'],
                dorm['latitude'], dorm['longitude']
            )
            dorm_distances.append(dist)

        spot['avg_distance_to_dorms'] = np.mean(dorm_distances)

        # Final score combines base desirability, distance to parking, and accessibility
        spot['final_score'] = (
            0.4 * spot['base_score'] +
            0.3 * min(1.0, spot['distance_to_parking']) +
            0.3 * (1 / (1 + spot['avg_distance_to_dorms']))
        )

    # Sort spots by final score and get top recommendations
    recommendations = sorted(potential_spots, key=lambda x: x['final_score'], reverse=True)[:n_recommendations]

    # Convert to DataFrame
    recommendations_df = pd.DataFrame(recommendations)

    # Add evaluation metrics
    recommendations_df['nearby_dorms'] = recommendations_df.apply(
        lambda x: [d['dorm_name'] for d in high_desirability.to_dict('records')
                  if haversine_distance(x['latitude'], x['longitude'],
                                      d['latitude'], d['longitude']) < 0.5],
        axis=1
    )

    return recommendations_df

def visualize_recommendations(merged_df, green_lots_df, recommendations_df):
    """Create a map with recommended parking locations"""
    m = folium.Map(
        location=[29.889, -97.942],
        zoom_start=15,
        tiles='cartodbpositron'
    )

    # Add existing dorms
    for _, dorm in merged_df.iterrows():
        folium.CircleMarker(
            location=[dorm['latitude'], dorm['longitude']],
            radius=8,
            color='blue',
            fill=True,
            popup=f"{dorm['dorm_name']}<br>Desirability: {dorm['desirability_score']:.3f}",
            tooltip=dorm['dorm_name']
        ).add_to(m)

    # Add existing parking
    for _, lot in green_lots_df.iterrows():
        folium.CircleMarker(
            location=[lot['Latitude'], lot['Longitude']],
            radius=6,
            color='gray',
            fill=True,
            popup=f"Existing Parking: {lot['Location']}<br>Spaces: {lot['Spaces']}",
            tooltip=f"Parking: {lot['Location']}"
        ).add_to(m)

    # Add recommended spots
    for idx, spot in recommendations_df.iterrows():
        popup_html = f"""
            <div style="font-family: Arial; min-width: 200px;">
                <h4 style="margin: 0;">Recommended Spot {idx + 1}</h4>
                <hr style="margin: 5px 0;">
                <p style="margin: 5px 0;">
                    <b>Score:</b> {spot['final_score']:.3f}<br>
                    <b>Nearest Dorm:</b> {spot['nearest_dorm']}<br>
                    <b>Distance to Parking:</b> {spot['distance_to_parking']:.3f} km<br>
                    <b>Nearby Dorms:</b> {', '.join(spot['nearby_dorms'])}
                </p>
            </div>
        """

        folium.CircleMarker(
            location=[spot['latitude'], spot['longitude']],
            radius=10,
            color='red',
            fill=True,
            fill_color='red',
            fill_opacity=0.7,
            popup=folium.Popup(popup_html, max_width=300),
            tooltip=f"Recommended Spot {idx + 1}"
        ).add_to(m)

    # Add legend
    legend_html = """
        <div style="position: fixed; bottom: 50px; right: 50px; width: 200px;
                    z-index:9999; background-color: white; padding: 10px; border: 2px solid gray;">
            <h4 style="margin-top: 0;">Legend</h4>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: red; margin-right: 5px;"></span>
                Recommended Spots
            </div>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: blue; margin-right: 5px;"></span>
                Existing Dorms
            </div>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: gray; margin-right: 5px;"></span>
                Existing Parking
            </div>
        </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

# Usage
recommendations_df = recommend_parking_locations(merged_df, green_lots_df, clusters)
recommendations_map = visualize_recommendations(merged_df, green_lots_df, recommendations_df)
recommendations_map.save('parking_recommendations.html')

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate distance between points in km"""
    R = 6371  # Earth's radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def analyze_potential_spots(merged_df, green_lots_df, min_distance_from_parking=0.15):
    """
    Analyze and debug potential parking spot recommendations
    """
    def haversine_distance(lat1, lon1, lat2, lon2):
        R = 6371  # Earth's radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))
        return R * c

    # Sort dorms by desirability
    high_desirability_dorms = merged_df.nlargest(10, 'desirability_score')

    # Debug prints
    print("Top 10 High Desirability Dorms:")
    print(high_desirability_dorms[['dorm_name', 'desirability_score', 'student_count']].to_string())
    print("\nExisting Parking Lots:")
    print(green_lots_df[['Location', 'Spaces']].to_string())

    # Generate potential spots
    potential_spots = []
    grid_size = 7  # Increase grid size

    # Generate spots around high desirability dorms
    for _, dorm in high_desirability_dorms.iterrows():
        # Create a grid of points around each dorm
        lat_offsets = np.linspace(-0.002, 0.002, grid_size)  # Increased range
        lon_offsets = np.linspace(-0.002, 0.002, grid_size)  # Increased range

        for lat_offset in lat_offsets:
            for lon_offset in lon_offsets:
                lat = dorm['latitude'] + lat_offset
                lon = dorm['longitude'] + lon_offset

                # Check distance to existing parking
                min_parking_distance = float('inf')
                for _, lot in green_lots_df.iterrows():
                    dist = haversine_distance(lat, lon, lot['Latitude'], lot['Longitude'])
                    min_parking_distance = min(min_parking_distance, dist)

                # Calculate distance to all high desirability dorms
                dorm_distances = []
                nearby_dorms = []
                for _, other_dorm in high_desirability_dorms.iterrows():
                    dist = haversine_distance(lat, lon,
                                           other_dorm['latitude'],
                                           other_dorm['longitude'])
                    dorm_distances.append(dist)
                    if dist < 0.5:  # Within 500m
                        nearby_dorms.append(other_dorm['dorm_name'])

                # Only consider spots that are:
                # 1. Far enough from existing parking
                # 2. Close to at least one dorm
                if min_parking_distance > min_distance_from_parking and len(nearby_dorms) > 0:
                    avg_dorm_distance = np.mean(dorm_distances)
                    spot = {
                        'latitude': lat,
                        'longitude': lon,
                        'distance_to_parking': min_parking_distance,
                        'avg_distance_to_dorms': avg_dorm_distance,
                        'nearby_dorms': nearby_dorms,
                        'num_nearby_dorms': len(nearby_dorms),
                        'base_score': dorm['desirability_score'],
                        'closest_dorm': dorm['dorm_name']
                    }

                    # Calculate final score
                    spot['final_score'] = (
                        0.3 * spot['base_score'] +  # Base desirability
                        0.3 * (1 / (1 + spot['avg_distance_to_dorms'])) +  # Proximity to dorms
                        0.2 * min(1.0, spot['distance_to_parking']) +  # Distance from existing parking
                        0.2 * (spot['num_nearby_dorms'] / 5)  # Number of dorms served
                    )

                    potential_spots.append(spot)

    # Convert to DataFrame and sort
    spots_df = pd.DataFrame(potential_spots)
    if len(spots_df) > 0:
        spots_df = spots_df.sort_values('final_score', ascending=False)

        print(f"\nFound {len(spots_df)} potential spots")
        print("\nTop 5 Recommended Spots:")
        print(spots_df.head()[['closest_dorm', 'num_nearby_dorms', 'distance_to_parking', 'final_score']].to_string())
    else:
        print("\nNo valid spots found! Adjusting constraints might be necessary.")

    return spots_df

def get_top_recommendations(spots_df, n=3, min_distance_between=0.1):
    """
    Get top N recommendations ensuring minimum distance between recommendations
    """
    if len(spots_df) == 0:
        return pd.DataFrame()

    recommendations = []
    remaining_spots = spots_df.copy()

    while len(recommendations) < n and len(remaining_spots) > 0:
        # Get highest scoring remaining spot
        best_spot = remaining_spots.iloc[0]
        recommendations.append(best_spot)

        # Filter out spots that are too close to this recommendation
        if len(remaining_spots) > 1:
            distances = remaining_spots.apply(lambda x: haversine_distance(
                x['latitude'], x['longitude'],
                best_spot['latitude'], best_spot['longitude']
            ), axis=1)

            remaining_spots = remaining_spots[distances > min_distance_between]

    return pd.DataFrame(recommendations)

# Debug usage
spots_df = analyze_potential_spots(merged_df, green_lots_df)
recommendations = get_top_recommendations(spots_df)

print("\nFinal Recommendations:")
if len(recommendations) > 0:
    print(recommendations[['closest_dorm', 'num_nearby_dorms', 'final_score']].to_string())
else:
    print("No valid recommendations found!")

Top 10 High Desirability Dorms:
      dorm_name  desirability_score  student_count
16  San Jacinto            0.686167          925.0
3        Blanco            0.612706         1556.0
19  Sterry Hall            0.594401          737.0
2         Bexar            0.591885          393.0
20        Tower            0.553562          824.0
17   San Marcos            0.515891          830.0
12      Lantana            0.498576          482.0
10        Falls            0.485689          582.0
18       Sayers            0.480436          588.0
6        Butler            0.471555          476.0

Existing Parking Lots:
                      Location  Spaces
0               R56-SHC Thorpe      10
1              301-Llano Drive      95
2              202-College Inn      30
3              115-Residential      85
4        Edward Gary St Garage     196
5         112-San Jacinto Hall      60
6               105-Admissions      35
7                  204-Jackson      46
8           304-Academy Garage  

In [None]:
import folium
from folium import plugins

def visualize_recommendations_enhanced(merged_df, green_lots_df, recommendations):
    """Create an enhanced visualization of parking recommendations"""

    # Initialize the map
    m = folium.Map(
        location=[29.889, -97.942],
        zoom_start=15,
        tiles='cartodbpositron'
    )

    # Add existing dorms with size based on desirability
    for _, dorm in merged_df.iterrows():
        # Calculate circle size based on desirability
        radius = 8 + (dorm['desirability_score'] * 10)

        folium.CircleMarker(
            location=[dorm['latitude'], dorm['longitude']],
            radius=radius,
            color='blue',
            fill=True,
            fill_opacity=0.6,
            popup=f"""
                <div style='font-family: Arial'>
                    <b>{dorm['dorm_name']}</b><br>
                    Desirability: {dorm['desirability_score']:.3f}<br>
                    Students: {int(dorm['student_count'])}
                </div>
            """,
            tooltip=dorm['dorm_name']
        ).add_to(m)

    # Add existing parking lots
    parking_group = folium.FeatureGroup(name='Existing Parking')
    for _, lot in green_lots_df.iterrows():
        folium.CircleMarker(
            location=[lot['Latitude'], lot['Longitude']],
            radius=6,
            color='gray',
            fill=True,
            popup=f"Parking: {lot['Location']}<br>Spaces: {lot['Spaces']}",
            tooltip=f"Parking: {lot['Location']}"
        ).add_to(parking_group)
    parking_group.add_to(m)

    # Add recommended spots with detailed information
    if len(recommendations) > 0:
        for idx, spot in recommendations.iterrows():
            # Create circular area of influence
            folium.Circle(
                location=[spot['latitude'], spot['longitude']],
                radius=200,  # 200m radius
                color='red',
                fill=True,
                fill_opacity=0.1
            ).add_to(m)

            # Create marker for the spot
            popup_html = f"""
                <div style='font-family: Arial; min-width: 200px;'>
                    <h4 style='margin: 0;'>Recommended Spot {idx + 1}</h4>
                    <hr style='margin: 5px 0;'>
                    <p style='margin: 5px 0;'>
                        <b>Score:</b> {spot['final_score']:.3f}<br>
                        <b>Nearby Dorms:</b> {len(spot['nearby_dorms'])}<br>
                        <b>Distance to Parking:</b> {spot['distance_to_parking']:.3f} km<br>
                        <b>Serves:</b> {', '.join(spot['nearby_dorms'][:3])}...
                    </p>
                </div>
            """

            folium.CircleMarker(
                location=[spot['latitude'], spot['longitude']],
                radius=10,
                color='red',
                fill=True,
                fill_opacity=0.7,
                popup=folium.Popup(popup_html, max_width=300),
                tooltip=f"Recommended Spot {idx + 1}"
            ).add_to(m)

    # Add heatmap of student density
    locations = merged_df[['latitude', 'longitude', 'student_count']].values.tolist()
    if locations:
        plugins.HeatMap(locations).add_to(m)

    # Add legend
    legend_html = """
        <div style="position: fixed; bottom: 50px; right: 50px; width: 200px;
                    z-index:9999; background-color: white; padding: 10px; border: 2px solid gray;">
            <h4 style="margin-top: 0;">Legend</h4>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: red; margin-right: 5px;"></span>
                Recommended Spots
            </div>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: blue; margin-right: 5px;"></span>
                Existing Dorms
            </div>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: gray; margin-right: 5px;"></span>
                Existing Parking
            </div>
        </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

# Create visualization
recommendation_map = visualize_recommendations_enhanced(merged_df, green_lots_df, recommendations)
recommendation_map.save('enhanced_parking_recommendations.html')

In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import folium
from folium import plugins

class ParkingRecommendationSystem:
    def __init__(self):
        self.consolidated_data = None
        self.merged_df = None
        self.parking_utilization = None

    def load_data(self, consolidated_file: str, merged_df: pd.DataFrame):
        """
        Load and prepare all data sources
        """
        # Load consolidated data
        self.consolidated_data = pd.read_excel(consolidated_file)
        self.consolidated_data = self.consolidated_data.rename(columns={
            'Name': 'dorm_name',
            'Bed Capacity': 'total_beds',
            'Spaces': 'total_parking',
            'Counter': 'used_parking'
        })

        # Merge with existing data
        self.merged_df = merged_df.merge(
            self.consolidated_data[['dorm_name', 'total_beds', 'total_parking', 'used_parking']],
            on='dorm_name',
            how='left'
        )

        # Calculate parking utilization metrics
        self.calculate_parking_metrics()

    def calculate_parking_metrics(self):
        """
        Calculate parking utilization and demand metrics
        """
        self.merged_df['parking_utilization'] = (
            self.merged_df['used_parking'] / self.merged_df['total_parking']
        ).fillna(0)

        # Calculate parking demand score
        self.merged_df['parking_demand'] = (
            (self.merged_df['student_count'] / self.merged_df['total_beds']) *
            (1 + self.merged_df['parking_utilization'])
        ).fillna(0)

        # Update desirability score with parking demand
        self.merged_df['enhanced_desirability'] = (
            0.3 * self.merged_df['desirability_score'] +
            0.4 * self.merged_df['parking_demand'] +
            0.3 * (1 - self.merged_df['parking_utilization'])
        )

    def recommend_parking_locations(self, n_recommendations=3, min_distance=0.2):
        """
        Generate parking recommendations considering utilization data
        """
        def haversine_distance(lat1, lon1, lat2, lon2):
            R = 6371
            lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
            dlat = lat2 - lat1
            dlon = lon2 - lon1
            a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
            c = 2 * np.arcsin(np.sqrt(a))
            return R * c

        # Get high demand dorms
        high_demand = self.merged_df.nlargest(5, 'enhanced_desirability')

        potential_spots = []
        for _, dorm in high_demand.iterrows():
            # Generate grid of potential spots
            lat_offsets = np.linspace(-0.002, 0.002, 7)
            lon_offsets = np.linspace(-0.002, 0.002, 7)

            for lat_offset in lat_offsets:
                for lon_offset in lon_offsets:
                    spot = {
                        'latitude': dorm['latitude'] + lat_offset,
                        'longitude': dorm['longitude'] + lon_offset,
                        'base_dorm': dorm['dorm_name'],
                        'demand_score': dorm['enhanced_desirability']
                    }

                    # Calculate distances to all dorms
                    distances = []
                    served_dorms = []
                    total_demand = 0

                    for _, other_dorm in self.merged_df.iterrows():
                        dist = haversine_distance(
                            spot['latitude'], spot['longitude'],
                            other_dorm['latitude'], other_dorm['longitude']
                        )
                        if dist < 0.5:  # Within 500m
                            served_dorms.append(other_dorm['dorm_name'])
                            total_demand += other_dorm['parking_demand']
                        distances.append(dist)

                    spot['served_dorms'] = served_dorms
                    spot['avg_distance'] = np.mean(distances)
                    spot['total_demand'] = total_demand
                    spot['score'] = (
                        0.4 * total_demand +
                        0.3 * (1 / (1 + spot['avg_distance'])) +
                        0.3 * len(served_dorms)
                    )

                    potential_spots.append(spot)

        # Sort and filter spots
        spots_df = pd.DataFrame(potential_spots)
        recommendations = []

        while len(recommendations) < n_recommendations and len(spots_df) > 0:
            best_spot = spots_df.nlargest(1, 'score').iloc[0]
            recommendations.append(best_spot)

            # Remove nearby spots
            spots_df = spots_df[spots_df.apply(lambda x:
                haversine_distance(x['latitude'], x['longitude'],
                                 best_spot['latitude'], best_spot['longitude']) > min_distance,
                axis=1)]

        return pd.DataFrame(recommendations)

    def visualize_recommendations(self, recommendations):
        """
        Create enhanced visualization with utilization data
        """
        m = folium.Map(
            location=[29.889, -97.942],
            zoom_start=15,
            tiles='cartodbpositron'
        )

        # Add dorms with utilization info
        for _, dorm in self.merged_df.iterrows():
            # Color based on parking utilization
            util_color = f'#{int(255 * dorm["parking_utilization"]):02x}' + \
                        f'{int(255 * (1-dorm["parking_utilization"])):02x}00'

            popup_html = f"""
                <div style="font-family: Arial; min-width: 200px;">
                    <h4 style="margin: 0;">{dorm['dorm_name']}</h4>
                    <hr style="margin: 5px 0;">
                    <p style="margin: 5px 0;">
                        <b>Students:</b> {int(dorm['student_count'])}<br>
                        <b>Parking Usage:</b> {int(dorm['used_parking'])}/{int(dorm['total_parking'])}<br>
                        <b>Utilization:</b> {dorm['parking_utilization']:.1%}<br>
                        <b>Demand Score:</b> {dorm['parking_demand']:.2f}
                    </p>
                </div>
            """

            folium.CircleMarker(
                location=[dorm['latitude'], dorm['longitude']],
                radius=10,
                color='black',
                weight=2,
                fill=True,
                fill_color=util_color,
                fill_opacity=0.7,
                popup=folium.Popup(popup_html, max_width=300),
                tooltip=dorm['dorm_name']
            ).add_to(m)

        # Add recommended spots
        for idx, spot in recommendations.iterrows():
            # Show service area
            folium.Circle(
                location=[spot['latitude'], spot['longitude']],
                radius=200,
                color='red',
                fill=True,
                fill_opacity=0.1
            ).add_to(m)

            popup_html = f"""
                <div style="font-family: Arial; min-width: 200px;">
                    <h4 style="margin: 0;">Recommended Spot {idx + 1}</h4>
                    <hr style="margin: 5px 0;">
                    <p style="margin: 5px 0;">
                        <b>Score:</b> {spot['score']:.3f}<br>
                        <b>Total Demand:</b> {spot['total_demand']:.2f}<br>
                        <b>Serves:</b> {', '.join(spot['served_dorms'][:3])}...<br>
                        <b>Average Distance:</b> {spot['avg_distance']:.3f} km
                    </p>
                </div>
            """

            folium.CircleMarker(
                location=[spot['latitude'], spot['longitude']],
                radius=12,
                color='red',
                fill=True,
                fill_opacity=0.7,
                popup=folium.Popup(popup_html, max_width=300),
                tooltip=f"Recommended Spot {idx + 1}"
            ).add_to(m)

        # Add heatmap of parking demand
        locations = self.merged_df[['latitude', 'longitude', 'parking_demand']].values.tolist()
        plugins.HeatMap(locations).add_to(m)

        return m

# Usage example:
def run_analysis(consolidated_file: str, merged_df: pd.DataFrame):
    system = ParkingRecommendationSystem()
    system.load_data(consolidated_file, merged_df)
    recommendations = system.recommend_parking_locations()
    map_viz = system.visualize_recommendations(recommendations)
    map_viz.save('parking_recommendations_with_utilization.html')
    return recommendations, system.merged_df

In [None]:
import folium
from folium import plugins
import pandas as pd
import numpy as np

def create_parking_visualization(merged_df, consolidated_data, green_lots_df):
    """
    Create comprehensive visualization of parking situation
    """
    # Initialize the map
    m = folium.Map(
        location=[29.889, -97.942],
        zoom_start=15,
        tiles='cartodbpositron'
    )

    # Create feature groups for different layers
    dorms_group = folium.FeatureGroup(name='Dorms')
    parking_group = folium.FeatureGroup(name='Parking Lots')
    heatmap_group = folium.FeatureGroup(name='Student Density')

    # Add dorms with consolidated data
    for _, dorm in merged_df.iterrows():
        # Get consolidated data if available
        cons_data = consolidated_data[consolidated_data['Name'] == dorm['dorm_name']].iloc[0] if \
                   len(consolidated_data[consolidated_data['Name'] == dorm['dorm_name']]) > 0 else None

        # Calculate color based on desirability score
        color = f'#{int(255 * (1-dorm["desirability_score"])):02x}{int(255 * dorm["desirability_score"]):02x}00'

        # Create popup content
        popup_html = f"""
            <div style="font-family: Arial; min-width: 200px;">
                <h4 style="margin: 0;">{dorm['dorm_name']}</h4>
                <hr style="margin: 5px 0;">
                <p style="margin: 5px 0;">
                    <b>Students:</b> {int(dorm['student_count'])}<br>
                    <b>Desirability Score:</b> {dorm['desirability_score']:.3f}<br>
        """

        if cons_data is not None:
            popup_html += f"""
                    <b>Bed Capacity:</b> {int(cons_data['Bed Capacity'])}<br>
                    <b>Parking Spaces:</b> {int(cons_data['Spaces']) if not pd.isna(cons_data['Spaces']) else 'N/A'}<br>
                    <b>Current Usage:</b> {int(cons_data['Counter']) if not pd.isna(cons_data['Counter']) else 'N/A'}
            """

        popup_html += """
                </p>
            </div>
        """

        # Calculate marker size based on student count
        radius = np.sqrt(dorm['student_count']) / 3

        # Add dorm marker
        folium.CircleMarker(
            location=[dorm['latitude'], dorm['longitude']],
            radius=radius,
            color='black',
            weight=2,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            popup=folium.Popup(popup_html, max_width=300),
            tooltip=dorm['dorm_name']
        ).add_to(dorms_group)

    # Add existing parking lots
    for _, lot in green_lots_df.iterrows():
        folium.CircleMarker(
            location=[lot['Latitude'], lot['Longitude']],
            radius=8,
            color='blue',
            fill=True,
            fill_opacity=0.6,
            popup=f"""
                <div style="font-family: Arial">
                    <b>{lot['Location']}</b><br>
                    Spaces: {lot['Spaces']}
                </div>
            """,
            tooltip=f"Parking: {lot['Location']}"
        ).add_to(parking_group)

    # Add heatmap of student density
    heat_data = merged_df[['latitude', 'longitude', 'student_count']].values.tolist()
    plugins.HeatMap(heat_data).add_to(heatmap_group)

    # Add all groups to map
    dorms_group.add_to(m)
    parking_group.add_to(m)
    heatmap_group.add_to(m)

    # Add layer control
    folium.LayerControl().add_to(m)

    # Add legend
    legend_html = """
        <div style="position: fixed; bottom: 50px; right: 50px; width: 200px;
                    z-index:9999; background-color: white; padding: 10px; border: 2px solid gray;">
            <h4 style="margin-top: 0;">Legend</h4>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: #00ff00; margin-right: 5px;"></span>
                High Desirability Dorm
            </div>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: #ff0000; margin-right: 5px;"></span>
                Low Desirability Dorm
            </div>
            <div style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px; border-radius: 50%;
                           background-color: blue; margin-right: 5px;"></span>
                Existing Parking
            </div>
            <div style="font-size: 12px; margin-top: 10px;">
                • Circle size indicates student population<br>
                • Color indicates desirability score<br>
                • Heatmap shows student density
            </div>
        </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

# Load consolidated data
consolidated_data = pd.read_excel('ConsolidatedData.xlsx')

# Create and save visualization
campus_map = create_parking_visualization(merged_df, consolidated_data, green_lots_df)
campus_map.save('campus_parking_analysis.html')

In [65]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
from openai import OpenAI
import os


def get_system_explanation(client: OpenAI) -> str:
    """
    Use DeepSeek to generate a clear explanation of the dorm assignment system
    """
    system_description = """
    The dorm assignment system handles student housing allocation with the following key features:
    1. Manages bed capacity with a 95% maximum occupancy rule
    2. Handles parking space allocation
    3. Processes student preferences in order
    4. Tracks real-time occupancy and parking usage

    Please explain how this system benefits both students and housing administrators.
    """

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a helpful assistant specializing in explaining housing systems"},
            {"role": "user", "content": system_description}
        ],
        stream=False
    )

    return response.choices[0].message.content

def initialize_system(consolidated_file: str) -> pd.DataFrame:
    """
    Load consolidated data and add tracking columns
    """
    df = pd.read_excel(consolidated_file)
    df = df.rename(columns={
        'Name': 'dorm_name',
        'Bed Capacity': 'total_beds',
        'Spaces': 'total_parking',
        'Counter': 'used_parking'
    })
    df['occupied_beds'] = 0
    return df

def check_availability(dorm_data: pd.DataFrame, dorm_name: str, needs_parking: bool) -> bool:
    """
    Check if a dorm has available space and parking if needed
    """
    try:
        dorm = dorm_data[dorm_data['dorm_name'] == dorm_name].iloc[0]
    except IndexError:
        print(f"Warning: Dorm {dorm_name} not found in data")
        return False

    # Check bed availability (95% capacity rule)
    available_beds = dorm['total_beds'] - dorm['occupied_beds']
    if (dorm['occupied_beds'] / dorm['total_beds']) >= 0.95:
        print(f"Dorm {dorm_name} is at or above 95% capacity")
        return False

    # Check parking if needed
    if needs_parking:
        available_parking = dorm['total_parking'] - dorm['used_parking']
        if available_parking <= 0:
            print(f"No parking available at {dorm_name}")
            return False

    return True

def assign_student(dorm_data: pd.DataFrame, dorm_name: str, needs_parking: bool) -> bool:
    """
    Attempt to assign a student to a dorm
    """
    if check_availability(dorm_data, dorm_name, needs_parking):
        idx = dorm_data[dorm_data['dorm_name'] == dorm_name].index[0]
        dorm_data.loc[idx, 'occupied_beds'] += 1
        if needs_parking:
            dorm_data.loc[idx, 'used_parking'] += 1
        return True
    return False

def process_application(dorm_data: pd.DataFrame,
                       preferences: List[str],
                       needs_parking: bool) -> Tuple[str, bool]:
    """
    Process a student's application
    """
    for dorm in preferences:
        if assign_student(dorm_data, dorm, needs_parking):
            print(f"\nAssigned to {dorm}")
            print(f"Current state of {dorm}:")
            dorm_info = dorm_data[dorm_data['dorm_name'] == dorm].iloc[0]
            print(f"Beds: {dorm_info['occupied_beds']}/{dorm_info['total_beds']}")
            print(f"Parking: {dorm_info['used_parking']}/{dorm_info['total_parking']}")
            return dorm, True

    return None, False

def explain_assignment_decision(client: OpenAI,
                              student_prefs: List[str],
                              assigned_dorm: str,
                              needs_parking: bool) -> str:
    """
    Use DeepSeek to explain why a particular assignment was made
    """
    prompt = f"""
    A student with the following preferences: {', '.join(student_prefs)}
    {'and requiring parking ' if needs_parking else 'not requiring parking '}
    was assigned to {assigned_dorm if assigned_dorm else 'no dorm'}.

    Please explain the factors that led to this assignment decision.
    """

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a helpful assistant explaining housing decisions"},
            {"role": "user", "content": prompt}
        ],
        stream=False
    )

    return response.choices[0].message.content

if __name__ == "__main__":
    # Initialize DeepSeek client
    client = OpenAI(
        api_key=userdata.get("DEEPSEEK_API_KEY"),
        base_url="https://api.deepseek.com"
    )

    # Get system explanation
    print("\nSystem Overview:")
    print(get_system_explanation(client))

    # Load data
    dorm_data = initialize_system('ConsolidatedData.xlsx')

    # Example student applications
    students = [
        {
            'preferences': ['San Marcos', 'Butler'],
            'needs_parking': True
        },
        {
            'preferences': ['San Marcos', 'Butler'],
            'needs_parking': True
        },
        {
            'preferences': ['San Marcos'],
            'needs_parking': False
        }
    ]

    # Process applications with explanations
    for i, student in enumerate(students, 1):
        print(f"\nProcessing Student {i}:")
        assigned_dorm, success = process_application(
            dorm_data,
            student['preferences'],
            student['needs_parking']
        )

        # Get explanation for the assignment decision
        explanation = explain_assignment_decision(
            client,
            student['preferences'],
            assigned_dorm,
            student['needs_parking']
        )
        print("\nAssignment Explanation:")
        print(explanation)

    # Display final state
    print("\nFinal State:")
    print(dorm_data[['dorm_name', 'total_beds', 'occupied_beds', 'total_parking', 'used_parking']])


System Overview:
The dorm assignment system with the described features offers several benefits to both students and housing administrators by streamlining the housing allocation process and ensuring efficient resource management. Here's how it benefits each group:

### Benefits for Students:
1. **Fair and Transparent Allocation**: By processing student preferences in order, the system ensures that housing assignments are made fairly and transparently. Students who submit their preferences earlier or meet certain criteria have a better chance of securing their desired housing option, which enhances satisfaction and reduces frustration.

2. **Improved Housing Experience**: The system's ability to manage bed capacity with a 95% maximum occupancy rule ensures that dormitories are not overcrowded. This leads to a more comfortable living environment, as students have adequate space and resources, contributing to their overall well-being and academic performance.

3. **Convenient Parking Al

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [64]:
from openai import OpenAI
import pandas as pd
import json
api_key=userdata.get('DEEPSEEK_API_KEY')

def analyze_parking_situation(merged_df: pd.DataFrame, consolidated_data: pd.DataFrame, api_key: str) -> str:
    """
    Use DeepSeek to analyze parking situation and get recommendations
    """
    # Create OpenAI client with DeepSeek endpoint
    client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

    # Prepare data summary for analysis
    dorm_summary = merged_df.merge(
        consolidated_data,
        left_on='dorm_name',
        right_on='Name',
        how='left'
    )

    # Create analysis prompt
    data_summary = {
        "dorm_stats": {
            "total_dorms": len(dorm_summary),
            "total_students": int(dorm_summary['student_count'].sum()),
            "avg_desirability": float(dorm_summary['desirability_score'].mean()),
            "total_parking_spaces": int(consolidated_data['Spaces'].sum())
        },
        "high_demand_dorms": dorm_summary.nlargest(5, 'desirability_score')[
            ['dorm_name', 'desirability_score', 'student_count', 'Spaces']
        ].to_dict('records'),
        "clusters": dorm_summary.groupby('cluster')['student_count'].sum().to_dict()
    }

    analysis_prompt = f"""
    As a parking infrastructure analyst, analyze this university campus parking situation:

    Data Summary:
    {json.dumps(data_summary, indent=2)}

    Consider:
    1. Current parking distribution vs student density
    2. Areas with high desirability scores but limited parking
    3. Cluster-based analysis for optimal parking placement
    4. Impact on student accessibility and campus traffic flow

    Provide:
    1. Analysis of current parking distribution
    2. Top 3 recommended locations for new parking facilities
    3. Justification for each recommendation
    4. Potential challenges and mitigation strategies
    5. Expected impact on campus parking situation
    """

    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a parking infrastructure analyst with expertise in university campus planning."},
                {"role": "user", "content": analysis_prompt}
            ],
            stream=False
        )

        analysis = response.choices[0].message.content

        # Save analysis to file
        with open('parking_analysis_report.txt', 'w') as f:
            f.write(analysis)

        return analysis

    except Exception as e:
        return f"Error getting analysis: {str(e)}"

def get_detailed_recommendation(location: dict, api_key: str) -> str:
    """
    Get detailed analysis for a specific recommended location
    """
    client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

    prompt = f"""
    Analyze this potential parking location:
    Location Details:
    {json.dumps(location, indent=2)}

    Provide:
    1. Specific advantages of this location
    2. Potential construction considerations
    3. Impact on traffic flow
    4. Cost-benefit analysis factors
    5. Implementation recommendations
    """

    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a parking infrastructure analyst."},
                {"role": "user", "content": prompt}
            ],
            stream=False
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error getting recommendation details: {str(e)}"

# Usage example:

# Get overall analysis
analysis = analyze_parking_situation(merged_df, consolidated_data, api_key)
print("Overall Analysis:")
print(analysis)

# Get detailed recommendation for a specific location
location_details = {
    "location": "Near San Jacinto Hall",
    "nearby_dorms": ["San Jacinto", "Tower"],
    "current_parking": 200,
    "student_density": 1500,
    "desirability_score": 0.85
}
detailed_analysis = get_detailed_recommendation(location_details, api_key)
print("\nDetailed Location Analysis:")
print(detailed_analysis)


Overall Analysis:
### Analysis of Current Parking Distribution

1. **Parking Distribution vs. Student Density**:
   - The campus has **6,772 parking spaces** for **13,069 students**, resulting in a **parking-to-student ratio of approximately 0.52**. This indicates a significant shortage of parking spaces, as only about half of the students can be accommodated.
   - High-demand dorms like **Blanco** (1,556 students, 168 spaces) and **San Jacinto** (925 students, 397 spaces) have particularly low parking availability relative to student density. Blanco has a **parking-to-student ratio of 0.11**, while San Jacinto has a ratio of **0.43**.
   - **Sterry Hall** (737 students) has no recorded parking spaces, which is a critical issue given its high desirability score (0.594).

2. **High Desirability Areas with Limited Parking**:
   - **Blanco** and **Sterry Hall** stand out as high-desirability dorms with severe parking shortages. Blanco’s desirability score (0.613) and Sterry Hall’s score (