<a href="https://colab.research.google.com/github/RutikaH/SAProject2025/blob/main/saproject2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pathway bokeh

Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from datetime import datetime
import pathway as pw
import bokeh.plotting


Reading and Understanding data


In [None]:
# Load the raw dataset from 'dataset.csv' and preview its first 20 rows.
df=pd.read_csv('dataset.csv')
df.head(20)

In [None]:
# List the column names available in the dataset.
df.columns

In [None]:
# Distinct values of the SystemCodeNumber column.
df['SystemCodeNumber'].unique()

In [None]:
# Number of missing values in each column.
df.isna().sum()

feature addition and tuning

In [None]:
# Combine the separate date and time strings so they can be parsed as one datetime.
timestamp_text = df['LastUpdatedDate'] + ' ' + df['LastUpdatedTime']

# Add the occupancy ratio and the parsed timestamp, then order the rows
# chronologically and renumber the index.
df = (
    df.assign(
        OccupancyRate=df['Occupancy'] / df['Capacity'],
        Timestamp=pd.to_datetime(timestamp_text, format='%d-%m-%Y %H:%M:%S'),
    )
    .sort_values('Timestamp')
    .reset_index(drop=True)
)

making categorical values into numerical values

In [None]:
# Distinct labels of the TrafficConditionNearby column (checked before encoding them).
df['TrafficConditionNearby'].unique()

In [None]:
# Encode the traffic labels as ordered numbers (low < average < high).
# Series.map yields NaN for any value that is not a key of the mapping.
traffic_map = {
    'low': 1,
    'average': 2,
    'high': 3,
}
traffic_labels = df['TrafficConditionNearby']
df['TrafficConditionNearby'] = traffic_labels.map(traffic_map)

In [None]:
# Replace each vehicle-type label with a numeric weight.
# NOTE(review): 'cycle' gets the largest weight (2), above 'truck' (1.5) — confirm this is intended.
vehicle_type_weights = {'car': 1, 'bike': 0.5, 'truck': 1.5, 'cycle': 2}
df['VehicleType'] = df['VehicleType'].replace(vehicle_type_weights)

In [None]:
# Smooth sudden spikes or drops with a rolling mean over the 3 most recent
# observations (window=3 counts rows, not hours); min_periods=1 lets the first
# rows of each group use whatever history is available.
# Rows are grouped by SystemCodeNumber (assumed to be the per-lot code — confirm
# against the dataset) rather than ID, so that each lot is averaged over its own
# consecutive readings.
df['OccupancyRateRollingAvg'] = (
    df.groupby('SystemCodeNumber')['OccupancyRate']
    .transform(lambda rates: rates.rolling(window=3, min_periods=1).mean())
)


In [None]:
# Preview the first rows after the feature-engineering steps.
df.head()

analyzing data through graphs

In [None]:
# Occupancy rate of every record plotted against its timestamp.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['Timestamp'], df['OccupancyRate'], marker='o')
ax.set_xlabel('Time')
ax.set_ylabel('Occupancy Rate')
ax.set_title('Occupancy Rate Over Time')
ax.grid(True)

In [None]:
# Scatter of queue length against occupancy rate; alpha=0.5 keeps overlapping points visible.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['QueueLength'], df['OccupancyRate'], alpha=0.5)
ax.set_xlabel('Queue Length')
ax.set_ylabel('Occupancy Rate')
ax.set_title('Queue Length vs. Occupancy Rate')
ax.grid(True)
plt.show()


In [None]:
# Box plot of the occupancy rate for each traffic-condition level.
# The axes are created explicitly and handed to pandas: without `ax`,
# DataFrame.boxplot(by=...) opens its own figure, which would leave the figure
# created here empty and ignore the requested size.
fig, ax = plt.subplots(figsize=(8, 5))
df.boxplot(column='OccupancyRate', by='TrafficConditionNearby', ax=ax)
ax.set_xlabel('Traffic Condition Nearby')
ax.set_ylabel('Occupancy Rate')
ax.set_title('Occupancy Rate by Traffic Condition')
# Remove the automatic "Boxplot grouped by ..." super-title added by pandas.
fig.suptitle('')
plt.show()


In [None]:
import matplotlib.pyplot as plt
# Mean occupancy rate for each vehicle-type value, drawn as a bar chart.
mean_rate_by_vehicle = df.groupby('VehicleType')['OccupancyRate'].mean()
ax = mean_rate_by_vehicle.plot.bar()
ax.set_ylabel('Average Occupancy Rate')
ax.set_title('Occupancy Rate by Vehicle Type')
plt.show()


pathway pipeline for model 1

In [None]:
# Save the preprocessed DataFrame to 'preprocessed_dataset.csv' (without the
# index column); this file is the input that is replayed as a stream below.
df.to_csv('preprocessed_dataset.csv',index=False)

In [None]:
# Schema of the streamed rows for Model 1: the columns Pathway reads from the
# CSV and the type each one is parsed as.
class ParkingSchema(pw.Schema):
    Timestamp: str
    Capacity: float
    Occupancy: float
    QueueLength: float

In [None]:
# Replay the preprocessed CSV as a simulated stream to mimic real-time input;
# input_rate controls how fast rows are fed in.
data = pw.demo.replay_csv(
    "preprocessed_dataset.csv",
    schema=ParkingSchema,
    input_rate=107,
)

In [None]:
# Format used to parse the 'Timestamp' strings
fmt = "%Y-%m-%d %H:%M:%S"

# Parse the timestamp once and derive two columns from it:
# - 't'    : the full parsed datetime
# - 'hour' : despite its name, a day-level key — the date of the record with
#            the time part fixed to 00:00:00
parsed_timestamp = data.Timestamp.dt.strptime(fmt)
data_with_time = data.with_columns(
    t=parsed_timestamp,
    hour=parsed_timestamp.dt.strftime("%Y-%m-%dT00:00:00"),
)

In [None]:
import datetime
# Aggregate the stream into one-hour tumbling (non-overlapping) windows.
# The hourly granularity was chosen after studying industrial practices.
windowed_by_hour = data_with_time.windowby(
    pw.this.t,  # timestamp column the windows are built on
    instance=pw.this.hour,  # windows are formed separately for each value of 'hour'
    window=pw.temporal.tumbling(datetime.timedelta(hours=1)),
    behavior=pw.temporal.exactly_once_behavior(),  # exactly-once processing semantics
)

# One output row per window: its end time, the maximum occupancy, the maximum
# capacity and the average queue length observed during that hour.
hourly_tumbling_window = windowed_by_hour.reduce(
    t=pw.this._pw_window_end,
    occ_max=pw.reducers.max(pw.this.Occupancy),
    cap=pw.reducers.max(pw.this.Capacity),
    avg_queue_length=pw.reducers.avg(pw.this.QueueLength),
)

# Tunable parameters of the Model 1 pricing rule
alpha = 1.0      # weight of the occupancy ratio (occ_max / cap)
beta = 0.5       # weight of the average queue length
base_price = 10  # base price (given)

# Dynamic price of each hourly window:
#   price = base_price + alpha * (occ_max / cap) + beta * avg_queue_length
hourly_window = hourly_tumbling_window.with_columns(
    price=base_price
    + alpha * (pw.this.occ_max / pw.this.cap)
    + beta * pw.this.avg_queue_length
)

# pw.run() is deliberately not called here: no output sink has been attached to
# the pipeline yet, so a run at this point would produce no results. The
# pipeline is executed once the CSV writer is registered in the next cell.

In [None]:
# Attach a CSV sink to the Model 1 results, execute the pipeline, then load the
# written file back into pandas.
hourly_csv_path = "hourly_window.csv"
pw.io.csv.write(hourly_window, hourly_csv_path)
pw.run()

output_df = pd.read_csv(hourly_csv_path)


In [None]:
# Parse the 't' column into pandas datetimes so that time-based operations and
# plotting treat it as time rather than text.
output_df['t'] = pd.to_datetime(output_df['t'])



visualizing through bokeh plotting

In [None]:
# importing the necessary things for bokeh plotting
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource

# Render Bokeh output inline in the notebook
output_notebook()

# Data source backing the plot
source = ColumnDataSource(output_df)

# Datetime x-axis so the 't' values are formatted as time
p = figure(title='Hourly Dynamic Parking Prices', x_axis_type='datetime')
p.line(
    x='t',
    y='price',
    source=source,
    line_width=2,
    color='blue',
    legend_label='Price',
)
p.xaxis.axis_label = 'Time'
p.yaxis.axis_label = 'Price'

show(p)


MODEL 2

In [None]:
# Work on a copy so the Model 2 features do not modify the Model 1 DataFrame.
df2=df.copy()

For Geo proximity

In [None]:
# Approximate the city centre by the median latitude and median longitude of
# all rows in the data.
citycenter_lat=df2['Latitude'].median()
citycenter_lon=df2['Longitude'].median()

In [None]:
# Show the median coordinates computed above.
print (citycenter_lat,citycenter_lon)

We use the Haversine formula to calculate the distance from each parking lot to the city centre (approximated as the median coordinates of all lots). This distance is then converted into a geo proximity score, where lots closer to the centre receive higher scores, reflecting their greater desirability and pricing power.



In [None]:
import numpy as np
#for geo proximity i checked different methods for calculating distance from coordinates. I found haversine method in online and as we dont have info about city centre i chose median of the given data as the coordinates of city center.

def haversine_np(lat1, lon1, lat2, lon2, radius=6371000):
    """Great-circle distance between two points using the haversine formula.

    All coordinates are given in degrees. Each argument may be a scalar, a
    NumPy array or a pandas Series; NumPy broadcasting is applied.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : latitude and longitude of the second point(s), in degrees.
    radius : radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371000 (Earth's mean radius in metres).

    Returns
    -------
    The distance along the sphere's surface, with the broadcast shape of the
    inputs.
    """
    # Convert degrees to radians
    lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
    lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)

    # Coordinate differences in radians
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # `a` is the squared half-chord length between the two points
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    # `c` is the angular distance in radians; radius * c is the surface distance
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

# City centre used for the distance calculation: the median coordinates
# computed from the data (citycenter_lat / citycenter_lon) instead of
# hand-copied constants, so the centre always matches the dataset in use.
CITY_CENTER_LAT = citycenter_lat
CITY_CENTER_LON = citycenter_lon

# Distance of every row's coordinates to the city centre
df2['DistanceToCenter'] = haversine_np(
    df2['Latitude'], df2['Longitude'],
    CITY_CENTER_LAT, CITY_CENTER_LON
)
# Inverse-distance score: larger for lots nearer the centre; the +1 keeps the
# score finite (equal to 1) when the distance is 0.
df2['GeoProximityScore'] = 1 / (df2['DistanceToCenter'] + 1)

pathway pipeline for model 2(very similar to model 1)

In [None]:
# Save the Model 2 feature table to 'datasetmodel2.csv' (without the index
# column) so it can be replayed as a stream.
df2.to_csv('datasetmodel2.csv',index=False)

In [None]:
# Schema of the streamed rows for Model 2: the columns Pathway reads from the
# CSV and the type each one is parsed as.
class DynamicpricingSchema(pw.Schema):
    Timestamp: str
    OccupancyRateRollingAvg: float
    OccupancyRate: float
    Capacity: float
    Occupancy: float
    QueueLength: float
    GeoProximityScore: float
    TrafficConditionNearby: float
    VehicleType: float
    IsSpecialDay: float

In [None]:
# Replay the Model 2 dataset as a simulated stream.
data2 = pw.demo.replay_csv(
    "datasetmodel2.csv",
    schema=DynamicpricingSchema,
    input_rate=107,
)

In [None]:
# Format used to parse the 'Timestamp' strings
fmt = "%Y-%m-%d %H:%M:%S"

# 't2' is the full parsed datetime; 'hour2' is a day-level key (the record's
# date with the time part fixed to 00:00:00).
parsed_timestamp2 = data2.Timestamp.dt.strptime(fmt)
data_with_time_formodel2 = data2.with_columns(
    t2=parsed_timestamp2,
    hour2=parsed_timestamp2.dt.strftime("%Y-%m-%dT00:00:00"),
)

In [None]:
import datetime

# One-hour tumbling (non-overlapping) windows over the Model 2 stream, formed
# separately for each value of 'hour2'.
windowed_by_hour_model2 = data_with_time_formodel2.windowby(
    pw.this.t2,
    instance=pw.this.hour2,
    window=pw.temporal.tumbling(datetime.timedelta(hours=1)),
    behavior=pw.temporal.exactly_once_behavior(),
)

# One output row per window: the window end time, the maximum of each feature
# seen in the window, and the average queue length.
hourly_tumbling_window_model2 = windowed_by_hour_model2.reduce(
    t2=pw.this._pw_window_end,
    occ_rate=pw.reducers.max(pw.this.OccupancyRate),
    traffic_nearby=pw.reducers.max(pw.this.TrafficConditionNearby),
    is_special_day=pw.reducers.max(pw.this.IsSpecialDay),
    geo_proximity_score=pw.reducers.max(pw.this.GeoProximityScore),
    vehicle_type=pw.reducers.max(pw.this.VehicleType),
    occupancy_rate_rolling_avg=pw.reducers.max(pw.this.OccupancyRateRollingAvg),
    avg_queue_length=pw.reducers.avg(pw.this.QueueLength),
)

Tunable parameters for demand score

In [None]:
# Weights of the individual terms of the Model 2 demand score, plus the base
# price and the factor that converts the demand score into a price change.
alpha1 = 1.0   # weight of the occupancy rate
beta1 = 0.5    # weight of the average queue length
gamma1 = 0.3   # weight of nearby traffic (this term is subtracted in the demand score)
delta1 = 1.5  # weight of the special-day indicator
epsilon1 = 1.0 # weight of the rolling average of the occupancy rate
zeta1 = 1.0   # weight of the geo-proximity score
theta1 = 0.5   # weight of the vehicle-type value
base_price1 = 10 # base price the demand-driven adjustment is added to
lambda_1 = 0.5 # demand sensitivity: multiplier applied to the demand score

In [None]:
# Step 1: demand score of each hourly window — a weighted sum of the window
# features, in which the nearby-traffic term is subtracted.
# Step 2: Model 2 price = base price + lambda_1 * demand score.
# Two with_columns calls are needed because the price refers to the
# demand_score column created by the first one.
hourly_window_model2 = (
    hourly_tumbling_window_model2
    .with_columns(
        demand_score=(
            alpha1 * pw.this.occ_rate
            + beta1 * pw.this.avg_queue_length
            - gamma1 * pw.this.traffic_nearby
            + delta1 * pw.this.is_special_day
            + epsilon1 * pw.this.occupancy_rate_rolling_avg
            + zeta1 * pw.this.geo_proximity_score
            + theta1 * pw.this.vehicle_type
        )
    )
    .with_columns(
        Model2Price=base_price1 + lambda_1 * pw.this.demand_score
    )
)

DEMAND SCORE EXPLANATION

The demand score represents the overall demand for a parking lot at a given
time and is computed using multiple contextual and behavioral factors:

1. Distance from City Center
   Parking lots closer to the city center typically experience higher demand
   due to increased accessibility and activity density.

2. Queue Length
   A longer queue indicates higher waiting time, directly reflecting increased
   demand for the parking lot.

3. Nearby Traffic Intensity
   Heavier traffic around a parking lot is treated as a deterrent: the traffic
   term enters the demand score with a negative weight, so higher nearby
   traffic lowers the computed demand.

4. Special Day Indicator
   Events, holidays, or special occasions lead to temporary spikes in parking
   demand.

5. Rolling Average of Occupancy
   Captures recent demand trends and smooths short-term fluctuations, enabling
   more stable and reliable pricing decisions.

6. Vehicle Type
   Different vehicle categories are priced differently to account for space
   usage and demand variation across vehicle types.

These factors are combined to generate a demand-aware pricing signal that
balances responsiveness with pricing stability.


ASSUMPTIONS

- Certain parameters used in the demand and pricing models are assumed and
  remain tunable.
- Hourly aggregation is adopted based on common industry practices for
  real-time pricing systems.
- Due to the absence of real-world competitor data, competitor prices are
  simulated for evaluating competitive pricing behavior.


In [None]:
# Attach a CSV sink to the Model 2 results, execute the pipeline, then load the
# written file back into pandas.
model2_csv_path = "hourly_window_model2.csv"
pw.io.csv.write(hourly_window_model2, model2_csv_path)
pw.run()

output = pd.read_csv(model2_csv_path, delimiter=',', quotechar='"')

In [None]:
# Parse 't2' into pandas datetimes; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising.
output['t2'] = pd.to_datetime(output['t2'], errors='coerce')


In [None]:


# Bound the Model 2 price between half and twice the base price to avoid
# erratic spikes and drops.
price_floor = 0.5 * base_price1
price_ceiling = 2 * base_price1
output['Model2Price'] = output['Model2Price'].clip(lower=price_floor, upper=price_ceiling)

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
# Make sure the time column holds pandas datetimes before plotting
output['t2'] = pd.to_datetime(output['t2'])

# Render Bokeh output inline in the notebook
output_notebook()

source = ColumnDataSource(output)

# Model 2 price over time on a datetime x-axis
p = figure(title='Dynamic Parking Price: Model 2', x_axis_type='datetime')
p.line(
    x='t2',
    y='Model2Price',
    source=source,
    line_width=2,
    color='red',
    legend_label='Model 2 Price',
)
p.xaxis.axis_label = 'Time'
p.yaxis.axis_label = 'Price'
p.legend.location = 'top_left'
# Clicking a legend entry hides the corresponding line
p.legend.click_policy = 'hide'

show(p)


Comparison plots for Model 1 and Model 2

In [None]:
import pandas as pd

# Ensure both time columns are pandas datetimes before joining on them
output_df['t'] = pd.to_datetime(output_df['t'])
output['t2'] = pd.to_datetime(output['t2'])

# Keep only time and price from each model, under a shared 'Time' column name
output_model1 = output_df[['t', 'price']].rename(columns={'t': 'Time', 'price': 'Model1Price'})
output_model2 = output[['t2', 'Model2Price']].rename(columns={'t2': 'Time'})

# Inner join: keep only the times present in both models' outputs
comparison_df = output_model1.merge(output_model2, on='Time', how='inner')


In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource

# Render Bokeh output inline in the notebook
output_notebook()

source = ColumnDataSource(comparison_df)

# Both models' prices over time on a shared datetime x-axis
p = figure(title='Dynamic Parking Price: Model 1 vs Model 2', x_axis_type='datetime')
p.line(
    x='Time',
    y='Model1Price',
    source=source,
    line_width=2,
    color='blue',
    legend_label='Model 1 Price',
)
p.line(
    x='Time',
    y='Model2Price',
    source=source,
    line_width=2,
    color='red',
    legend_label='Model 2 Price',
)
p.xaxis.axis_label = 'Time'
p.yaxis.axis_label = 'Price'
p.legend.location = 'top_left'

show(p)


competitor pricing

In [None]:
# NOTE(review): this copy is never used — df3 is reassigned from comparison_df
# in the next cell before anything reads it.
df3=df2.copy()

In [None]:
import numpy as np
# Fix the seed so the simulated competitor prices are reproducible
np.random.seed(42)

# Simulated competitor price: the Model 2 price scaled by a random factor
# drawn uniformly between 0.9 and 1.1 (about ±10%), one factor per row.
competitor_factor = np.random.uniform(0.9, 1.1, size=len(comparison_df))
df3 = comparison_df.assign(
    CompetitorPrice=comparison_df['Model2Price'] * competitor_factor
)

In [None]:
# Where our Model 2 price is above the competitor's, undercut the competitor
# by a fixed 0.5; otherwise keep the Model 2 price unchanged.
is_overpriced = df3['Model2Price'] > df3['CompetitorPrice']
df3['CompetitivePrice'] = df3['Model2Price'].mask(
    is_overpriced,
    df3['CompetitorPrice'] - 0.5,
)

COMPETITIVE PRICING

Since real-world competitor pricing data was unavailable, competitive prices
were simulated using a simple and controlled approach.

The Model 2 price is used as a reference, and a random variation of ±10% is
applied to generate a simulated competitor price. This variation mimics
realistic market fluctuations without introducing excessive volatility.

A competitive adjustment rule is then applied:
- If the system’s price is higher than the simulated competitor price,
  the price is adjusted to remain slightly lower than the competitor.
- If the system’s price is already lower, it remains unchanged.

This logic ensures market-aware pricing behavior while maintaining stability
and preventing aggressive or unrealistic price changes.


In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource

# Render Bokeh output inline in the notebook
output_notebook()
source = ColumnDataSource(df3)

# Our Model 2 price, the simulated competitor price and the final competitive
# price on a shared datetime x-axis
p = figure(title='Your Price vs. Competitor Price', x_axis_type='datetime')
p.line(
    x='Time', y='Model2Price', source=source,
    color='red', legend_label='Your Price', line_width=2,
)
p.line(
    x='Time', y='CompetitorPrice', source=source,
    color='brown', legend_label='Competitor Price', line_width=2, line_dash='dashed',
)
p.line(
    x='Time', y='CompetitivePrice', source=source,
    color='green', legend_label='Final Competitive Price', line_width=2, line_dash='dotdash',
)
p.xaxis.axis_label = 'Time'
p.yaxis.axis_label = 'Price'
p.legend.location = 'top_left'
show(p)


SUMMARY

In [None]:
import pandas as pd
def _price_stats(prices):
    """Return [mean, min, max, standard deviation] of a price Series, in that order."""
    return [prices.mean(), prices.min(), prices.max(), prices.std()]


# Column label -> price series being summarised
price_columns = {
    'Model 1': comparison_df['Model1Price'],
    'Model 2': comparison_df['Model2Price'],
    'Competitor': df3['CompetitorPrice'],
    'Competitive Price': df3['CompetitivePrice'],
}

# One column per price series, one row per statistic
summary = pd.DataFrame(
    {label: _price_stats(prices) for label, prices in price_columns.items()},
    index=['Mean', 'Min', 'Max', 'Std Dev'],
)

print(summary)


SUMMARY

Model 1 is a simple and interpretable pricing model in which the parking price
increases linearly with occupancy rate and queue length. It serves as a
baseline to capture direct demand-driven pricing behavior.  


Model 2 is a feature-rich pricing model that combines multiple demand and
contextual signals, including geo-proximity. Its price is clipped to between
0.5x and 2x the base price to keep pricing bounded and stable, and simulated
competitor prices are then used to derive a final competitive price,
reflecting both technical soundness and business relevance.

