# **Loading libraries**



In [None]:
%pip install gdown
%pip install tqdm scikit-learn
%pip install geopandas
%pip install geohash2
%pip install folium


import pandas as pd
import plotly.express as px
import gdown
import os
import plotly.graph_objects as go
import numpy as np
from scipy.stats import gaussian_kde
from google.colab import drive
from math import radians, sin, cos, sqrt, atan2
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree
from tqdm import tqdm
import geohash2
from sklearn.cluster import KMeans



import folium
from folium.plugins import MarkerCluster

from scipy.stats import chi2_contingency






In [None]:
# Mount Google Drive (Colab only); the final save cell writes its CSV under /content/drive.
drive.mount('/content/drive')


# **Loading the data**



---
downloading the dataset
---



In [None]:
# Google Drive folder that holds the raw data files, and the local folder to mirror it into.
folder_id = '1O3w5OKnS__hzlL8kTSfGCUc_iX8XNjEN'
output_dir = 'Homework'

# exist_ok avoids the separate existence check (and the race between check and create).
os.makedirs(output_dir, exist_ok=True)

print(f"Attempting to download content from folder ID: {folder_id} into {output_dir}")
try:
    gdown.download_folder(id=folder_id, output=output_dir, quiet=False, use_cookies=False)
    # Report the real destination instead of a hard-coded /content prefix and folder name.
    print(f"\nSuccessfully downloaded content to: {os.path.abspath(output_dir)}")
    print(f"You can now find the downloaded content in the '{output_dir}' directory in your Colab files browser.")
except Exception as e:
    # Best-effort download: report the failure and let the notebook continue,
    # so a user who already has the files locally can still proceed.
    print(f"\nAn error occurred during download: {e}")
    print("Please ensure the Google Drive folder is publicly accessible or shared with 'Anyone with the link can view'.")

In [None]:

# Load tabular data: weather table, raw trips, and station locations
weather_df = pd.read_csv("Homework/data/Washington,DC,USA 2024-01-01 to 2024-12-31.csv")
trips_df = pd.read_parquet('Homework/data/daily-rent.parquet')
stations_df = pd.read_csv("Homework/data/Capital_Bikeshare_Locations.csv")

# Load spatial parking zones (read once; a second identical read only costs time)
parking_zones_gdf = gpd.read_file('Homework/data/Residential_and_Visitor_Parking_Zones.geojson')




---
Downloading the combined and modified dataset (for ease of use )
---



In [None]:
# Download the pre-combined trips CSV directly.
# gdown is already imported in the imports cell, so no in-cell import is needed.
# file_id = "1eOCLRqXFnzvIz4I3S2uk0STHCk_Eg3pP"
file_id = "114g7JYuZ00i864przAIJQYymib_5h6Qa"
output_file = "trips_df.csv"
# Only the first N rows of the CSV are loaded below.
num_rows_to_read = 1_000_000

gdown.download(id=file_id, output=output_file, quiet=False)
print(f"File downloaded to {output_file}")

# NOTE: this replaces the trips_df that was read from the parquet file earlier.
trips_df = pd.read_csv(output_file, nrows=num_rows_to_read)
trips_df.head()

In [None]:
# List the columns available in the trips table.
trips_df.columns

# **Cleaning & inspecting the data**


There is a problem with missing start/end station IDs: almost 20% of the values are null, so we must find a way to fill them in.

**Try1 : spatial join**


---


Using each trip's longitude and latitude, we can match it to the nearest station and assign that station's ID.

In [None]:
# Drop trips that have no end coordinates.
trips_df = trips_df.dropna(subset=['end_lat', 'end_lng'])

# Per-column null counts after removing exact duplicate rows
# (trips_df itself is not de-duplicated here).
trips_df_cleaned=trips_df.drop_duplicates()
trips_df_cleaned.isna().sum()

In [None]:
# EPSG:4326 = lat/lon
# Trips as points located at their start coordinates.
trips_gdf = gpd.GeoDataFrame(
    trips_df,
    geometry=gpd.points_from_xy(trips_df['start_lng'], trips_df['start_lat']),
    crs='EPSG:4326'
)

# Stations as points, same CRS so the two layers can be joined spatially.
stations_gdf = gpd.GeoDataFrame(
    stations_df,
    geometry=gpd.points_from_xy(stations_df['LONGITUDE'], stations_df['LATITUDE']),
    crs='EPSG:4326'
)
stations_gdf.head(5)

In [None]:
# Find nearest station to each ride
# NOTE(review): both layers are in EPSG:4326, so "nearest" is measured in degrees;
# geopandas warns about this — consider projecting first if exact distances matter.
trips_with_nearest_station = gpd.sjoin_nearest(
    trips_gdf, stations_gdf[['STATION_ID', 'geometry']],
    how="left", distance_col="distance"
)

# sjoin_nearest emits one row per equally-near station, so a trip can appear more
# than once. fillna aligns on the index and raises on duplicate labels, so keep a
# single match per trip before using the result as fill values.
nearest_start_station_id = trips_with_nearest_station.loc[
    ~trips_with_nearest_station.index.duplicated(keep='first'), 'STATION_ID'
]

# Now we fill missing station_id with nearest one
trips_df['start_station_id'] = trips_df['start_station_id'].fillna(nearest_start_station_id)

# Create a mapping from STATION_ID to STATION_NAME
id_to_name = stations_df.set_index('STATION_ID')['NAME'].to_dict()

# Fill in missing start_station_name using start_station_id
trips_df['start_station_name'] = trips_df['start_station_name'].fillna(
    trips_df['start_station_id'].map(id_to_name)
)

# Null counts after the fill (computed on a de-duplicated copy)
trips_df_cleaned=trips_df.drop_duplicates()
trips_df_cleaned.isna().sum()

Repeating the process for the end station ID and name

In [None]:
# Trips as points located at their end coordinates.
trips_gdf_end = gpd.GeoDataFrame(
    trips_df,
    geometry=gpd.points_from_xy(trips_df['end_lng'], trips_df['end_lat']),
    crs='EPSG:4326'
)

trips_with_nearest_end_station = gpd.sjoin_nearest(
    trips_gdf_end, stations_gdf[['STATION_ID', 'geometry']],
    how="left", distance_col="end_distance"
)

# Ties produce several rows per trip; fillna needs a unique index to align on,
# so keep only the first nearest station for each trip.
nearest_end_station_id = trips_with_nearest_end_station.loc[
    ~trips_with_nearest_end_station.index.duplicated(keep='first'), 'STATION_ID'
]

trips_df['end_station_id'] = trips_df['end_station_id'].fillna(nearest_end_station_id)
# id_to_name (STATION_ID -> NAME) is built in the start-station cell.
trips_df['end_station_name'] = trips_df['end_station_name'].fillna(
    trips_df['end_station_id'].map(id_to_name)
)
trips_df=trips_df.drop_duplicates()
trips_df.isna().sum()

we will continue inspecting the rest of the data

In [None]:
# Preview the station table.
stations_df.head()

In [None]:
# Remove exact duplicate station rows, then count nulls per column.
stations_df=stations_df.drop_duplicates()
stations_df.isna().sum()  # inspection only: null values are not dropped here

In [None]:
# Remove exact duplicate weather rows, then count nulls per column.
weather_df=weather_df.drop_duplicates()
weather_df.isna().sum()

In [None]:
# Preview the parking-zone layer.
parking_zones_gdf.head()

In [None]:
# Remove exact duplicate zone rows, then count nulls per column.
parking_zones_gdf=parking_zones_gdf.drop_duplicates()
parking_zones_gdf.isna().sum()

In [None]:
# Drop the creator/editor audit columns.
# NOTE: not idempotent — re-running this cell raises KeyError because the columns are already gone.
parking_zones_gdf = parking_zones_gdf.drop(columns=['CREATOR', 'CREATED','EDITOR','EDITED'])


In [None]:
# Preview the zone layer after the column drop.
parking_zones_gdf.head(2)

In [None]:
# De-duplicate again now that the audit columns are gone
# (rows that differed only in those columns become identical).
parking_zones_gdf=parking_zones_gdf.drop_duplicates()
parking_zones_gdf.isna().sum()

# **PreProcessing the data**

In [None]:
# Preview the weather table before parsing its dates.
weather_df.head(2)

In [None]:
# Sanity check: if every datetime string has the same length, the column
# uses a single date format.
datetime_lengths = weather_df["datetime"].astype(str).map(len)
print(datetime_lengths.value_counts())

# Parse into a proper datetime column and confirm the resulting dtype.
weather_df["date"] = pd.to_datetime(weather_df["datetime"])
print(weather_df["date"].dtype)


In [None]:
# Parse trip timestamps; format='mixed' infers the format for each element.
trips_df["start_time"] = pd.to_datetime(trips_df["started_at"],format='mixed')
trips_df["end_time"] = pd.to_datetime(trips_df["ended_at"],format='mixed')

# ensuring that CRS is EPSG:4326
if parking_zones_gdf.crs != "EPSG:4326":
    parking_zones_gdf = parking_zones_gdf.to_crs("EPSG:4326")

# Spatial Join to Map Stations to Parking Zones
# Spatial join: add zone info to each station
stations_with_zone = gpd.sjoin(
    stations_gdf,
    parking_zones_gdf[["NAME", "geometry"]],
    how="left",
    predicate="within"
)
# Rename column for clarity.
# Both inputs carry a NAME column, so sjoin suffixes them: NAME_left is the
# station name and NAME_right is the zone name. (There is no "zone_name" column,
# so renaming that key would silently do nothing.)
stations_with_zone = stations_with_zone.rename(columns={"NAME_right": "residential_zone"})

# Joining Weather Data
# Extract date from start_time for weather join
trips_df["date"] = trips_df["start_time"].dt.date
# Derive from the raw "datetime" column so the cell survives a re-run
# (weather_df["date"].dt fails once the column already holds date objects).
weather_df["date"] = pd.to_datetime(weather_df["datetime"]).dt.date

# Join weather by date
# NOTE: re-running this merge appends the weather columns again with _x/_y suffixes.
trips_df = trips_df.merge(weather_df, on="date", how="left")



In [None]:
# Columns after the weather merge.
trips_df.columns

In [None]:
# Remaining nulls in the station id/name columns.
trips_df[['start_station_id', 'end_station_id', 'start_station_name', 'end_station_name']].isnull().sum()


In [None]:
# Preview the merged trips table.
trips_df.head(2)


---
B1
---


In [None]:
# # B1

# # From started_at
# trips_df['start_year'] = trips_df['started_at'].dt.year
# trips_df['start_month'] = trips_df['started_at'].dt.month
# trips_df['start_day_num'] = trips_df['started_at'].dt.day
# trips_df['start_day_name'] = trips_df['started_at'].dt.day_name()

# # From ended_at
# trips_df['end_year'] = trips_df['ended_at'].dt.year
# trips_df['end_month'] = trips_df['ended_at'].dt.month
# trips_df['end_day_num'] = trips_df['ended_at'].dt.day
# trips_df['end_day_name'] = trips_df['ended_at'].dt.day_name()
# trips_df.head(5)



---
B2
---


In [None]:
# Trip length in minutes (end minus start), rounded to two decimals.
trip_duration = trips_df['end_time'] - trips_df['start_time']
duration_minutes = trip_duration.dt.total_seconds() / 60
trips_df['trip_duration_minutes'] = duration_minutes.round(2)
trips_df['trip_duration_minutes'].head(5)

**The trip_duration_minutes problem**

In [None]:
# Summary statistics of trip duration (look at min/max for implausible values).
trips_df['trip_duration_minutes'].describe()

*We can clearly see that there is a problem with the trip durations: the minimum is a negative value, which cannot be right.*

In [None]:
# Trips whose duration is zero or negative (end not after start).
non_positive_duration = trips_df['trip_duration_minutes'].le(0)
invalid_durations = trips_df.loc[non_positive_duration]
print(f"Invalid rows: {len(invalid_durations)}")
invalid_durations[['ride_id', 'started_at', 'ended_at', 'trip_duration_minutes']].head()


In [None]:
# Filter only valid trips (strictly positive duration).
# .copy() gives an independent frame, so the column assignments in later cells
# do not operate on a slice of the old frame (avoids SettingWithCopyWarning).
trips_df = trips_df[trips_df['trip_duration_minutes'] > 0].copy()
trips_df['trip_duration_minutes'].describe()


---
B3
---


In [None]:
# Rider categories present; the cost rules below branch on 'member' and 'casual'.
trips_df['member_casual'].value_counts()

In [None]:
# Bike types present; the cost rules below branch on 'electric_bike' and 'classic_bike'.
trips_df['rideable_type'].value_counts()

In [None]:
# Initialize base cost
# Start with 0 cost
trips_df['trip_cost'] = 0.0

# Define fixed costs
trips_df.loc[trips_df['member_casual'] == 'member', 'trip_cost'] = 3.95
trips_df.loc[trips_df['member_casual'] == 'casual', 'trip_cost'] = 1.00

# Add extra cost for duration
# for members :
# Create condition for member rides longer than 45 mins
cond_member_extra = (trips_df['member_casual'] == 'member') & (trips_df['trip_duration_minutes'] > 45)

# Electric bike extra for members (only the minutes beyond 45 are charged)
trips_df.loc[cond_member_extra & (trips_df['rideable_type'] == 'electric_bike'), 'trip_cost'] += \
    (trips_df['trip_duration_minutes'] - 45) * 0.10

# Classic bike extra for members
trips_df.loc[cond_member_extra & (trips_df['rideable_type'] == 'classic_bike'), 'trip_cost'] += \
    (trips_df['trip_duration_minutes'] - 45) * 0.05

# Electric bike for casuals (every minute is charged)
cond_casual_electric = (trips_df['member_casual'] == 'casual') & (trips_df['rideable_type'] == 'electric_bike')
trips_df.loc[cond_casual_electric, 'trip_cost'] += trips_df['trip_duration_minutes'] * 0.15

# Classic bike for casuals
cond_casual_classic = (trips_df['member_casual'] == 'casual') & (trips_df['rideable_type'] == 'classic_bike')
trips_df.loc[cond_casual_classic, 'trip_cost'] += trips_df['trip_duration_minutes'] * 0.05
# NOTE: any other rideable_type keeps only the fixed cost — no per-minute rule matches it.

# Add Central Business District (CBD) fee
# Prepare the start/end point geometries.
# points_from_xy builds all points in one vectorized call, instead of creating
# one shapely Point per row through DataFrame.apply.
trips_df['start_point'] = gpd.points_from_xy(trips_df['start_lng'], trips_df['start_lat'])
trips_df['end_point'] = gpd.points_from_xy(trips_df['end_lng'], trips_df['end_lat'])

# Convert to GeoDataFrames with correct CRS, projected to EPSG:6933
start_gdf = gpd.GeoDataFrame(trips_df, geometry='start_point', crs='EPSG:4326').to_crs('EPSG:6933')
end_gdf = gpd.GeoDataFrame(trips_df, geometry='end_point', crs='EPSG:4326').to_crs('EPSG:6933')

# Load CBD polygon once and project to EPSG:6933 (same CRS as the trip points)
CBD = gpd.read_file('Homework/data/DDOT_Central_Business_District.geojson')
CBD = CBD.to_crs(epsg=6933)
cbd_polygon = CBD.geometry.unary_union  # Get full boundary

In [None]:
# Check spatial containment in EPSG:6933.
# GeoSeries.within tests every point against the polygon in one vectorized call.
trips_df['start_in_cbd'] = start_gdf.geometry.within(cbd_polygon)
trips_df['end_in_cbd'] = end_gdf.geometry.within(cbd_polygon)

# Final condition and cost update: a trip that starts OR ends inside the CBD pays the fee.
# NOTE: not idempotent — re-running this cell adds the 0.5 fee again.
trips_df['in_cbd'] = trips_df['start_in_cbd'] | trips_df['end_in_cbd']
trips_df.loc[trips_df['in_cbd'], 'trip_cost'] += 0.5
trips_df['trip_cost'].head()



In [None]:
# Summary statistics of the computed cost (look at max and std for outliers).
trips_df['trip_cost'].describe()

*We can see a clear issue in the data: there are extremely high values (a maximum of about 4.3 million) and the standard deviation is very high (4837.62), so we must identify these outliers and deal with them.*

In [None]:
# Columns shown when inspecting suspicious fares.
cost_cols = ['ride_id', 'trip_duration_minutes', 'rideable_type', 'member_casual', 'trip_cost']

# Trips costing more than 1000
high_cost = trips_df.loc[trips_df['trip_cost'] > 1000].copy()
print(high_cost[cost_cols])

# Trips with a negative cost
neg_cost = trips_df.loc[trips_df['trip_cost'] < 0].copy()
print(neg_cost[cost_cols])


In [None]:
# Cut-offs: anything above the high threshold or below zero counts as an outlier.
high_cost_threshold = 10000
negative_cost_threshold = 0

total_rows = len(trips_df)

# Boolean masks first, then the matching rows.
is_high_cost = trips_df['trip_cost'] > high_cost_threshold
is_negative_cost = trips_df['trip_cost'] < negative_cost_threshold
high_cost_outliers = trips_df[is_high_cost]
negative_cost_outliers = trips_df[is_negative_cost]

# Counts per category and overall
num_high_cost = len(high_cost_outliers)
num_negative_cost = len(negative_cost_outliers)
total_outliers = num_high_cost + num_negative_cost

# Shares of the full table, in percent
percent_high_cost = (num_high_cost / total_rows) * 100
percent_negative_cost = (num_negative_cost / total_rows) * 100
percent_total_outliers = (total_outliers / total_rows) * 100

print(f"High cost outliers: {num_high_cost} ({percent_high_cost:.2f}%)")
print(f"Negative cost outliers: {num_negative_cost} ({percent_negative_cost:.2f}%)")
print(f"Total outliers: {total_outliers} ({percent_total_outliers:.2f}%)")


In [None]:
# Row count and dtype of the cost column before dropping outliers.
trips_df['trip_cost'].info()

In [None]:
# Keep only trips whose cost lies within [negative_cost_threshold, high_cost_threshold]
# (both ends inclusive), replacing trips_df with the filtered frame.
cost_in_range = trips_df['trip_cost'].between(negative_cost_threshold, high_cost_threshold)
trips_df = trips_df[cost_in_range]
trips_df['trip_cost'].info()


In [None]:
# Cost statistics after outlier removal.
trips_df['trip_cost'].describe()

---
B4
---


In [None]:
# Columns available in the station table.
stations_df.columns

In [None]:
# Summary statistics of station capacity, used to choose the size classes.
stations_df['CAPACITY'].describe()

In [None]:

# Basic histogram using Plotly
fig = px.histogram(stations_df, x='CAPACITY', nbins=30, title='Distribution of Station Capacity')
fig.update_layout(xaxis_title='Capacity', yaxis_title='Count', bargap=0.1)
# staticPlot disables interactivity in the rendered figure.
fig.show(config={'staticPlot': True})


In [None]:
# Drop NaNs
capacity_data = stations_df['CAPACITY'].dropna()

# Fix the bin width explicitly so the KDE can be scaled to the bars.
# A whole-number width with edges on half-integers keeps integer capacities from
# landing unevenly across bins (assumes capacities are whole dock counts — TODO confirm).
capacity_range = capacity_data.max() - capacity_data.min()
bin_width = max(1, int(np.ceil(capacity_range / 30)))

# Histogram
hist_data = go.Histogram(
    x=capacity_data,
    xbins=dict(start=capacity_data.min() - 0.5, size=bin_width),
    name='Histogram',
    opacity=0.6
)

# Density Curve
# A density integrates to 1; expected count per bin = density * n * bin_width.
# (Scaling by the linspace step instead would shrink the curve far below the bars.)
kde = gaussian_kde(capacity_data)
x_vals = np.linspace(capacity_data.min(), capacity_data.max(), 1000)
kde_data = go.Scatter(x=x_vals, y=kde(x_vals) * len(capacity_data) * bin_width,
                      mode='lines', name='KDE Curve')

# Plot both
fig = go.Figure(data=[hist_data, kde_data])
fig.update_layout(title='Capacity Distribution with KDE',
                  xaxis_title='Capacity', yaxis_title='Count')

# Example thresholds: the same 33% / 66% quantiles that the classification cell uses
low_thresh = stations_df['CAPACITY'].quantile(0.33)
high_thresh = stations_df['CAPACITY'].quantile(0.66)
print(low_thresh,high_thresh)
fig.add_vline(x=low_thresh, line_dash="dash", line_color="green", annotation_text="Small/Average")
fig.add_vline(x=high_thresh, line_dash="dash", line_color="red", annotation_text="Average/Large")

fig.show(config={'staticPlot': True})


In [None]:
# Quantile cut points: 33% and 66% of the capacity distribution.
low_thresh, high_thresh = stations_df['CAPACITY'].quantile([0.33, 0.66])

def classify_capacity(cap):
    """Label a capacity 'Small', 'Average' or 'Large' using the quantile cut points.

    Values up to and including a cut point fall into the lower class; anything
    that matches neither bound (including NaN) is labelled 'Large'.
    """
    for upper_bound, label in ((low_thresh, 'Small'), (high_thresh, 'Average')):
        if cap <= upper_bound:
            return label
    return 'Large'

stations_df['STATION_SIZE'] = stations_df['CAPACITY'].map(classify_capacity)
stations_df['STATION_SIZE'].value_counts()



In [None]:
def classify_capacity(cap):
    """Label a station capacity with fixed cut points.

    Returns 'Small' for cap <= 15, 'Average' for 15 < cap <= 25 and 'Large'
    otherwise (a NaN capacity matches neither bound and is therefore 'Large').
    """
    for upper_bound, label in ((15, 'Small'), (25, 'Average')):
        if cap <= upper_bound:
            return label
    return 'Large'

# Overwrites the quantile-based STATION_SIZE with the fixed-threshold labels.
stations_df['STATION_SIZE'] = stations_df['CAPACITY'].apply(classify_capacity)
print(stations_df['STATION_SIZE'].value_counts())


In [None]:

# Capacity histogram with the fixed class boundaries (15 and 25) marked.
fig = px.histogram(stations_df, x='CAPACITY', nbins=30, title='Station Capacity Distribution')
fig.add_vline(x=15, line_dash="dash", line_color="green", annotation_text="Small/Average")
fig.add_vline(x=25, line_dash="dash", line_color="red", annotation_text="Average/Large")

fig.show(config={'staticPlot': True})


---
B5
---



In [None]:
# Load the shuttle and metro bus stop tables, then count nulls in the shuttle table.
Shuttle_Bus_Stops=pd.read_csv("Homework/data/Shuttle_Bus_Stops.csv")
Metro_Bus_Stops =pd.read_csv("Homework/data/Metro_Bus_Stops.csv")
Shuttle_Bus_Stops.isna().sum()

In [None]:
# Number of metro stops with a missing latitude.
Metro_Bus_Stops['BSTP_LAT'].isna().sum()


Approaches


---


| Approach                    | Time Complexity | Vectorized | Fast    |
| --------------------------- | --------------- | ---------- | ------- |
| Brute Force (Your original) | O(N × M)        | ❌ No       | 🐌 Slow |
| BallTree (New)              | O(N log M)      | ✅ Yes      | ⚡ Fast  |


Project all your coordinates to EPSG:6933


In [None]:

# Create start and end point geometries.
# points_from_xy builds all points in one vectorized call, instead of creating
# one shapely Point per row through DataFrame.apply.
trips_df['start_point'] = gpd.points_from_xy(trips_df['start_lng'], trips_df['start_lat'])
trips_df['end_point'] = gpd.points_from_xy(trips_df['end_lng'], trips_df['end_lat'])

# Create GeoDataFrames and project them from lat/lon to EPSG:6933
gdf_start = gpd.GeoDataFrame(trips_df, geometry='start_point', crs='EPSG:4326').to_crs(epsg=6933)
gdf_end = gpd.GeoDataFrame(trips_df, geometry='end_point', crs='EPSG:4326').to_crs(epsg=6933)

# Add projected x/y columns; the nearest-stop search reads these columns
trips_df['start_x'] = gdf_start.geometry.x
trips_df['start_y'] = gdf_start.geometry.y
trips_df['end_x'] = gdf_end.geometry.x
trips_df['end_y'] = gdf_end.geometry.y


# projecting   metro and shuttle station coordinates:

# Convert station lat/lng to projected coordinates
def project_coords(coords_list):
    """Project (lat, lon) pairs from EPSG:4326 to EPSG:6933.

    Parameters
    ----------
    coords_list : iterable of (lat, lon) pairs
        Note the order: latitude first, longitude second.

    Returns
    -------
    numpy.ndarray
        One (x, y) row per input pair, in EPSG:6933 coordinates.
    """
    # shapely Points take (x, y) = (lon, lat), hence the swap.
    points = gpd.GeoSeries(
        [Point(lon, lat) for lat, lon in coords_list], crs='EPSG:4326'
    ).to_crs(epsg=6933)
    return np.array([(point.x, point.y) for point in points])


In [None]:
# coords
# Metro stop coordinates as (lat, lon) rows; rows with a missing value are dropped
metro_coords = Metro_Bus_Stops[['BSTP_LAT', 'BSTP_LON']].dropna().values

# Shuttle stop coordinates as (lat, lon) rows
shuttle_coords = Shuttle_Bus_Stops[['LATITUDE', 'LONGITUDE']].dropna().values

# Project both stop sets to EPSG:6933, the same CRS as the trips' x/y columns
metro_coords_projected = project_coords(metro_coords)
shuttle_coords_projected = project_coords(shuttle_coords)

In [None]:

def euclidean_tree_batch(source_df, stop_coords, x_col, y_col, batch_size=10000):
    """Distance from every row of ``source_df`` to its nearest stop.

    Parameters
    ----------
    source_df : DataFrame holding the projected coordinates.
    stop_coords : array of (x, y) stop positions used to build the BallTree.
    x_col, y_col : names of the coordinate columns in ``source_df``.
    batch_size : number of rows queried per BallTree call.

    Returns
    -------
    list of float
        Nearest-stop Euclidean distance per row, in ``source_df`` row order
        and in the units of the supplied coordinates.
    """
    tree = BallTree(stop_coords, metric='euclidean')
    # Registers tqdm's pandas integration (kept as in the original cell).
    tqdm.pandas(desc=f"Computing distances for {x_col}")

    total_rows = len(source_df)
    nearest = []
    for start in tqdm(range(0, total_rows, batch_size), desc="Batch processing", unit="batch"):
        chunk_xy = source_df.iloc[start:start + batch_size][[x_col, y_col]].values
        # k=1: only the single closest stop; the stop index is not needed.
        chunk_dists, _ = tree.query(chunk_xy, k=1)
        nearest += chunk_dists.ravel().tolist()

    return nearest


In [None]:
# (output column, projected stop coordinates, x column, y column)
# Order matters only for the order in which the new columns are appended.
nearest_stop_jobs = [
    ('start_nearest_metro_distance', metro_coords_projected, 'start_x', 'start_y'),      # Start → Metro
    ('end_nearest_metro_distance', metro_coords_projected, 'end_x', 'end_y'),            # End → Metro
    ('start_nearest_shuttle_distance', shuttle_coords_projected, 'start_x', 'start_y'),  # Start → Shuttle
    ('end_nearest_shuttle_distance', shuttle_coords_projected, 'end_x', 'end_y'),        # End → Shuttle
]

for distance_col, stop_coords, x_col, y_col in nearest_stop_jobs:
    trips_df[distance_col] = euclidean_tree_batch(trips_df, stop_coords, x_col, y_col)


In [None]:
# Distribution of start-to-nearest-metro-stop distance (projected units).
trips_df['start_nearest_metro_distance'].describe()

In [None]:
# Distribution of end-to-nearest-metro-stop distance.
trips_df['end_nearest_metro_distance'].describe()

In [None]:
# Distribution of start-to-nearest-shuttle-stop distance.
trips_df['start_nearest_shuttle_distance'].describe()

In [None]:
# Distribution of end-to-nearest-shuttle-stop distance.
trips_df['end_nearest_shuttle_distance'].describe()

we will drop outliers

In [None]:
# Upper bounds on the nearest-stop distances; rows beyond them are treated as outliers.
start_nearest_metro_distance_thr=1550
end_nearest_metro_distance_thr=1600
start_nearest_shuttle_distance_thr=23000
end_nearest_shuttle_distance_thr=23200

outliers=[]
outliers.append(trips_df[trips_df['start_nearest_metro_distance'] > start_nearest_metro_distance_thr])
outliers.append(trips_df[trips_df['end_nearest_metro_distance'] > end_nearest_metro_distance_thr])
outliers.append(trips_df[trips_df['start_nearest_shuttle_distance'] > start_nearest_shuttle_distance_thr])
# Each column is compared with its own threshold: the end-shuttle column must use
# the end-shuttle bound, not the start-shuttle one.
outliers.append(trips_df[trips_df['end_nearest_shuttle_distance'] > end_nearest_shuttle_distance_thr])

# One count per column, in the order the thresholds are defined above.
for i in outliers :
  print("Outliers:", len(i))



In [None]:
# Keep only trips whose four nearest-stop distances are below their own thresholds.
# The end-shuttle column is compared with the end-shuttle bound
# (the start-shuttle bound was previously used for it by mistake).
trips_df = trips_df[
    (trips_df['start_nearest_metro_distance'] < start_nearest_metro_distance_thr) &
    (trips_df['end_nearest_metro_distance'] < end_nearest_metro_distance_thr) &
    (trips_df['start_nearest_shuttle_distance'] < start_nearest_shuttle_distance_thr) &
    (trips_df['end_nearest_shuttle_distance'] < end_nearest_shuttle_distance_thr)
]

In [None]:
# Fixed-seed sample so the plots are reproducible and quick to render.
sampled_df = trips_df.sample(n=20000, random_state=50)

cols = ['start_nearest_metro_distance', 'end_nearest_metro_distance',
        'start_nearest_shuttle_distance', 'end_nearest_shuttle_distance']

# One log-scaled histogram per distance column.
for col in cols:
    histogram = go.Histogram(
        x=sampled_df[col],
        nbinsx=100,
        marker=dict(color='skyblue'),
        opacity=0.75,
    )
    fig = go.Figure(data=[histogram])
    fig.update_layout(
        title=col,
        xaxis_title=col,
        yaxis_title='Count (Log Scale)',
        yaxis_type='log',
        bargap=0.1,
        width=800,
        height=400,
    )
    fig.show(config={'staticPlot':True})


---
B6
---


In [None]:
# Sanity check: show one start/end point and the Python types of the geometry objects.
print(trips_df['start_point'].iloc[0], type(trips_df['start_point'].iloc[0]))
print(trips_df['end_point'].iloc[0], type(trips_df['end_point'].iloc[0]))
print(type(cbd_polygon))


In [None]:
# STEP 0: Project the CBD layer to EPSG:6933 and merge it into one geometry
# (unary_union also covers the multi-polygon case).
CBD = CBD.to_crs(epsg=6933)
cbd_polygon = CBD.geometry.unary_union

# STEP 1: Rebuild the trip points from lat/lng in EPSG:4326 and project them
# to the same CRS as the CBD polygon.
start_gdf = gpd.GeoDataFrame(
    trips_df,
    geometry=gpd.points_from_xy(trips_df['start_lng'], trips_df['start_lat']),
    crs="EPSG:4326"
).to_crs(epsg=6933)

end_gdf = gpd.GeoDataFrame(
    trips_df,
    geometry=gpd.points_from_xy(trips_df['end_lng'], trips_df['end_lat']),
    crs="EPSG:4326"
).to_crs(epsg=6933)

# STEP 2: Check containment.
# point.within(polygon) is the same test as polygon.contains(point), evaluated
# for all points in one vectorized call.
trips_df['start_in_cbd'] = start_gdf.geometry.within(cbd_polygon)
trips_df['end_in_cbd']   = end_gdf.geometry.within(cbd_polygon)

# Final result: the trip touches the CBD if it starts or ends inside it
trips_df['in_cbd'] = trips_df['start_in_cbd'] | trips_df['end_in_cbd']
trips_df['in_cbd'].value_counts()

---
B7
---


In [None]:
# Centroid of the CBD polygon (cbd_polygon is already in EPSG:6933).
cbd_centroid = cbd_polygon.centroid

# End points rebuilt from lat/lng and projected to the centroid's CRS.
end_points_4326 = gpd.points_from_xy(trips_df['end_lng'], trips_df['end_lat'])
end_gdf = gpd.GeoDataFrame(trips_df, geometry=end_points_4326, crs="EPSG:4326").to_crs(epsg=6933)

# Straight-line distance from each trip's end point to the centroid.
trips_df['distance_to_cbd_m'] = end_gdf.geometry.distance(cbd_centroid)

# Trips that both start and end inside the CBD get a missing distance.
mask = trips_df['start_in_cbd'] & trips_df['end_in_cbd']
trips_df.loc[mask, 'distance_to_cbd_m'] = np.nan

# Inspect the result.
trips_df['distance_to_cbd_m'].describe()




**Thresholding strategies**





elbow method

In [None]:
# Fixed-seed sample keeps the KDE and the plot fast and reproducible.
sampled_df = trips_df.sample(n=20000, random_state=50)

# Extract the data (trips fully inside the CBD have no distance and are dropped)
data = sampled_df['distance_to_cbd_m'].dropna()

# Fix the bin width explicitly so the KDE can be scaled to the bars.
n_bins = 100
bin_width = (data.max() - data.min()) / n_bins

# Create histogram trace
hist = go.Histogram(
    x=data,
    xbins=dict(start=data.min(), size=bin_width),
    name='Histogram',
    marker_color='lightblue',
    opacity=0.75
)

# Create KDE line (manual since Plotly doesn’t support KDE directly)
# A density integrates to 1; expected count per bin = density * n * bin_width.
# (Scaling by the linspace step instead would shrink the curve far below the bars.)
kde = gaussian_kde(data)
x_vals = np.linspace(data.min(), data.max(), 1000)
kde_vals = kde(x_vals) * len(data) * bin_width

kde_trace = go.Scatter(
    x=x_vals,
    y=kde_vals,
    mode='lines',
    name='KDE',
    line=dict(color='darkblue')
)

# Vertical reference lines, drawn up to the KDE peak
vline1 = go.Scatter(
    x=[2000, 2000],
    y=[0, max(kde_vals)],
    mode='lines',
    name='2km Threshold',
    line=dict(color='red', dash='dash')
)

# NOTE(review): 2764 is hard-coded and labelled as the median — confirm it still
# matches the data if the filtering above changes.
vline2 = go.Scatter(
    x=[2764, 2764],
    y=[0, max(kde_vals)],
    mode='lines',
    name='Median',
    line=dict(color='green', dash='dash')
)

# Create the figure
fig = go.Figure(data=[hist, kde_trace, vline1, vline2])

# Update layout
fig.update_layout(
    title='Distance to CBD at End of Trip',
    xaxis_title='distance_to_cbd_m',
    yaxis_title='Count',
    width=800,
    height=500,
    legend=dict(x=0.7, y=0.95)
)

fig.show( config={'staticPlot':True})


In [None]:
"""
i will choose this beacause looking at the histogram we can see the counts drops
"""
# Distance cut-off for "close to CBD", in the units of distance_to_cbd_m.
threshold = 2764
# Apply binary classification
# Rows with a missing distance stay None instead of being forced to True/False.
trips_df['close_to_cbd'] = trips_df['distance_to_cbd_m'].apply(
    lambda d: None if pd.isna(d) else d <= threshold
)
trips_df['close_to_cbd'].value_counts()

In [None]:
# Number of trips with no close_to_cbd label (their distance is missing).
print(trips_df['close_to_cbd'].isna().sum())

---
B8
---




In [None]:
"""
Washington, D.C. is roughly:

~16 km (north-south)

~13 km (east-west)

So, a geohash precision of 5–8 is appropriate.
"""
def encode_geohashes(df, lat_col, lon_col, precisions):
    """Add one geohash column per requested precision.

    For each precision ``p`` a column named ``geohash_p{p}`` is written into
    ``df`` (the frame is modified in place) holding the geohash of
    (``lat_col``, ``lon_col``) for every row. The same frame is returned.
    """
    for precision in precisions:
        def encode_row(row, precision=precision):
            # Bind the current precision so each column uses its own value.
            return geohash2.encode(row[lat_col], row[lon_col], precision)

        df[f'geohash_p{precision}'] = df.apply(encode_row, axis=1)
    return df

# Try precisions from 5 to 8
precisions_to_test = [5, 6, 7, 8]
trips_df = encode_geohashes(trips_df, 'start_lat', 'start_lng', precisions_to_test)

# Number of distinct cells per precision:
#   - too few       -> over-aggregating
#   - too many (e.g. thousands) -> too fine, hard to summarize meaningfully
for p in precisions_to_test:
    print(f"Precision {p}: {trips_df[f'geohash_p{p}'].nunique()} unique regions")

# Median trips per cell shows how balanced the spatial bins are;
# ideally we want 50–500 trips per cell.
# (These notes are comments rather than bare string literals, so the last one is
# no longer echoed as the cell's output.)
for p in precisions_to_test:
    counts = trips_df[f'geohash_p{p}'].value_counts()
    print(f"Precision {p} → median trips per geohash: {counts.median()}")

| Precision | Median Trips per Geohash | Interpretation                                                     |
| --------- | ------------------------ | ------------------------------------------------------------------ |
| **5**     | 1761                     | ⚠️ Too coarse — merges many neighborhoods into one.                |
| **6**     | 196                      | ✅ Good balance — each area has enough trips for reliable analysis. |
| **7**     | 7                        | ⚠️ Very fine — may be too sparse for most practical summaries.     |
| **8**     | 2                        | 🚫 Too sparse — most areas will be noise or empty.                 |


In [None]:
# Precision 6 is chosen as the sector resolution; geohash_sector is a copy of geohash_p6.
trips_df['geohash_sector'] = trips_df['geohash_p6']


---

B9
---


In [None]:
# Make sure the join key used for grouping is a real datetime.
trips_df['date'] = pd.to_datetime(trips_df['date'])

# Trips per (sector, day).
daily_counts = (
    trips_df
    .groupby(['geohash_p6', 'date'])
    .size()
    .reset_index(name='trip_count')
)

# Mean of those daily counts per sector.
# Only days with at least one trip appear in daily_counts, so they are the only
# days that enter the mean.
avg_daily_trips = (
    daily_counts
    .groupby('geohash_p6')['trip_count']
    .mean()
    .reset_index()
    .rename(columns={'trip_count': 'avg_daily_trips'})
)


Choose Segmentation Method (for Red / Yellow / Gray)


| Method                         | Description                          | Pros             | Use Case             |
| ------------------------------ | ------------------------------------ | ---------------- | -------------------- |
| **Quantiles** (e.g., tertiles) | Divide into 3 equal-sized groups     | Simple, fair     | Balanced datasets    |
| **Natural Breaks (Jenks)**     | Optimize separation between clusters | Data-aware       | Uneven distributions |
| **KMeans Clustering (k=3)**    | Machine learning-based segmentation  | Optimal grouping | Large datasets       |


In [None]:
# Tertile cut points of the per-sector average daily trips.
quantiles = avg_daily_trips['avg_daily_trips'].quantile([1/3, 2/3])
low_thresh, high_thresh = quantiles.iloc[0], quantiles.iloc[1]

def classify_volume(val):
    """Map an average daily trip count to a colour segment.

    'gray' below the lower tertile, 'yellow' below the upper tertile, and
    'red' otherwise (a value equal to a cut point falls in the higher segment).
    """
    for upper_bound, colour in ((low_thresh, 'gray'), (high_thresh, 'yellow')):
        if val < upper_bound:
            return colour
    return 'red'

avg_daily_trips['volume_segment'] = avg_daily_trips['avg_daily_trips'].map(classify_volume)


In [None]:

# Extract the data
data = avg_daily_trips['avg_daily_trips'].dropna()

# Histogram trace
hist = go.Histogram(
    x=data,
    nbinsx=30,
    marker_color='lightblue',
    opacity=0.75,
    name='Avg Daily Trips'
)

fig = go.Figure(data=[hist])

# Vertical threshold lines.
# add_vline spans the full plot height. Drawing the lines up to
# data.value_counts().max() does not work: that is the count of the most
# repeated exact value, not the height of the tallest bar, so the lines
# would be almost invisible.
fig.add_vline(x=low_thresh, line_dash="dash", line_color="gray",
              annotation_text="Low Threshold", annotation_position="top left")
fig.add_vline(x=high_thresh, line_dash="dash", line_color="orange",
              annotation_text="High Threshold", annotation_position="top right")

# Update layout
fig.update_layout(
    title='Distribution of Avg Daily Trips per Geohash Sector',
    xaxis_title='Avg Daily Trips',
    yaxis_title='Count',
    width=800,
    height=500,
    bargap=0.1
)

fig.show(config={'staticPlot':True})


In [None]:
# One feature per sector: its average daily trip count.
X = avg_daily_trips[['avg_daily_trips']].values

kmeans = KMeans(n_clusters=3, random_state=42).fit(X)
avg_daily_trips['kmeans_label'] = kmeans.labels_

# KMeans cluster ids are arbitrary; order the clusters by centre value so the
# lowest centre becomes gray, the middle yellow and the highest red.
clusters_by_centre = np.argsort(kmeans.cluster_centers_.flatten())
label_map = {
    cluster_id: colour
    for cluster_id, colour in zip(clusters_by_centre, ['gray', 'yellow', 'red'])
}
avg_daily_trips['kmeans_segment'] = avg_daily_trips['kmeans_label'].map(label_map)


In [None]:
# Preview the per-sector averages with both segment labels.
avg_daily_trips.head()

In [None]:
# Number of distinct precision-6 sectors present in the trips.
trips_df['geohash_p6'].nunique()

In [None]:
# Merge segments into trips_df
# Left join keeps every trip; each trip gets its sector's two segment labels.
# NOTE: re-running this cell merges the label columns again with _x/_y suffixes.
trips_df = trips_df.merge(
    avg_daily_trips[['geohash_p6','volume_segment','kmeans_segment']],
    on='geohash_p6',
    how='left'
)


In [None]:
# Columns after adding the segment labels.
trips_df.columns

In [None]:
# Sector counts by quantile segment (rows) versus KMeans segment (columns).
comparison = pd.crosstab(avg_daily_trips['volume_segment'], avg_daily_trips['kmeans_segment'])
comparison


In [None]:
# Number of trips per KMeans segment.
trips_df['kmeans_segment'].value_counts()


In [None]:
# Number of trips per quantile segment.
trips_df['volume_segment'].value_counts()



---

B10
----

In [None]:
# Distinct weather condition strings, used to design the binning below.
trips_df['conditions'].value_counts()

In [None]:
def classify_weather(condition):
    """Bin a free-text weather description into a coarse segment.

    Matching is case-insensitive and checked in priority order:
    'rain' or 'snow' -> 'rainy'; 'overcast' or 'cloudy' -> 'cloudy';
    'clear' -> 'sunny'. Anything else returns 'unknown'.

    Non-string input (NaN or None, e.g. a trip with no matching weather row
    after the left merge) also returns 'unknown' instead of raising
    AttributeError on ``.lower()``.
    """
    if not isinstance(condition, str):
        return 'unknown'

    condition = condition.lower()  # lowercase for safety
    if 'rain' in condition or 'snow' in condition:
        return 'rainy'
    elif 'overcast' in condition or 'cloudy' in condition:
        return 'cloudy'
    elif 'clear' in condition:
        return 'sunny'
    else:
        return 'unknown'

# Apply binning: one coarse weather label per trip, then the label frequencies
trips_df['weather_segment'] = trips_df['conditions'].apply(classify_weather)
trips_df['weather_segment'].value_counts()

---

B11
---

In [None]:
# Sort the raw 'ended_at' values to see the time span covered by the data.
# NOTE(review): 'ended_at' is only converted to datetime in the next cell; if it is
# still text here, the sort and min/max are lexicographic — confirm the dtype.
sorted_ended_at_df = trips_df[['ended_at']].sort_values(by='ended_at')
print("--- Sorted 'ended_at' DataFrame (first 5 rows) ---")
print(sorted_ended_at_df.head())
print("\n")

# Find the earliest and latest values
earliest_date = sorted_ended_at_df['ended_at'].min()
latest_date = sorted_ended_at_df['ended_at'].max()

print(f"The earliest date in 'ended_at' is: {earliest_date}")
print(f"The latest date in 'ended_at' is: {latest_date}")


In [None]:
# Make sure 'ended_at' is datetime
# trips_df['ended_at'] = pd.to_datetime(trips_df['ended_at'])
# errors='coerce' turns unparseable values into NaT instead of raising.
trips_df['ended_at'] = pd.to_datetime(trips_df['ended_at'], format='mixed', errors='coerce')


# Extract just the date (without time)
trips_df['end_date'] = trips_df['ended_at'].dt.date
# Total trip cost per (end date, weather segment) — long format, one row per pair.
daily_income_weather = trips_df.groupby(['end_date', 'weather_segment'])['trip_cost'].sum().reset_index()



In [None]:
# convert
# Make sure end_date is datetime
daily_income_weather['end_date'] = pd.to_datetime(daily_income_weather['end_date'])

# Long format: a single frame, with colour splitting the lines by weather segment.
fig_long = px.line(
    daily_income_weather,
    x='end_date',
    y='trip_cost',
    color='weather_segment',
    title='Daily Total Trip Cost by Weather Condition (Long Format)',
    labels={'end_date': 'Date', 'trip_cost': 'Total Income', 'weather_segment': 'Weather'}
)

fig_long.update_layout(xaxis_title='Date', yaxis_title='Trip Cost', hovermode='x unified')
fig_long.show(config={'staticPlot':True})


In [None]:
# Wide format: one row per date, one column per weather segment.
# Dates with no trips for a segment become 0.
wide_df = (
    daily_income_weather
    .pivot(index='end_date', columns='weather_segment', values='trip_cost')
    .fillna(0)
    .sort_index()
)

# One line trace per weather segment column.
weather_traces = [
    go.Scatter(x=wide_df.index, y=wide_df[condition], mode='lines', name=condition)
    for condition in wide_df.columns
]
fig_wide = go.Figure()
fig_wide.add_traces(weather_traces)

fig_wide.update_layout(
    title='Daily Total Trip Cost by Weather Condition (Wide Format)',
    xaxis_title='Date',
    yaxis_title='Trip Cost',
    hovermode='x unified',
    template='plotly_white',
    legend_title='Weather',
)

fig_wide.show(config={'staticPlot':True})


Which one is better for our problem  ?
answer here :

---
B12
---

In [None]:
# feature 1 : rush_hour
# Indicates if the ride occurred during typical commuting hours (7–10 AM or 4–7 PM).
trips_df['start_time'] = pd.to_datetime(trips_df['start_time'], errors='coerce')

# Series.between is inclusive on both ends and .dt.hour is the whole hour, so
# 07:00–09:59 is hours 7..9 and 16:00–18:59 is hours 16..18.
# (between(7, 10) / between(16, 19) would also include 10:00–10:59 and 19:00–19:59.)
start_hour = trips_df['start_time'].dt.hour
trips_df['rush_hour'] = (
    start_hour.between(7, 9) |
    start_hour.between(16, 18)
).astype(int)
trips_df['rush_hour'].value_counts()


In [None]:
# feature 2 : hour_segment
# Categorize ride start times into broader buckets.
def get_hour_segment(hour):
    """Map an hour of day (0-23) to a coarse time-of-day bucket.

    Returns 'Morning' for 5-11, 'Midday' for 12-16, 'Evening' for 17-20 and
    'Night' for every other hour. A missing hour (NaN, e.g. from an
    unparseable timestamp) returns None instead of being counted as 'Night'.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # hour would fall through to the final 'Night' branch.
    if pd.isna(hour):
        return None
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Midday'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

# Label every trip with the time-of-day bucket of its start hour.
start_hours = trips_df['start_time'].dt.hour
trips_df['hour_segment'] = start_hours.map(get_hour_segment)
trips_df['hour_segment'].value_counts()




In [None]:
# feature 3 : is_weekend
# Separates weekend from weekday usage. pandas numbers the days
# 0 (Monday) .. 6 (Sunday), so values >= 5 are Saturday and Sunday.
weekend_mask = trips_df['start_time'].dt.dayofweek >= 5
trips_df['is_weekend'] = weekend_mask.astype(int)
trips_df['is_weekend'].value_counts()



In [None]:
# feature 4 : ride_density_zone
# Flags trips that start close to a metro station as 'High Density'.
# NOTE(review): the 0.5 threshold is in whatever unit
# `start_nearest_metro_distance` uses; a later histogram labels this column as
# meters, which would make 0.5 extremely strict. Confirm the unit.
# Rows with a missing distance compare False and are labelled 'Low Density'.
near_metro = trips_df['start_nearest_metro_distance'] < 0.5
trips_df['ride_density_zone'] = np.where(near_metro, 'High Density', 'Low Density')
trips_df['ride_density_zone'].value_counts()


In [None]:


# Save the feature-engineered trips_df DataFrame to a CSV file on Google Drive.
# The path lives under /content/drive, so the Drive mount cell must have run.
output_path = '/content/drive/My Drive/BikeShare/trips_df_9-6.csv'

# Create the target folder when it is missing. exist_ok=True replaces the
# separate exists-check, which can race with another process creating the folder.
# NOTE: this reassigns `output_dir`, which the download cell also uses.
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# Save the DataFrame without the index column
trips_df.to_csv(output_path, index=False)

print(f"trips_df successfully saved to {output_path}")



---


#**EDA**


---





---
Sampling the data
---



In [None]:
# Work on a fixed-size random sample for the EDA plots; the seed pins the
# sample so reruns draw the same rows.
# To sanity-check the sample, compare `.describe()` of
# `trip_duration_minutes` on trips_df and on sampled_df.
SAMPLE_SIZE = 20000
SAMPLE_SEED = 50
sampled_df = trips_df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)




---

# A )


---





---

# B)

---





---
Task 1
---



| Method                     | Formula                         | Notes                               |
| -------------------------- | ------------------------------- | ----------------------------------- |
| **Sturges’ Rule**          | `bins = ceil(log2(n) + 1)`      | Good for small to medium-sized data |
| **Freedman–Diaconis Rule** | `bin_width = 2 * IQR / n^(1/3)` | Good for skewed data or outliers    |
| **Square Root Rule**       | `bins = sqrt(n)`                | Simple and often a good baseline    |


In [None]:

# Histogram of trip durations, with the bin count taken from the
# Freedman–Diaconis rule.
# Use the sampled dataframe to avoid memory issues. Missing durations are
# dropped first: np.percentile returns NaN as soon as one NaN is present, and
# int(NaN) raises.
durations = sampled_df['trip_duration_minutes'].dropna()

# Freedman–Diaconis rule for bin width: 2 * IQR / n^(1/3)
q25, q75 = np.percentile(durations, [25, 75])
iqr = q75 - q25
n = len(durations)
bin_width = 2 * iqr / (n ** (1/3))

# A width that is not strictly positive (e.g. IQR == 0) cannot be converted
# into a bin count; fall back to a single bin instead of dividing by it.
data_range = durations.max() - durations.min()
bin_count = int(np.ceil(data_range / bin_width)) if bin_width > 0 else 1

print(f"Suggested bin count: {bin_count}")

# Static histogram
fig = go.Figure(
    data=[go.Histogram(
        x=durations,
        nbinsx=bin_count,
        marker_color='blue',
        opacity=1.0
    )]
)

fig.update_layout(
    title="Distribution of Trip Duration (in Minutes)",
    xaxis_title="Trip Duration (minutes)",
    yaxis_title="Frequency",
    bargap=0.05,
    template='simple_white'
)

fig.show(config={'staticPlot': True})


Same histogram with an outlier cutoff marked (trips above the cutoff are counted, not removed):

In [None]:
# Trip-duration histogram with a vertical marker at an outlier cutoff, plus a
# count of the trips above that cutoff in the sample and in the full data.
# numpy (np) and plotly.graph_objects (go) come from the import cell at the top.

# Choose your cutoff (in minutes)
cutoff = 1440  # Modify as needed

# Use the sampled dataframe to avoid memory issues. Missing durations are
# dropped first: np.percentile returns NaN as soon as one NaN is present.
durations = sampled_df['trip_duration_minutes'].dropna()

# Freedman–Diaconis rule for bin width: 2 * IQR / n^(1/3)
q25, q75 = np.percentile(durations, [25, 75])
iqr = q75 - q25
n = len(durations)
bin_width = 2 * iqr / (n ** (1/3))

# A width that is not strictly positive (e.g. IQR == 0) cannot be converted
# into a bin count; fall back to a single bin instead of dividing by it.
data_range = durations.max() - durations.min()
bin_count = int(np.ceil(data_range / bin_width)) if bin_width > 0 else 1

print(f"Suggested bin count: {bin_count}")

# Create the histogram
fig = go.Figure()

# Histogram of durations
fig.add_trace(go.Histogram(
    x=durations,
    nbinsx=bin_count,
    marker_color='blue',
    opacity=1.0,
    name="Trip Durations"
))

# Vertical cutoff line. add_vline spans the full height of the plot area, so
# the marker no longer depends on a hand-computed y value
# (value_counts().max() is the count of the most frequent exact duration,
# not the height of the tallest bar).
fig.add_vline(
    x=cutoff,
    line_color="red",
    line_width=2,
    line_dash="dash",
    annotation_text=f"Cutoff = {cutoff} min",
    annotation_position="top"
)

fig.update_layout(
    title="Distribution of Trip Duration (in Minutes) with Cutoff",
    xaxis_title="Trip Duration (minutes)",
    yaxis_title="Frequency",
    bargap=0.05,
    template='simple_white'
)

fig.show(config={'staticPlot': True})

# Count how many trips exceed the cutoff
sampled_exceed = (sampled_df['trip_duration_minutes'] > cutoff).sum()
full_exceed = (trips_df['trip_duration_minutes'] > cutoff).sum()

print(f"Trips in sampled_df exceeding {cutoff} minutes: {sampled_exceed}")
print(f"Trips in trips_df exceeding {cutoff} minutes: {full_exceed}")




---
Task2
---



In [None]:
# Box plot of the original (not divided) trip durations, one box per rideable type.
fig = go.Figure()

# A single groupby pass replaces one boolean mask over the whole sample per
# bike type. sort=False keeps the groups in order of first appearance, which is
# the order .unique() gives. Rows with a missing rideable_type are skipped
# instead of producing an empty box.
for bike_type, type_trips in sampled_df.groupby('rideable_type', sort=False):
    fig.add_trace(go.Box(
        y=type_trips['trip_duration_minutes'],
        name=bike_type,
        boxpoints='outliers',  # show outliers only
        marker_color='green',
        line_color='black',
        opacity=0.8
    ))

fig.update_layout(
    title="Box Plot of Trip Duration by Rideable Type",
    yaxis_title="Trip Duration (minutes)",
    xaxis_title="Rideable Type",
    template='simple_white'
)

# Render statically to avoid Colab issues
fig.show(config={'staticPlot': True})




---
Task3
---



In [None]:
# Number of trips per rider category in the full data set.
trips_df['member_casual'].value_counts()

In [None]:
# Box plot of the original (not divided) trip durations, one box per
# membership type (the `member_casual` column).
# This cell used to repeat the rideable_type plot from Task2 unchanged; it now
# groups by member_casual, which is what this task compares.
fig = go.Figure()

# sort=False keeps the groups in order of first appearance in the sample.
for rider_type, rider_trips in sampled_df.groupby('member_casual', sort=False):
    fig.add_trace(go.Box(
        y=rider_trips['trip_duration_minutes'],
        name=rider_type,
        boxpoints='outliers',  # show outliers only
        marker_color='green',
        line_color='black',
        opacity=0.8
    ))

fig.update_layout(
    title="Box Plot of Trip Duration by Membership Type",
    yaxis_title="Trip Duration (minutes)",
    xaxis_title="Membership Type",
    template='simple_white'
)

# Render statically to avoid Colab issues
fig.show(config={'staticPlot': True})


dealing with outliers

In [None]:
# Tukey fences for trip duration: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers.
trip_minutes = sampled_df['trip_duration_minutes']
Q1, Q3 = trip_minutes.quantile(0.25), trip_minutes.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

for bound_name, bound_value in (("Lower Bound", lower_bound), ("Upper Bound", upper_bound)):
    print(f"{bound_name}: {bound_value}")




---
Task4
---



In [None]:
# Count trips longer than one day and find the stations they involve.

# Define threshold: 1 day = 1440 minutes
one_day_minutes = 1440

# Trips longer than 1 day in the full data and in the EDA sample.
# long_trips_df must come from trips_df: it is reported below as "full data"
# and was previously filtered from sampled_df, so both counts were identical.
long_trips_df = trips_df[trips_df['trip_duration_minutes'] > one_day_minutes]
long_sampled_df = sampled_df[sampled_df['trip_duration_minutes'] > one_day_minutes]
# Show how many there are
print(f"Total number of trips longer than 1 day in full data: {len(long_trips_df)}")
print(f"Total number of trips longer than 1 day in sampled data: {len(long_sampled_df)}")

# Per-station involvement in long trips: appearances as start station plus
# appearances as end station. fill_value=0 covers stations seen on one side only.
start_counts = long_trips_df['start_station_id'].value_counts()
end_counts = long_trips_df['end_station_id'].value_counts()
total_counts = start_counts.add(end_counts, fill_value=0).astype(int)

# Station lookup: name and location per station id, taken from the same full
# data as the counts so that no counted station is missing from the lookup.
# One row is kept per station id, so a station whose recorded name or
# coordinates vary between trips is not listed (and mapped) more than once.
stations = (
    trips_df[['start_station_id', 'start_station_name', 'start_lat', 'start_lng']]
    .drop_duplicates(subset='start_station_id')
    .rename(columns={
        'start_station_id': 'station_id',
        'start_station_name': 'station_name',
        'start_lat': 'lat',
        'start_lng': 'lng'
    })
)

# Attach the counts; stations without any long trip get 0
stations['long_trip_count'] = stations['station_id'].map(total_counts).fillna(0).astype(int)

# Keep only stations with at least 1 long trip
stations = stations[stations['long_trip_count'] > 0]


In [None]:
# Long-trip count per station (only stations with at least one long trip remain).
stations['long_trip_count']

In [None]:
# Interactive map of the stations involved in trips longer than one day.
# The view is centered on Washington DC.
dc_center = [38.9072, -77.0369]
m = folium.Map(location=dc_center, zoom_start=12, tiles='cartodbpositron')

# Markers go into a cluster layer rather than directly onto the map.
marker_cluster = MarkerCluster().add_to(m)

# One circle per station; the radius grows with the square root of the count
# so busy stations stand out without dominating the map.
station_columns = zip(
    stations['lat'],
    stations['lng'],
    stations['station_name'],
    stations['long_trip_count'],
)
for lat, lng, station_name, long_trip_count in station_columns:
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=3 + long_trip_count**0.5,
        color='darkred',
        fill=True,
        fill_color='crimson',
        fill_opacity=0.7,
        popup=f"{station_name}<br>Trips > 1 day: {long_trip_count}",
    )
    marker.add_to(marker_cluster)

# Last expression of the cell: renders the map
m




---

# C)

---





---
Task1
---



In [None]:
# Number of distinct trip_cost values in the sample. A bare expression is only
# displayed when it is the last line of a cell, so the count is printed.
# dropna=False counts NaN as one value, matching len(Series.unique()).
distinct_cost_count = sampled_df['trip_cost'].nunique(dropna=False)
print(f"Distinct trip_cost values: {distinct_cost_count}")

# Make sure start_time is datetime64 for the time-based grouping in this section.
sampled_df['start_time'] = pd.to_datetime(sampled_df['start_time'])

In [None]:
# Distribution of trip cost in the sample. plotly.express (px) comes from the
# import cell at the top of the notebook.

# cost Histogram
# NOTE(review): nbins=141 is hard-coded; presumably the number of distinct
# trip_cost values shown by the previous cell. Confirm.
fig = px.histogram(sampled_df, x='trip_cost', nbins=141, title='Distribution of trip cost')
fig.show()

# cost Boxplot
fig = px.box(sampled_df, y='trip_cost', title='Boxplot of trip cost')
fig.show()


- نلاحظ ان اغلب الداتا متوزعة بين ال0 - وال10 دولار بكثرة وان القمة بين 3.5 و4 وهذا يدل على انه يوجد الكثير من الناس مشتركة واغلب الرحل لا تتجاوز ال45 دقيقة

- وايضا يوجد قيم اكبر صحيح انها نادرة ولكنها متوزعة وهذا يدل انه يوجد اشخاص تاخذها لمسافات كبيرة ولكنها قليلة  
- غالبا الرحل ذات تكلفة العالية اشخاص غير مشتركين بالاضافة الى انهم قد يكونون مرة واحدة فقط يستخدمون الدراجات ولا يعودون الى استخدامها بعد تجربة الخدمة ورؤية السعر




---
task2
---



In [None]:
# Trip cost against trip duration, with an ordinary-least-squares trend line.
# Other trendline options: 'lowess', 'rolling', 'ewm', 'expanding'.
fig = px.scatter(
    sampled_df,
    x='trip_duration_minutes',
    y='trip_cost',
    trendline='ols',
    title='The relation between duration and cost'
)
fig.show()



*   النقاط التي قيمتها قريبة من الصفر كوقت هي تمثل الاعضاء التي لديهم اشتراك ولم يتجاوزوا ال45 دقيقة وكما نلاحظ هم كثر
*   ولدينا ثلاث توزعات للنقاط وذلك يعود بسبب الاشتراك او عدمه وحتى مروره بالمنطقة التجارية






---
Task3
---




In [None]:
# Trip cost against temperature, colored by membership type.
fig = px.scatter(
    sampled_df,
    x='temp',
    y='trip_cost',
    color='member_casual',
    title='Cost vs temperature'
)
fig.show()



*   اغلب الرحلات تكون بين 5 درجات وال20 درجة
*   عندما تكون درجة الحرارة فوق ال20 نلاحظ ان عدد الرحلات قليل
* كما نلاحظ اغلب رحل المشتركين الكلفة غالبا اقل من 10 دولار
* نلاحظ ان اغلب الكلف العالية من الغير المشتركين
* لا يوجد علاقة واضحة بين درجة الحرارة والتكلفة لكن يمكن القول ان بين ال 5 -15 يمكن للناس ان تذهب برحلات أطول






---
Task4
---




In [None]:
# Daily revenue: sum of trip_cost per calendar day of the trip start.
# The grouping key keeps the Series name 'start_time', so that is the name of
# the date column after reset_index.
trip_day = sampled_df['start_time'].dt.date
daily_totals = sampled_df.groupby(trip_day)['trip_cost'].sum()
daily_rev = daily_totals.reset_index(name='revenue')
fig = px.line(daily_rev, x='start_time', y='revenue', title='daily incomes')
fig.show()

# Weekly revenue: sum of trip_cost per ISO week number.
# NOTE(review): ISO week numbers carry no year, and the last days of December
# can belong to week 1 of the next year; confirm the sample's date range makes
# this harmless.
sampled_df['week'] = sampled_df['start_time'].dt.isocalendar().week
weekly_totals = sampled_df.groupby('week')['trip_cost'].sum()
weekly_rev = weekly_totals.reset_index(name='revenue')
fig = px.line(weekly_rev, x='week', y='revenue', title='weekly incomes')
fig.show()



*   بالنسبة للايرادات اليومية نلاحظ وجود تذبذب بين هبوط وصعود ومع استمرار الايام نلاحظ زيادة بالدخل ونلاحظ تناوب بين صعود وهبوط في الايام ويعود هذا الامر اتوقع انو شخص يلي بيركب يوم بريح اليوم يلي بعدو

*   لدينا بشهر april هبوط واضح في الربح السبب قد يعود الى عدم وجود داتا كافية في هذا الشهر

* بالنسبة للايرادات الاسبوعية ملاحظ انه بشكل عام الامور نحو زيادة حيث ان هذا التذبذب راح بسبب انو الاسبوعي عطانا الشكل العام بالاسبوع فاصبح خط  اكثر انسيابية

* بشكل عام يوجد مشكلة في شهر april





---
Task5
---




In [None]:
# Mean trip_cost per start month. Despite the 'avg_revenue' name this is the
# average cost of a single trip, not the month's total revenue.
monthly_rev = (
    sampled_df.groupby('start_month')['trip_cost']
    .mean()
    .rename('avg_revenue')
    .reset_index()
)
fig = px.line(monthly_rev, x='start_month', y='avg_revenue', title='average month income')
fig.show()

* يبين لنا المخطط تطور متوسط تكلفة الرحلة الواحدة خلال شهر الاول كان متوسط تكلفة الرحلة ما يقارب 3.78 دولار مع دخول الشهر الثاني نلاحظ ارتفاع طفيف ويستمر الارتفاع بشكل طفيف حتى الشهر الثالث هذا النمو التدريجي يوحي بان شيئاً ما كان يتغير ببطء وثبات ربما كان المستخدمون يميلون لأخذ رحلات أطول قليلًا، أو أن هناك زيادة طفيفة في استخدام الدراجات ذات التكلفة الأعلى، أو ربما كان هناك تزايد في الرحلات التي تتخطى الحدود الزمنية المجانية للمشتركين وتتحمل رسومًا إضافية. هذه الزيادة، وإن كانت صغيرة، تشير إلى أن قيمة الرحلة الواحدة كانت في ازدياد
* ثم نصل الى شهر الرابع نلاحظ قفزة في متوسط الرحلة الواحدة بشكل ملحوظ حيث وصل ال4 دولار مسجل اعلى متوسط خلال هذه الفترة قد يبدو للحظة ان الامر جيد ولكن مع النظر الى مخطط اليومي والاسبوعي فنحدد شهدنا هبوط في هذا الشهر وقد يعود سبب الهبوط في رفع سعر الرحلة مما ادى الانهيار الخدمة انهياراً كارثياً

* وايضا ممكن هذا الارتفاع اتى بما انه عدد الرحلات الاجمالية في الشهر الرابع قليلة فوجود قيم شاذة او مرتفعة كما شهدنا في مخطط كلف الرحل سيرفع متوسط كلفة الرحلة بهذا الشكل


---

# **الخلاصة**


*   كانت خدمة مشاركة الدراجات تشهد نموًا مستمرًا في إجمالي إيراداتها وفي قيمة الرحلة الواحدة من يناير وحتى منتصف مارس.
*   مع ذلك، في أواخر مارس/أوائل أبريل، قد تكون الشركة تعرضت لحدث جسيم (إما إغلاق، أو تعليق، أو عطل كبير في النظام) أدى إلى توقف شبه كامل لجميع الرحلات والإيرادات, او قد يكون بسبب رفع رسوم الرحلة
* القفزة في متوسط تكلفة الرحلة في أبريل، على الرغم من أنها تبدو إيجابية في هذا الرسم البياني بمفرده، هي في الواقع مجرد انعكاس لحقيقة أن الرحلات القليلة جدًا المتبقية كانت هي الأكثر تكلفة، مما يلقي الضوء على الوضع الكارثي للخدمة في هذا الشهر.

* قد يكون سبب اخذ قرار الشركة برفع انها كانت تحاول رفع الرسوم في الاشهر الاولى ولكن بشكل طفيف وعندما وجدت ان المبيعات تزاد قامت بهذه الرفعة ظنا منها انه اصبح لديها قاعدة جماهيرية كبيرة وان المستخدمين بازدياد لتفاجئ بحصول عكس ذلك تماما
* كل هذه الامور هي مجرد تفسيرات ممكنة

* قد يكون سبب الارتفاع هو وجود تضخم




---

# D)

---





---
Task1
---




In [None]:
# Load the Residential and Visitor Parking Zones polygons.
# (The data-loading cell reads the same GeoJSON file into parking_zones_gdf.)
parking_zones_path = 'Homework/data/Residential_and_Visitor_Parking_Zones.geojson'
Residential_Visitor_Parking_Zones = gpd.read_file(parking_zones_path)


In [None]:
# Heatmap of trip start/end points that fall inside residential parking zones,
# plus a count of the sampled trips that touch no zone at all.

# Zones reprojected to EPSG:4326, the CRS assigned to the trip points below.
res_zones = Residential_Visitor_Parking_Zones.to_crs(epsg=4326)

# Point layers for trip starts and trip ends (x = longitude, y = latitude).
start_gdf = gpd.GeoDataFrame(
    sampled_df,
    geometry=gpd.points_from_xy(sampled_df['start_lng'], sampled_df['start_lat']),
    crs='EPSG:4326',
)
end_gdf = gpd.GeoDataFrame(
    sampled_df,
    geometry=gpd.points_from_xy(sampled_df['end_lng'], sampled_df['end_lat']),
    crs='EPSG:4326',
)

# Inner spatial joins keep only the points that lie within a zone polygon.
# NOTE(review): a point inside several overlapping zones would appear once per
# zone; confirm the zones do not overlap.
start_in_res = gpd.sjoin(start_gdf, res_zones, how='inner', predicate='within')
end_in_res = gpd.sjoin(end_gdf, res_zones, how='inner', predicate='within')

# Bring both point sets to a common lat/lon schema and stack them.
start_to_common = {'start_lat': 'lat', 'start_lng': 'lon'}
end_to_common = {'end_lat': 'lat', 'end_lng': 'lon'}
res_start_points = start_in_res[list(start_to_common)].rename(columns=start_to_common)
res_end_points = end_in_res[list(end_to_common)].rename(columns=end_to_common)
res_points = pd.concat([res_start_points, res_end_points], ignore_index=True)

# A trip counts as residential when its start or its end matched a zone;
# every other sampled trip is outside.
trip_ids_with_res = set(start_in_res['ride_id']) | set(end_in_res['ride_id'])
outside_mask = ~sampled_df['ride_id'].isin(trip_ids_with_res)
non_res_trip_count = len(sampled_df[outside_mask])

# Density heatmap centered on the mean of the matched points.
heatmap_center = {'lat': res_points['lat'].mean(), 'lon': res_points['lon'].mean()}
fig = px.density_mapbox(
    res_points,
    lat='lat',
    lon='lon',
    radius=10,
    center=heatmap_center,
    zoom=11,
    mapbox_style='carto-positron',
    title='Geographic Heatmap of Trips to Residential Zones',
)

fig.update_layout(
    dragmode=False,
    margin={"r":0,"t":30,"l":0,"b":0},
)

fig.show(config={"staticPlot": True})  # disables all interactivity

print(f"Total number of trips outside residential zones: {non_res_trip_count}")




---
Task2
---



In [None]:
# Trips per geographic sector (precision-6 geohash), most frequent first.
geohash_counts = (
    sampled_df['geohash_p6']
    .value_counts()
    .rename_axis('geohash_p6')
    .reset_index(name='trip_count')
    .sort_values(by='trip_count', ascending=False)
)

# Bar chart with one bar per sector
sector_labels = {'geohash_p6': 'Geographic Sector', 'trip_count': 'Number of Trips'}
fig = px.bar(
    geohash_counts,
    x='geohash_p6',
    y='trip_count',
    labels=sector_labels,
    title='Distribution of Trips by Geographic Sector (Geohash_p6)',
)

# Static rendering: interactivity turned off
fig.show(config={'staticPlot': True})




---
Task3
---



In [None]:

# Histograms of three distance features, built from one specification table:
# (column, number of bins, title, x-axis label).
# 1. distance to the CBD, 2. nearest metro station, 3. nearest shuttle station.
distance_histogram_specs = [
    ('distance_to_cbd_m', 40,
     'Distribution of Distance to CBD (m)', 'Distance to CBD (meters)'),
    ('start_nearest_metro_distance', 30,
     'Distribution of Distance to Nearest Metro Station', 'Distance to Metro (meters)'),
    ('start_nearest_shuttle_distance', 30,
     'Distribution of Distance to Nearest Shuttle Station', 'Distance to Shuttle (meters)'),
]

distance_figures = []
for column, bin_total, chart_title, axis_label in distance_histogram_specs:
    histogram = px.histogram(
        sampled_df,
        x=column,
        nbins=bin_total,
        title=chart_title,
        labels={column: axis_label},
    )
    histogram.show(config={'staticPlot': True})
    distance_figures.append(histogram)

# Keep the individual figure names available for later cells.
fig1, fig2, fig3 = distance_figures




---
Task4
---



In [None]:

# Categorize trips
def classify_trip(row):
    """Label a trip by whether both of its ends lie in the CBD.

    `row` is any mapping-like object exposing 'start_in_cbd' and 'end_in_cbd'.
    Returns 'Fully in CBD' when both flags equal 1; every other trip,
    including one with only one end in the CBD, is 'Outside CBD'.
    """
    fully_inside = row['start_in_cbd'] == 1 and row['end_in_cbd'] == 1
    return 'Fully in CBD' if fully_inside else 'Outside CBD'

# Classify every sampled trip, then count the trips per category.
sampled_df['cbd_trip_type'] = sampled_df.apply(classify_trip, axis=1)

trip_cbd_counts = (
    sampled_df['cbd_trip_type']
    .value_counts()
    .rename_axis('Trip Type')
    .reset_index(name='Count')
)

# Bar chart with the count written above each bar
fig = px.bar(
    trip_cbd_counts,
    x='Trip Type',
    y='Count',
    text='Count',
    labels={'Count': 'Number of Trips'},
    title='Trips Fully in CBD vs Outside',
)
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_title='Trip Category', yaxis_title='Number of Trips')
fig.show(config={'staticPlot': True})


In [None]:
# CBD trip categories on the full data set, with each category's share.
# The flags are compared column-wise instead of calling classify_trip once per
# row: apply(axis=1) runs a Python call for every row of trips_df.
# The labels and the rule (both ends == 1) must stay identical to classify_trip.
both_ends_in_cbd = (trips_df['start_in_cbd'] == 1) & (trips_df['end_in_cbd'] == 1)
trips_df['cbd_trip_type'] = np.where(both_ends_in_cbd, 'Fully in CBD', 'Outside CBD')

full_trip_cbd_counts = trips_df['cbd_trip_type'].value_counts().reset_index()
full_trip_cbd_counts.columns = ['Trip Type', 'Count']
full_trip_cbd_counts['Percentage'] = (full_trip_cbd_counts['Count'] / full_trip_cbd_counts['Count'].sum()) * 100
full_trip_cbd_counts



---
Task5
---



In [None]:

# Trips that touch the CBD: the start or the end lies inside it.
touches_cbd = sampled_df['start_in_cbd'].eq(1) | sampled_df['end_in_cbd'].eq(1)
cbd_passed_df = sampled_df[touches_cbd]

# Number of such trips per (bike type, membership) combination
grouped = (
    cbd_passed_df
    .groupby(['rideable_type', 'member_casual'])
    .size()
    .reset_index(name='trip_count')
)

# Grouped bars: one group per bike type, one bar per membership type
cbd_chart_labels = {'trip_count': 'Number of Trips', 'rideable_type': 'Bike Type'}
fig = px.bar(
    grouped,
    x='rideable_type',
    y='trip_count',
    color='member_casual',
    barmode='group',
    labels=cbd_chart_labels,
    title='Trips That Passed Through CBD by Rideable Type and Membership',
)

# The explicit axis titles below take precedence over the `labels` mapping.
fig.update_layout(yaxis_title='Number of Trips', xaxis_title='Rideable Type')
fig.show(config={'staticPlot': True})


In [None]:
# Full-data counterpart of the sampled CBD chart: trips whose start or end lies
# in the CBD, their share of all trips, and the breakdown by bike type and
# membership.
cbd_passed_df_trips_df = trips_df[
    (trips_df['start_in_cbd'] == 1) | (trips_df['end_in_cbd'] == 1)
]

# Group by rideable_type and member_casual.
# NOTE: this overwrites `grouped` from the sampled chart cell above.
grouped = cbd_passed_df_trips_df.groupby(['rideable_type', 'member_casual']).size().reset_index(name='trip_count')

print(f"Length of cbd_passed_df_trips_df: {len(cbd_passed_df_trips_df)}")
print(f"Length of trips_df: {len(trips_df)}")

percentage = (len(cbd_passed_df_trips_df) / len(trips_df)) * 100
print(f"Percentage of cbd_passed_df_trips_df compared to trips_df: {percentage:.2f}%")

# The breakdown was computed but never shown; display it as the cell's result.
grouped



---
Task6
---



In [None]:
# Contingency table for the chi-square test: number of trips for every
# combination of CBD-proximity segment (rows) and membership type (columns).
# Both inputs are treated as categorical labels.
contingency_table = pd.crosstab(
    index=trips_df['close_to_cbd'],
    columns=trips_df['member_casual'],
)
contingency_table


In [None]:
# Run chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi2 Statistic:", chi2)
print("Degrees of Freedom:", dof)
print("P-value:", p)
# interpretion based on the p-value:

if p < 0.05:
    print("✅ There is a significant correlation between distance to CBD segments and membership type.")
else:
    print("❌ No significant correlation found between distance to CBD segments and membership type.")


| α Value  | Interpretation                                                                |
| -------- | ----------------------------------------------------------------------------- |
| **0.05** | Most common — means you're willing to accept a 5% chance of a false positive. |
| 0.01     | Stricter — used in more critical fields (medicine, etc.).                     |
| 0.10     | Looser — sometimes used in exploratory research.                              |


Member trips are more common outside the CBD (proportionally).

Casual riders are slightly more concentrated inside the CBD, which makes sense:

Casuals may be tourists or occasional users.

Members might be commuting or local residents going to/from suburban areas.




---
# E)
---


---
Task 1
---

In [None]:
# List the columns available in the sample before building the weather plots.
sampled_df.columns

In [None]:
# Distinct bike types present in the sample.
sampled_df['rideable_type'].unique()

In [None]:
# Daily mean of temperature, humidity and wind speed over the sampled trips,
# with the columns renamed to the labels used in the chart legend.
weather_metric_names = {
    'temp': 'Average Temperature',
    'humidity': 'Average Humidity',
    'windspeed': 'Average Wind Speed',
}
daily_weather_avg = (
    sampled_df.groupby('date')[list(weather_metric_names)]
    .mean()
    .reset_index()
    .rename(columns=weather_metric_names)
)

# Passing a list as `y` draws one line per metric. Plotly Express calls the
# melted columns 'value' and 'variable', which the labels below rename.
fig = px.line(
    daily_weather_avg,
    x='date',
    y=list(weather_metric_names.values()),
    title='Average Daily Weather Conditions (Temperature, Humidity, Wind Speed)',
    labels={'date': 'Date', 'value': 'Average Value', 'variable': 'Metric'},
)
fig.update_layout(hovermode="x unified")  # one shared tooltip for all lines
fig.show()



---
Task2
---

In [None]:
# Daily revenue distribution per weather condition.

# Weather segment of each day: the first value seen that day
daily_weather_cond = sampled_df.groupby('date')['weather_segment'].first().reset_index()

# Revenue of each day: sum of trip_cost
daily_rev = sampled_df.groupby(sampled_df['date'])['trip_cost'].sum().reset_index(name='revenue')

merged_df = pd.merge(daily_rev, daily_weather_cond, on='date', how='left')

# `labels` and `category_orders` are keyed by the dataframe's column names.
# The earlier keys ('weather_condition', 'daily_revenue') matched no column of
# merged_df, so both arguments were silently ignored.
# The order lists the same segment names used by the ride-type chart in Task4.
fig = px.box(
    merged_df,
    x='weather_segment',  # Categorical variable on x-axis
    y='revenue',      # Numerical variable on y-axis
    title='Daily Revenue by Weather Condition',
    labels={
        'weather_segment': 'Weather Condition',
        'revenue': 'Daily Revenue ($)'
    },
    category_orders={"weather_segment": ["Sunny", "Cloudy", "Rainy"]}
)
fig.update_traces(boxpoints='all', jitter=0.3) # Show individual points for more detail

fig.show()



*   نلاحظ بالنسبة للايام الماطرة يقع وسطيا الايرادات عند مايقارب 650 دولار وهو  اقل  متوسط من جميع حالات طقس رغم وجود عدد كبير من الايام ماطرة ما يقارب 55 بالمئة من الأيام هي ماطرة ونلاحظ مدى توسع الصندوق وهذا يشير الى تقلب كبير في الايرادات في الأيام الماطرة وملاحظ هذا حيث لدينا ايام الايرادات تقارب الصفر وبعض متجاوزة الالف واعتقد يعود السبب الى القيم القريبة الى الصفر هي الايام ذو امطار شديدة وهذا منطقي من الصعب عندها ركوب الدراجات اما بالنسبة للقيم العالية وارد ان بعض الايام الممطرة تكون مقبولة وهذا يعود الى بعض انواع المستخدمين التي تستمع في ذلك او بسبب الحاجة العاجلة للدراجة بدل الانتظار
*   نلاحظ بالنسبة للايام الغائمة مرتفع وسطي الايرادات لما يقارب 800 دولار اكثر من الايام الماطرة ونلاحظ انه يوجد استقرار وليس تقلب بالايرادات وايضا الاتجاه الايرادات في ايام الغائمة اما بازدياد او استقرار ونلاحظ قفزات عالية جدا في الايرادات وارد ذلك عند درجات الحرارة المعتدلة اما بالنسبة للقيم المتدنية جدا فهي اما بشهر الرابع او انها كانت ايام عطل

* رغم قلة الايام المشمسة الا اننا نجد ان الناس تتجه لاستخدام الدراجات وهي اعلى متوسط دخل وملاحظ ان الناس في الايام المشمسة تميل الى استخدام الدراجات وقد يعود ذلك بسبب قلة الايام المشمسة الموجود فالناس تحب التعرض للشمس لذلك تفضل عندها استخدام الدراجات بالاضافة ان الجو يكون جيد

* حيث نستنتج تأثير الطقس على سلوك الركاب يظهر بوضوح كيف تؤثر حالة الطقس بشكل مباشر على الإيرادات اليومية، حيث يفضل الناس استخدام الدراجات في الطقس المعتدل والمشمس، مما يؤدي إلى زيادة الإيرادات، في حين أن الأيام الممطرة العكس اقل مستخدمين وايرادات اقل

* لكن وجود القيم الشاذة في جميع الفئات يدل على أن هناك دائمًا بعض الأيام التي لا تتبع النمط العام للطقس، سواء كانت جيدة بشكل استثنائي أو سيئة بشكل استثنائي حيث عندها اتوقع يوجد امور اخرى  تدخل عندها



---
Task3
---

In [None]:
# Relationship between daily revenue and daily weather averages, on min-max
# normalized values: (x - min) / (max - min).
# Trendline options: 'lowess', 'rolling', 'ewm', 'expanding', 'ols'.

merg = pd.merge(daily_weather_avg,daily_rev,on='date',how='left')

cols_to_normalize = ['revenue', 'Average Temperature', 'Average Humidity']
for col in cols_to_normalize:
    min_val = merg[col].min()
    max_val = merg[col].max()
    # Avoid division by zero if all values are the same
    if (max_val - min_val) != 0:
        merg[f'normalized_{col}'] = (merg[col] - min_val) / (max_val - min_val)
    else: # If all values are the same, normalized value is 0 (or 1, depends on convention)
        merg[f'normalized_{col}'] = 0.0

# `labels` is keyed by the plotted column names. The earlier keys
# ('Temperature', 'Humidity', 'daily_revenue') matched no plotted column and
# were ignored. The axes show normalized values in [0, 1], so the revenue
# label carries no dollar unit.
fig1 = px.scatter(merg,x='normalized_Average Temperature',y='normalized_revenue',
                 title="relationship between daily income and temperature",trendline='ols',
                 labels={
                   'normalized_Average Temperature': 'Normalized Average Daily Temperature',
                   'normalized_revenue': 'Normalized Daily Revenue' }
                 )
fig1.show()


fig2 = px.scatter(merg,x='normalized_Average Humidity',y='normalized_revenue',
                 title="relationship between daily income and Humidity",trendline='ols',
                 labels={
                   'normalized_Average Humidity': 'Normalized Average Daily Humidity',
                   'normalized_revenue': 'Normalized Daily Revenue' }
                 )
fig2.show()


* بالنسبة للعلاقة بين الايرادات اليومية ودرجة الحرارة نلاحظ وجود علاقة ارتباط خطي ايجابية حيث في درجات الحرارة المنخفضة (-5 - 3) نرى انخفاض في الايرادات ثم مع ازدياد درجة الحرارة نلاحظ انها تزداد الايرادات الى ان تصل الى حد معين ثم تبدأ بالنزول حيث ازدياد درجة الحرارة الى درجة ما وهي 16 يؤدي الى ازدياد الايرادات ولكن بعدها نرى ان ازدياد درجة الحرارة سيؤدي الى انخفاض في الايرادات


* بالنسبة للارتباط الخطي بين الايرادات والرطوبة لا يوجد علاقة ارتباط خطي حيث نلاحظ عند رطوبة منخفضة لدينا ايرادات مرتفعة وايرادات منخفضة والامر نفسه على باقي القيم اي عندما تكون الرطوبة متوسطة او حتى عالية لدينا الايرادات مرات تكون منخفضة ومرات تكون عالية

# Task4

In [None]:
  # 1. Create the Contingency Table
# Chi-square test of independence between weather segment and ride type,
# followed by a grouped bar chart of the observed counts.
# The contingency table holds the observed frequencies (counts) of each unique
# combination of weather segment and ride type.
# Rows: weather_segment
# Columns: rideable_type
contingency_table = pd.crosstab(sampled_df['weather_segment'], sampled_df['rideable_type'])
print("Contingency Table (Observed Frequencies):")
print(contingency_table)
print("\n" + "="*50 + "\n") # Visual separator in output

# 2. Perform the Chi-Square Test
# chi2_contingency returns four values:
#   - chi2: The calculated Chi-Square statistic.
#   - p_value: The probability value (most important for interpretation).
#   - dof: Degrees of freedom.
#   - expected_frequencies: A 2D array of expected frequencies if the variables were independent.
chi2, p_value, dof, expected_frequencies = chi2_contingency(contingency_table)

print(f"Chi2 Statistic: {chi2:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Degrees of Freedom: {dof}")
print("\nExpected Frequencies Table:")

# Show the expected frequencies as a DataFrame with the same rows and columns
# as the observed contingency table, so the two can be compared cell by cell.
print(pd.DataFrame(expected_frequencies, index=contingency_table.index, columns=contingency_table.columns))
print("\n" + "="*50 + "\n") # Another visual separator


# 3. Interpret the Results
# alpha is the significance level the p_value is compared against.
# A common alpha level is 0.05 (or 5%).
alpha = 0.05
print("Interpretation of Results:")
if p_value < alpha:
    # If the p-value is less than alpha, we reject the null hypothesis.
    # The null hypothesis (H0) here is: There is no relationship between weather condition and ride type.
    print(f"Since the P-value ({p_value:.4f}) is less than the significance level (alpha = {alpha}),")
    print("we reject the null hypothesis (H0).")
    print("Conclusion: There is strong statistical evidence of a significant relationship between weather condition and ride type.")
    print("In other words, it appears that the distribution of ride types (or bike types) differs depending on the weather condition.")
    print("\n* To understand this relationship further, compare the observed frequencies with the expected frequencies to identify which categories contribute most to the association.")
else:
    # If the p-value is greater than or equal to alpha, we fail to reject the null hypothesis.
    print(f"Since the P-value ({p_value:.4f}) is greater than or equal to the significance level (alpha = {alpha}),")
    print("we fail to reject the null hypothesis (H0).")
    print("Conclusion: There is no sufficient statistical evidence to claim a significant relationship between weather condition and ride type.")
    print("In other words, it appears that the choice of ride type (or bike type) is not significantly affected by the weather condition, or any observed differences could be due to random chance.")


# Long format for plotting: one row per (weather segment, ride type) with its count
df_plot = contingency_table.reset_index().melt(id_vars='weather_segment', var_name='rideable_type', value_name='Count')

# 2. Draw a Grouped Bar Chart
# `labels` is keyed by column name. The legend column is 'rideable_type'; the
# earlier key 'Ride Type' matched no column, so the legend title was never renamed.
fig = px.bar(
    df_plot,
    x='weather_segment',  # X-axis will be weather conditions
    y='Count',            # Y-axis will be the number of rides
    color='rideable_type',    # Different bars for each ride type within each weather condition
    barmode='group',      # This makes the bars for each ride_type stand side-by-side
    title='Ride Type Distribution by Weather Condition',
    labels={
        'weather_segment': 'Weather Condition',
        'Count': 'Number of Rides',
        'rideable_type': 'Ride Type'
    },
    category_orders={"weather_segment": ["Sunny", "Cloudy", "Rainy"]} # Optional: ensure specific order
)

fig.update_layout(xaxis_title="Weather Condition", yaxis_title="Number of Rides")
fig.show()
