In [None]:
import pandas as pd
from pathlib import Path

def CleanMonthTrips(Year: int, Month: int, RawBaseUrl: str, ProcessDir: Path):
    MonthStr = f'{Month:02d}'
    CsvUrl =f"{RawBaseUrl}/{Year}/{MonthStr}.csv"
    print(f"Procssing: {CsvUrl}")


    df = pd.read_csv(CsvUrl)
    # Converting sec into min
    df['DurationMin'] = df['duration'] / 60
    df['DurationSec'] = df['duration']

    #Converting datetime into utc format
    df['StartedAtRaw'] =pd.to_datetime(df['started_at'], format='ISO8601', utc=True)
    df['EndedAtRaw'] = pd.to_datetime(df['ended_at'], format='ISO8601', utc=True)

    #Converting raw datetime into oslo timezone
    df['StartedAtOslo'] = df['StartedAtRaw'].dt.tz_convert('Europe/Oslo')
    df['EndedAtOslo'] = df['EndedAtRaw'].dt.tz_convert('Europe/Oslo')

    #geting hour from time
    df["StartHourOslo"] = df['StartedAtOslo'].dt.hour

    # Finding the day
    df['DayOfWeek'] = df['StartedAtOslo'].dt.dayofweek

    # finding either its weekday or weekend using dayofweek data. 0: Monday since its using georgian cal!
    df['IsWeekend'] = df['DayOfWeek'].isin([5,6])

    def getTimeOfTheDay(Hour):
        if 5 <= Hour >= 11:
            return 'Morning'
        elif 12 <= Hour <= 17:
            return "Afternoon"
        elif 18 <= Hour <= 23:
            return "Evening"
        else:
            return 'Night'
    
    df['TimeOfDay'] = df['StartHourOslo'].apply(getTimeOfTheDay)

    df = df.rename(columns={
        "start_station_id": "StartStationId",
        "end_station_id": "EndStationId"
    })

    TripsClean = df[[
        'DurationSec',
        'DurationMin',
        'StartedAtRaw',
        'EndedAtRaw',
        'StartedAtOslo',
        'EndedAtOslo',
        "StartHourOslo",
        "DayOfWeek",
        "IsWeekend",
        "TimeOfDay",
        "StartStationId",
        "EndStationId"
    ]].copy()
    # Checking dir is there or not
    ProcessDir.mkdir(parents=True, exist_ok=True)
    # creating output file
    outName = f"Trips_{Year}_{MonthStr}_Clean.csv"
    outPath = ProcessDir / outName
    TripsClean.to_csv(outPath, index=False)
    print(f"Saved {outPath}")



RawBaseUrl = "https://data.urbansharing.com/oslobysykkel.no/trips/v1"
ProcessDir = Path('../data/processed/')

Months = [5,6,7,8,9,10]

for m in Months:
    CleanMonthTrips(2025, m, RawBaseUrl, ProcessDir)




In [6]:
# To combaine all the csvs together
from pathlib import Path
import pandas as pd

processedDir = Path("../data/processed")

# List of months you’ve already cleaned
months = [5, 6, 7, 8, 9, 10]

files = [
    processedDir / f"Trips_2025_{m:02d}_Clean.csv"
    for m in months
]

# Read each file and store DataFrames in a list
dfList = [pd.read_csv(f) for f in files]

# Combine into one big DataFrame
trips_6m = pd.concat(dfList, ignore_index=True)

trips_6m.info()  # just to see row count & columns

outPath = processedDir/ 'TripsCombainedSixMonth.csv'
trips_6m.to_csv(outPath, index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858967 entries, 0 to 858966
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DurationSec     858967 non-null  int64  
 1   DurationMin     858967 non-null  float64
 2   StartedAtRaw    858967 non-null  object 
 3   EndedAtRaw      858967 non-null  object 
 4   StartedAtOslo   858967 non-null  object 
 5   EndedAtOslo     858967 non-null  object 
 6   StartHourOslo   858967 non-null  int64  
 7   DayOfWeek       858967 non-null  int64  
 8   IsWeekend       858967 non-null  bool   
 9   TimeOfDay       858967 non-null  object 
 10  StartStationId  858967 non-null  int64  
 11  EndStationId    858967 non-null  int64  
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 72.9+ MB


In [7]:
# creating Month column!
trips_6m["StartedAtOslo"] = trips_6m["StartedAtOslo"].astype(str)
trips_6m["Month"] = trips_6m["StartedAtOslo"].str[:7]
trips_6m = trips_6m[trips_6m["Month"].between("2025-05", "2025-10")]
trips_6m["Month"].value_counts().sort_index()

Month
2025-05    143394
2025-06    149126
2025-07    145581
2025-08    176454
2025-09    138418
2025-10    105916
Name: count, dtype: int64

In [8]:
# Getting data for stations

import requests
StationUrl = 'https://gbfs.urbansharing.com/oslobysykkel.no/station_information.json'
Header = {'Client-Identifier': 'Shaon-OsloCityBikeProject'}
Response = requests.get(StationUrl, headers=Header)

Data = Response.json()


In [9]:
StationsRaw = pd.json_normalize(Data['data']['stations'])
StationsRaw.head()
StationsRaw.columns

Index(['station_id', 'name', 'address', 'cross_street', 'lat', 'lon',
       'is_virtual_station', 'capacity', 'station_area.type',
       'station_area.coordinates', 'rental_uris.android', 'rental_uris.ios'],
      dtype='object')

In [10]:
Stations = StationsRaw.rename(columns={
    "station_id": "StationId",
    "name": "StationName",
    "address": "StationAddress",
    "lat": "Latitude",
    "lon": "Longitude",
    "capacity": "Capacity"
}).copy()

Stations = Stations[[
    "StationId",
    "StationName",
    "StationAddress",
    "Latitude",
    "Longitude",
    "Capacity"
]]



In [11]:
# To align them with main data "Join" table we need to change their data type

Stations['StationId'] = Stations['StationId'].astype(int)
Stations.info()

trips_6m['StartStationId'] = trips_6m['StartStationId'].astype(int)
trips_6m['EndStationId'] = trips_6m['EndStationId'].astype(int)

# To save CSV

StationsDataPath = processedDir / 'Stations_2025.csv'
Stations.to_csv(StationsDataPath, index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   StationId       252 non-null    int64  
 1   StationName     252 non-null    object 
 2   StationAddress  252 non-null    object 
 3   Latitude        252 non-null    float64
 4   Longitude       252 non-null    float64
 5   Capacity        252 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 11.9+ KB


In [12]:
# Looking for missing data
tripStationIds = set(trips_6m["StartStationId"]).union(set(trips_6m["EndStationId"]))
stationIds     = set(Stations["StationId"])

len(tripStationIds), len(stationIds)
len(tripStationIds - stationIds), len(stationIds - tripStationIds)

(17, 0)

In [None]:
stationIds = set(Stations["StationId"])

mask = (
    trips_6m["StartStationId"].isin(stationIds)
    & trips_6m["EndStationId"].isin(stationIds)
)

TripsClean = trips_6m[mask].copy()

In [14]:
#Sanity check
tripStationIds = set(TripsClean["StartStationId"]).union(set(TripsClean["EndStationId"]))
stationIds     = set(Stations["StationId"])

len(tripStationIds - stationIds), len(stationIds - tripStationIds)

(0, 0)

In [15]:
uniqueStations = Stations[["StationId", "StationName", "Latitude", "Longitude"]].drop_duplicates()
uniqueStations.shape

(252, 4)

In [16]:
import requests
import time

def reverseGeocode(lat, lon):
    url = "https://nominatim.openstreetmap.org/reverse"
    params = {
        "lat": lat,
        "lon": lon,
        "format": "jsonv2",
        "zoom": 16,
        "addressdetails": 1
    }
    headers = {
        "User-Agent": "RifatOsloBikeProject/1.0" 
    }
    resp = requests.get(url, params=params, headers=headers)
    resp.raise_for_status()
    data = resp.json()
    addr = data.get("address", {})
    return (
        addr.get("neighbourhood")
        or addr.get("suburb")
        or addr.get("city_district")
        or addr.get("city")
    )

rawAreas = []

for idx, row in uniqueStations.iterrows():
    lat = row["Latitude"]
    lon = row["Longitude"]
    try:
        areaName = reverseGeocode(lat, lon)
    except Exception as e:
        print(f"Error for StationId {row['StationId']}: {e}")
        areaName = None
    
    rawAreas.append(areaName)
    time.sleep(1)  

uniqueStations["RawAreaName"] = rawAreas
uniqueStations[["StationId", "StationName", "RawAreaName"]].head()
uniqueStations["RawAreaName"].value_counts().head(20)

RawAreaName
Grünerløkka    18
Kvadraturen    16
Frogner        11
Vika           10
Sentrum         9
Majorstuen      8
Grønland        7
Aker brygge     7
Marienlyst      6
Carl Berner     6
Sofienberg      5
Bjølsen         5
Dælenenga       5
Sagene          4
Sjøtomta        4
Sofienlund      4
Tullin          4
Rodeløkka       4
Lovisenberg     4
Enerhaugen      4
Name: count, dtype: int64

In [17]:
Stations = Stations.merge(
    uniqueStations[["StationId", "RawAreaName"]],
    on="StationId",
    how="left"
)


In [19]:
Stations = Stations.rename(columns={'RawAreaName': 'Area'}) 


In [23]:
processedDir = Path("../data/processed")
processedDir.mkdir(parents=True, exist_ok=True)

stationsPath = processedDir / "Stations_WithArea.csv"
Stations.to_csv(stationsPath, index=False)
stationsPath

PosixPath('../data/processed/Stations_WithArea.csv')

In [24]:
tripsWithArea = TripsClean.merge(
    Stations[["StationId", "Area"]],
    left_on="StartStationId",
    right_on="StationId",
    how="left"
)

tripsWithArea[["StartedAtOslo", "StartStationId", "Area", "TimeOfDay"]].head()

Unnamed: 0,StartedAtOslo,StartStationId,Area,TimeOfDay
0,2025-05-01 05:00:18.039000+02:00,583,Galgeberg,Night
1,2025-05-01 05:20:55.936000+02:00,2334,Grünerløkka,Night
2,2025-05-01 05:22:29.190000+02:00,512,Ensjø,Night
3,2025-05-01 05:25:01.463000+02:00,450,Aker brygge,Night
4,2025-05-01 05:37:12.803000+02:00,429,Sofienlund,Night


In [25]:
tripsWithArea["Area"].value_counts(dropna=False).head(20)

Area
Grünerløkka            77979
Kvadraturen            39134
Sentrum                29933
Aker brygge            28431
Frogner                28034
Enerhaugen             27500
Majorstuen             26828
Dælenenga              25029
Bjølsen                22945
Vika                   21833
Grønland               21199
Hausmannskvartalene    20373
Sofienberg             17755
Sagene                 17732
Sjøtomta               17530
Carl Berner            16173
Rodeløkka              15066
Marienlyst             13152
Bjørvika               12611
Ruseløkka              12084
Name: count, dtype: int64

In [None]:
tripsWithArea

In [26]:
# area time with summary

area_time_summary = (
    tripsWithArea
    .groupby(["Area", "TimeOfDay"])
    .agg(
        TripCount   = ("DurationMin", "size"),
        AvgDuration = ("DurationMin", "mean")
    )
    .reset_index()
    .sort_values(["Area", "TripCount"], ascending=[True, False])
)

area_time_summary
                                


Unnamed: 0,Area,TimeOfDay,TripCount,AvgDuration
0,Adamstuen,Morning,2633,12.031219
1,Adamstuen,Night,912,10.692160
2,Aker brygge,Morning,21197,17.371637
3,Aker brygge,Night,7234,10.874270
4,Barcode,Morning,5452,14.585617
...,...,...,...,...
153,Åsen,Night,2218,12.351946
154,Økernly,Morning,796,16.645226
155,Økernly,Night,255,17.011046
156,Øvre Blindern,Morning,6828,13.368898


In [27]:
area_time_weekend = (
    tripsWithArea
    .groupby(["Area", "IsWeekend", "TimeOfDay"])
    .agg(
        TripCount   = ("DurationMin", "size"),
        AvgDuration = ("DurationMin", "mean")
    )
    .reset_index()
)

area_time_weekend.head(20)

Unnamed: 0,Area,IsWeekend,TimeOfDay,TripCount,AvgDuration
0,Adamstuen,False,Morning,1964,11.417015
1,Adamstuen,False,Night,760,10.645197
2,Adamstuen,True,Morning,669,13.834355
3,Adamstuen,True,Night,152,10.926974
4,Aker brygge,False,Morning,15431,16.428949
5,Aker brygge,False,Night,6517,10.288993
6,Aker brygge,True,Morning,5766,19.894465
7,Aker brygge,True,Night,717,16.194003
8,Barcode,False,Morning,4498,13.807796
9,Barcode,False,Night,1196,11.865608


In [None]:
from pathlib import Path
processedDir = Path("/Users/rifat-mac/Study/Data-Analytics-Project/Oslo-city-bike-analytics/data/processed")

trips_sql = TripsClean[[
    "DurationSec",
    "DurationMin",
    "StartedAtOslo",
    "EndedAtOslo",
    "StartHourOslo",
    "DayOfWeek",
    "IsWeekend",
    "TimeOfDay",
    "StartStationId",
    "EndStationId",
    "Month"
]].copy()

trips_sql_path = processedDir / "Trips_SQL_2025_05_10.csv"
trips_sql.to_csv(trips_sql_path, index=False)

stations_sql = Stations[[
    "StationId",
    "StationName",
    "StationAddress",
    "Latitude",
    "Longitude",
    "Capacity",
    "Area"
]].drop_duplicates(subset=["StationId"]).copy()

stations_sql_path = processedDir / "Stations_WithArea_SQL.csv"
stations_sql.to_csv(stations_sql_path, index=False)

trips_sql_path, stations_sql_path