In [19]:
# STEP 1 — Import Libraries
import pandas as pd
import requests
import holidays
from sklearn.preprocessing import StandardScaler

In [20]:
# STEP 2 — Load Traffic Dataset
# Explanation: Read the raw traffic CSV, then parse the DateTime column
# (day/month/two-digit-year hour:minute text) into real timestamps.
df = pd.read_csv("C:/Users/om/Downloads/Dataset_Uber Traffic.csv").assign(
    DateTime=lambda frame: pd.to_datetime(frame['DateTime'], format='%d/%m/%y %H:%M')
)

# Report the time span covered by the data.
print("Start:", df['DateTime'].min())
print("End:", df['DateTime'].max())


Start: 2015-11-01 00:00:00
End: 2017-06-30 23:00:00


In [21]:
# STEP 3 — Fetch Weather Data (Open-Meteo)
# Explanation: Collect hourly historical weather data covering the same
# date span as the traffic dataset.
start_date = df['DateTime'].min().strftime("%Y-%m-%d")
end_date = df['DateTime'].max().strftime("%Y-%m-%d")

# Coordinates of the location the weather is requested for.
lat = 19.0948
lon = 74.7480

url = "https://archive-api.open-meteo.com/v1/archive"

params = {
    "latitude": lat,
    "longitude": lon,
    "start_date": start_date,
    "end_date": end_date,
    "hourly": "temperature_2m,precipitation,relativehumidity_2m,windspeed_10m",
    "timezone": "Asia/Kolkata"
}

# requests applies no timeout by default, so a stalled connection would
# hang the kernel indefinitely; bound the wait explicitly.
response = requests.get(url, params=params, timeout=60)
# Fail here on an HTTP error status instead of later with an unrelated
# KeyError when the 'hourly' block is missing from an error payload.
response.raise_for_status()
data = response.json()

# Build the hourly frame and give the columns the names used downstream.
weather = pd.DataFrame(data['hourly']).rename(columns={
    'time': 'DateTime',
    'temperature_2m': 'Temperature',
    'precipitation': 'Precipitation',
    'relativehumidity_2m': 'Humidity',
    'windspeed_10m': 'WindSpeed'
})

# Parse the timestamp strings so they can be joined against df['DateTime'].
weather['DateTime'] = pd.to_datetime(weather['DateTime'])

print("Weather collected successfully!")


Weather collected successfully!


In [22]:
# STEP 4 — Merge Traffic + Weather
# Explanation: Left-join the hourly weather onto every traffic row by
# timestamp, so traffic rows without a weather match are kept (with NaN
# weather values).
merged_df = pd.merge(df, weather, on='DateTime', how='left')
print("Traffic + Weather merged")

# Persist the intermediate result (without the pandas index column).
merged_df.to_csv(
    "D:/Uber_Internship/Uber_Traffic_With_Weather.csv",
    index=False,
)
print(" Weather merged and saved successfully!")


Traffic + Weather merged
 Weather merged and saved successfully!


In [23]:
# STEP 5 — Create Event Data (All Years)
# Explanation: Build a table of Indian public holidays for every calendar
# year that appears in the traffic data.
years = df['DateTime'].dt.year.unique()
india_holidays = holidays.India(years=years)

# One [date, type, name] record per holiday entry.
event_list = [
    [holiday_date, "Public Holiday", holiday_name]
    for holiday_date, holiday_name in india_holidays.items()
]

events_df = pd.DataFrame(event_list, columns=["Date", "Event_Type", "Event_Name"])
# Convert the date values to timestamps so they can be merged on 'Date' later.
events_df['Date'] = pd.to_datetime(events_df['Date'])


In [24]:
# STEP 6 — Merge Event Data
# Explanation: Events are defined per day, so derive a midnight-stamped
# 'Date' key from each hourly timestamp and left-join the events on it.
merged_df['Date'] = pd.to_datetime(merged_df['DateTime'].dt.date)

final_df = pd.merge(merged_df, events_df, on='Date', how='left')

# Rows with no matching event get an explicit "No Event" label.
final_df = final_df.fillna({
    'Event_Type': "No Event",
    'Event_Name': "No Event",
})

# 1 when the row falls on an event day, 0 otherwise.
final_df['Event_Flag'] = final_df['Event_Type'].ne("No Event").astype(int)

print("Events merged successfully!")


Events merged successfully!


In [25]:
# ==========================
# Save Updated Dataset
# ==========================
# Writes to the same path used after the weather merge in STEP 4, so that
# file is overwritten with the weather + events version.

final_df.to_csv(
    path_or_buf="D:/Uber_Internship/Uber_Traffic_With_Weather.csv",
    index=False,
)

print("File updated and saved successfully!")


File updated and saved successfully!


In [26]:
# STEP 7 — Handle Data Quality
# Drop rows that are exact copies of an earlier row (all columns equal),
# keeping the first occurrence and the existing index labels.
final_df = final_df.drop_duplicates(keep='first', ignore_index=False)

In [27]:
# Dimensions (rows, columns) of the raw traffic frame loaded in STEP 2.
df.shape

(48120, 4)

In [28]:
# Dimensions (rows, columns) of the merged frame after de-duplication.
# The row count equals df's when the left merges and drop_duplicates
# neither added nor removed rows.
final_df.shape

(48120, 12)

In [29]:
# Handle missing weather values
# method='time' interpolation needs a DatetimeIndex, so DateTime is moved
# into the index for the duration of this cell.
weather_cols = ['Temperature','Precipitation','Humidity','WindSpeed']

final_df = final_df.set_index('DateTime')

# 1) interpolate gaps, weighting by the time distance between observations;
# 2) back-fill, then forward-fill, whatever is still missing at the edges.
final_df[weather_cols] = (
    final_df[weather_cols]
    .interpolate(method='time')
    .bfill()
    .ffill()
)

# Restore DateTime as an ordinary column.
final_df = final_df.reset_index()

print("Missing weather values handled successfully!")


Missing weather values handled successfully!


In [30]:
# STEP 8 — Add Time Features
# Explanation: Derive calendar features from the DateTime column.
# Is_Weekend is computed from DayOfWeek, which assign() makes available
# because keyword arguments are evaluated in order.
final_df = final_df.assign(
    Year=lambda frame: frame['DateTime'].dt.year,
    Month=lambda frame: frame['DateTime'].dt.month,
    Hour=lambda frame: frame['DateTime'].dt.hour,
    DayOfWeek=lambda frame: frame['DateTime'].dt.day_name(),
    Is_Weekend=lambda frame: frame['DayOfWeek'].isin(['Saturday','Sunday']).astype(int),
)


In [31]:
# STEP 9 — Standardize Numeric Features
# Explanation: Rescale the numeric columns with StandardScaler. The scaled
# values overwrite the original columns in final_df.
# NOTE(review): 'Vehicles' is scaled too and the scaler is fitted on every
# row — if 'Vehicles' is the prediction target or a train/test split follows,
# confirm this is intended.
numeric_cols = ['Vehicles','Temperature','Precipitation',
                'Humidity','WindSpeed']

# Learn per-column statistics first, then apply them to the same columns.
scaler = StandardScaler().fit(final_df[numeric_cols])
final_df[numeric_cols] = scaler.transform(final_df[numeric_cols])


In [32]:
# STEP 10 — Save Final Clean Dataset
# Explanation: Write the unified, cleaned and scaled dataset to CSV
# (without the pandas index column).
final_df.to_csv(
    path_or_buf="D:/Uber_Internship/Uber_Traffic_Final_Cleaned.csv",
    index=False,
)

print("Final cleaned dataset saved successfully!")


Final cleaned dataset saved successfully!
