In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import datetime as dt
import os

In [2]:
# Set styles
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12,6)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data_path = '/content/drive/MyDrive/SwiftTraq/Portfolio/012_RV_Van_Rentals_Analytics/data/Raw/'
files = os.listdir(data_path)
datasets = {file.replace('.csv', ''): pd.read_csv(data_path+file) for file in files if file.endswith('.csv')}

In [5]:
for name, _ in datasets.items():
  print(name)


bookings
campaigns
customers
locations
campers
calendar
maintenance_logs
camper_sensor_logs


In [6]:
customers = datasets['customers']
campaigns = datasets['campaigns']
campers = datasets['campers']
calendar = datasets['calendar']
bookings = datasets['bookings']
camper_sensor_logs = datasets['camper_sensor_logs']
maintenance_logs = datasets['maintenance_logs']

##### ---------------------------
##### 🔍 Basic Inspection
##### ---------------------------

In [7]:

def quick_info(name, df):
  print(f"\n\n-- {name} --")
  print(df.shape)
  print(df.dtypes)
  print(df.head(2))


In [8]:
for name, df in datasets.items():
  quick_info(name, df)



-- bookings --
(50000, 15)
booking_id          object
booking_date        object
customer_id         object
camper_id           object
pickup_date         object
rental_days          int64
dropoff_date        object
pickup_location     object
dropoff_location    object
booking_channel     object
base_price           int64
total_price          int64
booking_status      object
addons              object
season              object
dtype: object
  booking_id booking_date customer_id camper_id pickup_date  rental_days  \
0    BK00000   2024-01-04   CUST00536   RV00972  2024-01-08           12   
1    BK00001   2023-01-20   CUST31935   RV00830  2023-02-26            2   

  dropoff_date pickup_location dropoff_location booking_channel  base_price  \
0   2024-01-20          LOC015           LOC063         Walk-in          74   
1   2023-02-28          LOC008           LOC043          Online         129   

   total_price booking_status     addons  season  
0          888      Completed  Bik

##### ---------------------------
##### 🧹 Data Cleaning
##### ---------------------------

In [9]:
# convert dates
bookings.columns
bookings['booking_date'] = pd.to_datetime(bookings['booking_date'])
bookings['pickup_date'] = pd.to_datetime(bookings['pickup_date'])
bookings['dropoff_date'] = pd.to_datetime(bookings['dropoff_date'])
campaigns['start_date'] = pd.to_datetime(campaigns['start_date'])
campaigns['end_date'] = pd.to_datetime(campaigns['end_date'])
calendar['date'] = pd.to_datetime(calendar['date'])

In [10]:
# checking missing values
missing_summary = pd.DataFrame(
    {
        'table': ['customers', 'campers', 'bookings', 'calendar', 'campaigns'],
        'missing_values': [
            customers.isnull().sum().sum(),
            campers.isnull().sum().sum(),
            bookings.isnull().sum().sum(),
            calendar.isnull().sum().sum(),
            campaigns.isnull().sum().sum()
        ]
    }
)

print(missing_summary)

       table  missing_values
0  customers               0
1    campers               0
2   bookings            8472
3   calendar               0
4  campaigns               0


In [11]:
# Replace NaN addons with -- 'No Add-ons'

bookings['addons'] = bookings['addons'].fillna('No Add-ons')

##### ---------------------------
##### 🔗 Merge Key Datasets
##### ---------------------------

In [12]:
bookings_full = bookings.merge(customers, on='customer_id', how='left')
bookings_full = bookings_full.merge(campers, on='camper_id', how='left')
bookings_full = bookings_full.merge(calendar.rename(columns={'date': 'pickup_date'}), on='pickup_date', how='left')

##### ---------------------------
##### 📊 Initial EDA Questions
##### ---------------------------

In [13]:
# 1. Rental trends over time

bookings['month'] = bookings['pickup_date'].dt.to_period('M').astype(str)
rental_trend = bookings.groupby('month').size()
fig = px.line(rental_trend, x=rental_trend.index, y=rental_trend.values, title='🗓️ Monthly Rental Volume')
fig.update_layout(xaxis_title='Month',
                  yaxis_title='Number of Rentals',
                  template='plotly_white')
fig.show()

In [14]:
# 2. Top countries by bookings
bookings_full.columns
top_countries = bookings_full['country'].value_counts().head().reset_index().sort_values(by='count')
fig = px.bar(top_countries, y='country', x='count', title='🌍 Top 10 Customer Countries')
fig.update_layout(xaxis_title='Number of Bookings',
                  yaxis_title='',
                  template='plotly_white')
fig.show()

In [15]:
# 3. Revenue Distribution
fig = px.histogram(bookings, x='total_price', nbins=50,
                   title='💵 Distribution of Booking Revenue')
fig.update_layout(xaxis_title='Total Price',
                  template='plotly_white')
fig.show()

In [16]:
# Booking Lead Time
bookings['lead_time'] = (bookings['pickup_date'] - bookings['booking_date']).dt.days
fig = px.histogram(bookings, x='lead_time', title='🕛 Lead Time before Pick up', nbins=30)
fig.update_layout(
    xaxis_title='Days in Advance',
    template='plotly_white'
)
fig.show()

In [17]:
# clean service_due --> int
campers['service_due'] = campers['service_due'].replace({'Yes': 1, 'No': 0}).astype(int)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [18]:
# Service due rate
service_counts = campers.groupby('type')['service_due'].mean().sort_values(ascending=False).head(10)
fig = px.bar(service_counts, x=service_counts.values, y=service_counts.index,
             title='🔧 Avg. Service Due Rate by Camper Model/Type')
fig.update_layout(xaxis_title='% Needing Service',
                  yaxis_title='',
                  template='plotly_white')
fig.show()

In [19]:
# save merged data for reuse
bookings_full.to_csv('/content/drive/MyDrive/SwiftTraq/Portfolio/012_RV_Van_Rentals_Analytics/data/bookings_full.csv', index=False)

print("\n ✅ EDA Complete -- Data cleaned, merged and ready for next phase")


 ✅ EDA Complete -- Data cleaned, merged and ready for next phase
