In [10]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Load the dataset
df = pd.read_csv('traffic_data.csv')

# Step 2: Data Cleaning
# Ensure 'date' column is in datetime format
df['date'] = pd.to_datetime(df['date'], errors='coerce')
# Convert 'session_duration' to numeric and handle errors (invalid data will be set to NaN)
df['session_duration'] = pd.to_numeric(df['session_duration'], errors='coerce')

# Drop rows with missing session duration or invalid dates
df = df.dropna(subset=['session_duration', 'date'])

# Step 3: Basic Analysis

# 1. Total Traffic (number of page views)
total_traffic = df['page'].count()
print(f"Total Traffic (Page Views): {total_traffic}")

# 2. Total Users (unique user count)
total_users = df['user_id'].nunique()
print(f"Total Users: {total_users}")

# 3. Popular Pages (Top 5 pages by page views)
popular_pages = df['page'].value_counts().head(5)
print(f"Popular Pages:\n{popular_pages}")

# 4. Bounce Rate (percentage of users who bounce)
total_bounces = df['bounce'].sum()
bounce_rate = (total_bounces / total_traffic) * 100
print(f"Bounce Rate: {bounce_rate:.2f}%")

# 5. Average Session Duration
avg_session_duration = df['session_duration'].mean()
print(f"Average Session Duration: {avg_session_duration:.2f} seconds")

# 6. Traffic over Time (daily page views)
daily_traffic = df.groupby(df['date'].dt.date)['page'].count()
print(f"Traffic Over Time (Daily):\n{daily_traffic}")

# Step 4: Data Visualization (Optional)

# Plot: Total Traffic per Day
plt.figure(figsize=(10, 6))
daily_traffic.plot(kind='line', color='blue')
plt.title('Website Traffic Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Page Views')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Plot: Popular Pages
plt.figure(figsize=(8, 5))
popular_pages.plot(kind='bar', color='green')
plt.title('Top 5 Popular Pages')
plt.xlabel('Page')
plt.ylabel('Number of Views')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Additional Plot: Bounce Rate by Day (Optional)
bounce_by_day = df.groupby(df['date'].dt.date)['bounce'].mean() * 100
plt.figure(figsize=(10, 6))
bounce_by_day.plot(kind='line', color='red')
plt.title('Bounce Rate Over Time')
plt.xlabel('Date')
plt.ylabel('Bounce Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()




KeyError: 'date'