In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def run_road_traffic_analysis():
    """
    Performs a mock road traffic data analysis and prints results to the console.

    The function is self-contained: it seeds NumPy's global random state with
    42, synthesises 200 traffic records, injects 10 NaN values into each of the
    two numeric measurement columns, cleans the data, and then prints a series
    of descriptive and grouped statistics (by hour, weekday, road segment and
    weather condition) followed by a correlation matrix.

    Returns:
        None. All results are written to standard output.

    Side effects:
        Reseeds ``np.random`` (global state) and prints to stdout.
    """
    print("--- Road Traffic Data Analysis Project ---")

    # --- 1. Generate Sample Data ---
    print("\n--- 1. Generating Sample Data ---")
    # Seed for reproducibility
    np.random.seed(42)

    # Create a more varied time range for timestamps
    start_time = datetime(2023, 10, 1, 0, 0, 0)
    timestamps_list = []
    for i in range(200):  # Increased sample size for more interesting aggregates
        # One record per hour slot; the day advances every 24 records.
        days_offset, hour_of_day = divmod(i, 24)
        # NOTE: minute is drawn before second on purpose -- the order of the
        # random draws determines the generated data set.
        minute_offset = np.random.randint(0, 60)
        second_offset = np.random.randint(0, 60)
        current_time = start_time + timedelta(
            days=days_offset,
            hours=hour_of_day,
            minutes=minute_offset,
            seconds=second_offset,
        )
        timestamps_list.append(current_time.strftime('%Y-%m-%d %H:%M:%S'))

    data = {
        'timestamp_str': timestamps_list,
        'road_segment_id': np.random.choice(['North_Main_St', 'South_Highway_101', 'East_Avenue_X', 'West_Bridge_Rd'], size=200),
        # Vehicles per interval. Stored as float from the start so that NaN can
        # be written into the column below without relying on an implicit
        # int -> float upcast (newer pandas versions warn about or reject it).
        'traffic_volume': np.random.randint(50, 500, 200).astype(float),
        'average_speed_kmh': np.random.uniform(15, 90, 200),
        'weather_condition': np.random.choice(['Clear', 'Rainy', 'Cloudy', 'Foggy'], size=200, p=[0.6, 0.15, 0.2, 0.05])
    }
    df = pd.DataFrame(data)

    # Introduce some missing values for demonstration
    for col in ['traffic_volume', 'average_speed_kmh']:
        idx_to_nan = np.random.choice(df.index, size=10, replace=False)
        df.loc[idx_to_nan, col] = np.nan

    print(f"Sample DataFrame created with {len(df)} rows and {len(df.columns)} columns.")
    print("It includes 'timestamp_str', 'road_segment_id', 'traffic_volume', 'average_speed_kmh', 'weather_condition'.")
    print("Some NaN values have been intentionally introduced for demonstration.\n")

    # --- 2. Initial Data Inspection ---
    print("\n--- 2. Initial Data Inspection ---")
    print("\nDataFrame Head (First 5 rows):")
    print(df.head())

    print("\nDataFrame Tail (Last 5 rows):")
    print(df.tail())

    print("\nDataFrame Info (dtypes, non-null counts, memory usage):")
    df.info(verbose=True)  # info() prints directly; verbose=True lists every column

    print("\nMissing Values per Column (Before Cleaning):")
    print(df.isnull().sum())
    print("\n")

    # --- 3. Data Cleaning & Preprocessing ---
    print("\n--- 3. Data Cleaning & Preprocessing ---")

    # Convert 'timestamp_str' to datetime objects
    df['timestamp'] = pd.to_datetime(df['timestamp_str'])
    print("Converted 'timestamp_str' to datetime 'timestamp' column.")

    # Extract date/time features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week_num'] = df['timestamp'].dt.dayofweek  # Monday=0, Sunday=6
    df['day_of_week_name'] = df['timestamp'].dt.day_name()
    df['month'] = df['timestamp'].dt.month
    df['is_weekend'] = df['day_of_week_num'].isin([5, 6])  # Saturday=5, Sunday=6
    print("Extracted features: 'hour', 'day_of_week_num', 'day_of_week_name', 'month', 'is_weekend'.")

    # Handle Missing Values
    # The filled column is assigned back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place form mutates a
    # temporary selection, which is deprecated and does not update ``df``
    # when pandas Copy-on-Write is active.
    # For 'traffic_volume', fill with median
    median_traffic_volume = df['traffic_volume'].median()
    df['traffic_volume'] = df['traffic_volume'].fillna(median_traffic_volume)
    print(f"\nFilled NaN in 'traffic_volume' with column median: {median_traffic_volume:.2f}")

    # For 'average_speed_kmh', fill with mean
    mean_avg_speed = df['average_speed_kmh'].mean()
    df['average_speed_kmh'] = df['average_speed_kmh'].fillna(mean_avg_speed)
    print(f"Filled NaN in 'average_speed_kmh' with column mean: {mean_avg_speed:.2f}")

    print("\nMissing Values per Column (After Cleaning):")
    print(df.isnull().sum())

    # Drop original string timestamp column as it's no longer needed
    df = df.drop(columns='timestamp_str')
    print("\nDropped original 'timestamp_str' column.")

    print("\nDataFrame Info After Preprocessing:")
    df.info(verbose=True)
    print("\nUpdated DataFrame Head (First 5 rows with new features):")
    print(df[['timestamp', 'hour', 'day_of_week_name', 'traffic_volume', 'average_speed_kmh', 'is_weekend']].head())
    print("\n")

    # --- 4. Exploratory Data Analysis (EDA) & Output ---
    print("\n--- 4. Exploratory Data Analysis (EDA) ---")

    print("\nDescriptive Statistics for Numerical Columns:")
    # Formatting for better readability
    print(df.describe().to_string(float_format="%.2f"))

    print("\nDescriptive Statistics for Categorical Columns (and boolean):")
    print(df.describe(include=['object', 'bool']).to_string())

    print("\nValue Counts for 'road_segment_id':")
    print(df['road_segment_id'].value_counts().to_string())

    print("\nValue Counts for 'weather_condition':")
    print(df['weather_condition'].value_counts().to_string())

    print("\nValue Counts for 'is_weekend':")
    print(df['is_weekend'].value_counts().to_string())
    print("\n")

    # Average traffic volume and speed by hour of day
    print("--- Analysis by Hour of Day ---")
    hourly_analysis = df.groupby('hour').agg(
        avg_traffic_volume=('traffic_volume', 'mean'),
        median_traffic_volume=('traffic_volume', 'median'),
        avg_speed_kmh=('average_speed_kmh', 'mean'),
        num_records=('hour', 'count')
    ).sort_index()
    print("Average & Median Traffic Volume, Average Speed, and Record Count by Hour:")
    print(hourly_analysis.to_string(float_format="%.2f"))
    print("(Note: For visualization, this data could be used for line plots or bar charts.)\n")

    # Identify peak hours (e.g., top 3 hours with highest average traffic volume)
    print("Top 3 Peak Traffic Hours (based on average volume):")
    peak_hours = hourly_analysis.sort_values(by='avg_traffic_volume', ascending=False).head(3)
    print(peak_hours.to_string(float_format="%.2f"))
    print("\n")

    # Average traffic volume by day of the week
    print("--- Analysis by Day of the Week ---")
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # reindex() puts the rows in calendar order; any weekday absent from the
    # data becomes an all-NaN row, which dropna() then removes.
    daily_analysis = df.groupby('day_of_week_name').agg(
        avg_traffic_volume=('traffic_volume', 'mean'),
        avg_speed_kmh=('average_speed_kmh', 'mean')
    ).reindex(days_order).dropna()
    print("Average Traffic Volume and Speed by Day of the Week:")
    print(daily_analysis.to_string(float_format="%.2f"))
    print("(Note: For visualization, this data could be used for a bar chart.)\n")

    # Analysis by road segment
    print("--- Analysis by Road Segment ---")
    segment_analysis = df.groupby('road_segment_id').agg(
        avg_traffic_volume=('traffic_volume', 'mean'),
        median_traffic_volume=('traffic_volume', 'median'),
        avg_speed_kmh=('average_speed_kmh', 'mean'),
        min_speed_kmh=('average_speed_kmh', 'min'),
        max_speed_kmh=('average_speed_kmh', 'max')
    ).sort_values(by='avg_traffic_volume', ascending=False)
    print("Traffic and Speed Statistics by Road Segment:")
    print(segment_analysis.to_string(float_format="%.2f"))
    print("\n")

    # Analysis by weather condition
    print("--- Analysis by Weather Condition ---")
    weather_analysis = df.groupby('weather_condition').agg(
        avg_traffic_volume=('traffic_volume', 'mean'),
        avg_speed_kmh=('average_speed_kmh', 'mean'),
        num_occurrences=('weather_condition', 'count')
    ).sort_values(by='avg_traffic_volume', ascending=False)
    print("Traffic and Speed Statistics by Weather Condition:")
    print(weather_analysis.to_string(float_format="%.2f"))
    print("\n")

    # Correlation matrix for numerical features
    print("--- Correlation Analysis ---")
    numerical_cols = df.select_dtypes(include=np.number).columns
    # Exclude day_of_week_num and month if they are not intended for direct correlation in this context,
    # or ensure they are treated as potentially cyclical if relevant for advanced analysis.
    # For this example, we'll include them as simple numerical features.
    correlation_matrix = df[numerical_cols].corr()
    print("Correlation Matrix for Numerical Features (including extracted date/time parts):")
    print(correlation_matrix.to_string(float_format="%.2f"))
    print("(Note: This matrix can be visualized as a heatmap to easily spot strong correlations.)\n")

    # Example: Relationship between traffic volume and average speed
    volume_speed_corr = df['traffic_volume'].corr(df['average_speed_kmh'])
    print(f"Correlation between Traffic Volume and Average Speed: {volume_speed_corr:.2f}")
    print("(A negative correlation is often expected: higher volume, lower speed.)")

    print("\n--- End of Analysis ---")
    print("\nTo visualize these results, libraries like Matplotlib, Seaborn, or Plotly would typically be used.")
    print("For example, hourly_analysis could be plotted as a line graph,")
    print("and daily_analysis or segment_analysis as bar charts.")
    print("The correlation matrix is often visualized as a heatmap.")

if __name__ == '__main__':
    # Relax pandas' console display limits so the printed DataFrames are not
    # truncated, and render floats with two decimal places.
    display_settings = {
        'display.max_rows': 200,                    # Show up to 200 rows
        'display.max_columns': 20,                  # Show up to 20 columns
        'display.width': 120,                       # Console width used when printing
        'display.float_format': '{:.2f}'.format,    # Two-decimal float rendering
    }
    for option_name, option_value in display_settings.items():
        pd.set_option(option_name, option_value)

    run_road_traffic_analysis()

--- Road Traffic Data Analysis Project ---

--- 1. Generating Sample Data ---
Sample DataFrame created with 200 rows and 5 columns.
It includes 'timestamp_str', 'road_segment_id', 'traffic_volume', 'average_speed_kmh', 'weather_condition'.
Some NaN values have been intentionally introduced for demonstration.


--- 2. Initial Data Inspection ---

DataFrame Head (First 5 rows):
         timestamp_str road_segment_id  traffic_volume  average_speed_kmh weather_condition
0  2023-10-01 00:38:51   East_Avenue_X          339.00                NaN             Clear
1  2023-10-01 01:28:14   East_Avenue_X          145.00              67.57             Clear
2  2023-10-01 02:42:07   North_Main_St          175.00              20.46             Clear
3  2023-10-01 03:20:38  West_Bridge_Rd          167.00              76.64             Clear
4  2023-10-01 04:57:18   North_Main_St           97.00              67.97             Clear

DataFrame Tail (Last 5 rows):
           timestamp_str    road_segme