In [17]:
from pathlib import Path

import numpy as np
import pandas as pd

In [18]:
# Load the traffic CSV (path is relative to the kernel's working directory)
# into the frame that the later cells clean and re-save.
df = pd.read_csv(filepath_or_buffer="traffic_modified.csv")

In [19]:
# info() prints its summary itself, so it can run first. head() has to be the
# cell's final expression: a notebook only renders the value of the last
# expression, so a head() call placed earlier is evaluated and then discarded.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Timestamp             200 non-null    object 
 1   Location              200 non-null    object 
 2   Vehicle_Count         200 non-null    int64  
 3   Vehicle_Speed         200 non-null    float64
 4   Congestion_Level      200 non-null    object 
 5   Peak_Off_Peak         200 non-null    object 
 6   Target_Vehicle_Count  200 non-null    int64  
 7   Latitude              200 non-null    float64
 8   Longitude             200 non-null    float64
 9   Traffic_Status        200 non-null    object 
 10  Weather               200 non-null    object 
 11  Road_Type             200 non-null    object 
 12  Date                  200 non-null    object 
 13  Hour                  200 non-null    int64  
 14  Day                   200 non-null    object 
 15  Peak_Category         2

In [20]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

Timestamp               0
Location                0
Vehicle_Count           0
Vehicle_Speed           0
Congestion_Level        0
Peak_Off_Peak           0
Target_Vehicle_Count    0
Latitude                0
Longitude               0
Traffic_Status          0
Weather                 0
Road_Type               0
Date                    0
Hour                    0
Day                     0
Peak_Category           0
Vehicle_Type            0
dtype: int64

In [21]:
# ---------------------------------------------------
# 4) CHECK MISSING VALUES
# ---------------------------------------------------
# Report the per-column count of missing entries before imputing anything.
null_counts = df.isnull().sum()
print("Missing Values:\n", null_counts)

# Numeric columns (int64 / float64): fill gaps with each column's median.
num_cols = df.select_dtypes(include=['int64','float64']).columns
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)

# Text (object) columns: fill gaps with each column's most frequent value.
# mode() can return several rows when values tie, so only row 0 is used.
cat_cols = df.select_dtypes(include=['object']).columns
most_frequent = df[cat_cols].mode().iloc[0]
df[cat_cols] = df[cat_cols].fillna(most_frequent)

print("\nMissing values handled!")


Missing Values:
 Timestamp               0
Location                0
Vehicle_Count           0
Vehicle_Speed           0
Congestion_Level        0
Peak_Off_Peak           0
Target_Vehicle_Count    0
Latitude                0
Longitude               0
Traffic_Status          0
Weather                 0
Road_Type               0
Date                    0
Hour                    0
Day                     0
Peak_Category           0
Vehicle_Type            0
dtype: int64

Missing values handled!


In [22]:
# ---------------------------------------------------
# 5) CONVERT TIMESTAMP TO DATETIME
# ---------------------------------------------------
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
parsed_timestamps = pd.to_datetime(df['Timestamp'], errors='coerce')
df['Timestamp'] = parsed_timestamps

# Discard the rows whose timestamp is missing after the coercion above.
df = df.dropna(subset=['Timestamp'])

# Derive the calendar columns from the parsed timestamps:
# the date, the hour of day, and the weekday name.
timestamp_parts = df['Timestamp'].dt
df['Date'] = timestamp_parts.date
df['Hour'] = timestamp_parts.hour
df['Day'] = timestamp_parts.day_name()

print("Timestamp conversion completed!")


Timestamp conversion completed!


In [23]:
# ---------------------------------------------------
# 6) REMOVE OUTLIERS IN VEHICLE SPEED
# ---------------------------------------------------
# A row is kept only when its speed lies strictly between the two bounds, so
# rows at exactly 10 or 120, and rows with a missing speed, are dropped too.
MIN_SPEED = 10
MAX_SPEED = 120
speed_in_range = (df['Vehicle_Speed'] > MIN_SPEED) & (df['Vehicle_Speed'] < MAX_SPEED)

# .copy() makes the filtered result an independent frame, so the in-place
# edits made by later cells cannot raise a chained-assignment
# (SettingWithCopy) warning against a slice of the previous frame.
df = df[speed_in_range].copy()

print("Outliers removed successfully!")


Outliers removed successfully!


In [24]:
# ---------------------------------------------------
# 7) REMOVE DUPLICATES
# ---------------------------------------------------
# Rows are duplicates when both Timestamp and Location match; the first
# occurrence is kept (the drop_duplicates default). The result is rebound to
# df rather than applied with inplace=True, which avoids mutating a frame
# that may itself be a filtered slice of an earlier one.
df = df.drop_duplicates(subset=['Timestamp', 'Location'])

print("Duplicates removed!")


Duplicates removed!


In [25]:
# ---------------------------------------------------
# 8) CREATE NEW CONGESTION STATUS (BASED ON SPEED)
# ---------------------------------------------------
def congestion_status(speed, high_below=30, medium_below=50):
    """Classify a vehicle speed into a congestion label.

    Parameters
    ----------
    speed : number
        The speed to classify.
    high_below : number, default 30
        Speeds strictly below this value are labelled 'High'.
    medium_below : number, default 50
        Speeds at or above ``high_below`` but strictly below this value are
        labelled 'Medium'.

    Returns
    -------
    str
        'High', 'Medium', or 'Low'. A NaN speed fails both comparisons and
        therefore comes back as 'Low'.
    """
    if speed < high_below:
        return 'High'
    if speed < medium_below:
        return 'Medium'
    return 'Low'

# Label every row from its speed; any values already stored in
# Congestion_Level are replaced.
speed_labels = df['Vehicle_Speed'].map(congestion_status)
df['Congestion_Level'] = speed_labels

print("Congestion level added!")


Congestion level added!


In [26]:
# ---------------------------------------------------
# 9) PEAK / OFF-PEAK CLASSIFICATION
# ---------------------------------------------------
def peak_time(h):
    """Return "Peak" for hours 7-10 or 17-20 (bounds inclusive), else "Off-Peak"."""
    # Morning window.
    if 7 <= h <= 10:
        return "Peak"
    # Evening window.
    if 17 <= h <= 20:
        return "Peak"
    return "Off-Peak"

# Tag every row as Peak / Off-Peak from its Hour value.
hour_labels = df['Hour'].map(peak_time)
df['Peak_Category'] = hour_labels

print("Peak category added!")


Peak category added!


In [27]:
# ---------------------------------------------------
# 10) SAVE CLEANED DATASET
# ---------------------------------------------------
# NOTE: this is the same path the notebook loads its data from, so saving
# here replaces the input file with the cleaned rows.
output_path = "traffic_modified.csv"
df.to_csv(output_path, index=False)

# Print the path that was actually written so the message always names the
# real file.
print("Cleaned file saved as:", output_path)



Cleaned file saved as:traaffic_modified.csv


In [28]:
# Cycle the four vehicle types over however many rows df currently holds.
# A fixed 200-item list raises a length-mismatch ValueError as soon as any
# cleaning step has removed a row; for exactly 200 rows the result is the
# same repeating Car/Bus/Truck/Bike pattern.
vehicle_types = ["Car", "Bus", "Truck", "Bike"]
vehicle_list = [vehicle_types[i % len(vehicle_types)] for i in range(len(df))]
df['Vehicle_Type'] = vehicle_list

In [29]:
import os

# Show the kernel's current working directory, which is where relative file
# names such as "traffic_modified.csv" are resolved.
working_dir = os.getcwd()
print(working_dir)

C:\Users\rajna


In [30]:
# Write the frame to traffic_modified.csv again, without the index column
# (presumably to persist the Vehicle_Type column added after the earlier save).
df.to_csv(path_or_buf="traffic_modified.csv", index=False)

In [31]:
# Save a copy into the current user's Downloads folder. Path.home() resolves
# the home directory at run time, so no user name or drive letter is baked
# into the notebook and the cell works on other machines.
downloads_copy = Path.home() / "Downloads" / "traffic_modified.csv"
df.to_csv(downloads_copy, index=False)

In [32]:
# Repeats the earlier Vehicle_Type assignment. The list is sized to the
# frame's current row count, because a fixed 200-item list raises a
# length-mismatch ValueError whenever df does not have exactly 200 rows.
vehicle_types = ["Car", "Bus", "Truck", "Bike"]
vehicle_list = [vehicle_types[i % len(vehicle_types)] for i in range(len(df))]
df['Vehicle_Type'] = vehicle_list
