In [3]:
import pandas as pd
from pymongo import MongoClient

def rename_columns(df) -> None:
    """Lower-case the known delivery-dataset column headers of ``df`` in place.

    Only the headers listed below are renamed; any other column keeps its
    current name. The frame is modified directly and nothing is returned.
    """
    original_headers = (
        'Agent_Age',
        'Agent_Rating',
        'Store_Latitude',
        'Store_Longitude',
        'Drop_Latitude',
        'Drop_Longitude',
        'Order_Date',
        'Order_Time',
        'Pickup_Time',
        'Weather',
        'Traffic',
        'Vehicle',
        'Area',
        'Delivery_Time',
        'Category',
    )
    # Each new name is simply the lower-cased form of the original header.
    header_map = {header: header.lower() for header in original_headers}
    df.rename(columns=header_map, inplace=True)

def DatabaseConnection() -> object:
    """Return the 'amazon_delivery' database from a local MongoClient.

    A new ``MongoClient`` for ``mongodb://localhost:27017/`` is created on
    every call; the returned object is that client's ``amazon_delivery``
    database handle.
    """
    return MongoClient('mongodb://localhost:27017/')['amazon_delivery']

def main() -> None:
    """Clean the raw delivery CSV and persist the result to disk and MongoDB.

    Steps, in order:
      1. Read ``amazon_delivery.csv`` and rename the known columns.
      2. Report per-column null counts, then drop rows with missing values.
      3. Report the duplicate-row count, then drop duplicates.
      4. Lower-case every remaining column name.
      5. Print summary statistics, the first rows, and the frame info.
      6. Write ``cleaned_amazon_delivery.csv`` (without the index).
      7. Insert every cleaned row into the ``delivery_stats`` collection.

    If cleaning leaves no rows, the CSV is still written but the MongoDB
    insert is skipped, because ``insert_many`` rejects an empty document list.
    """
    # Load data
    df = pd.read_csv("amazon_delivery.csv")

    # Rename columns
    rename_columns(df)

    # Check for null values
    print("Null values per column:")
    print(df.isnull().sum())

    # Drop rows with missing values
    df.dropna(inplace=True)

    # Check for duplicates and remove them
    print("Number of duplicate rows:", df.duplicated().sum())
    df.drop_duplicates(inplace=True)

    # rename_columns only touches the headers it lists (e.g. Order_ID is left
    # as-is), so lower-case whatever remains.
    df.columns = [col.lower() for col in df.columns]

    # Display summary and initial rows
    print("\nDataFrame Summary:")
    print(df.describe())

    print("\nFirst few rows:")
    print(df.head())

    print("\nDataFrame Info:")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would emit a stray "None" line.
    df.info()

    # Save cleaned data to a new CSV file
    df.to_csv("cleaned_amazon_delivery.csv", index=False)

    # Save data to MongoDB
    records = df.to_dict(orient='records')
    if not records:
        print("No rows left after cleaning; nothing saved to MongoDB.")
        return

    db = DatabaseConnection()
    try:
        db['delivery_stats'].insert_many(records)
    finally:
        # Release the client's sockets even if the insert fails.
        db.client.close()
    print("Data saved to MongoDB.")

# Run the cleaning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Null values per column:
Order_ID            0
agent_age           0
agent_rating       54
store_latitude      0
store_longitude     0
drop_latitude       0
drop_longitude      0
order_date          0
order_time          0
pickup_time         0
weather            91
traffic             0
vehicle             0
area                0
delivery_time       0
category            0
dtype: int64
Number of duplicate rows: 0

DataFrame Summary:
          agent_age  agent_rating  store_latitude  store_longitude  \
count  43594.000000  43594.000000    43594.000000     43594.000000   
mean      29.555719      4.635287       17.244769        70.768898   
std        5.760689      0.313827        7.690005        21.128773   
min       20.000000      2.500000      -30.902872         0.000000   
25%       25.000000      4.500000       12.933298        73.170283   
50%       30.000000      4.700000       18.554382        75.898497   
75%       35.000000      4.900000       22.732225        78.045359   
max