In [1]:
import os
from dotenv import load_dotenv
load_dotenv()  # Load .env file

import pandas as pd
import pymongo

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [2]:
# df should be converted into dict before we push it to mongodb

data = df.to_dict(orient='records')
# data

In [3]:
# Remove _id from data if it exists (to avoid duplicate key errors)
data = [{k: v for k, v in d.items() if k != '_id'} for d in data]

In [4]:
# Inspect the data before uploading
print(f"Number of records: {len(data)}")
print(f"Sample record: {data[0] if data else 'No data'}")
print(f"DataFrame info:")
df.info()
print(f"Null values per column:\n{df.isnull().sum()}")

Number of records: 381109
Sample record: {'id': 1, 'Gender': 'Male', 'Age': 44, 'Driving_License': 1, 'Region_Code': 28.0, 'Previously_Insured': 0, 'Vehicle_Age': '> 2 Years', 'Vehicle_Damage': 'Yes', 'Annual_Premium': 40454.0, 'Policy_Sales_Channel': 26.0, 'Vintage': 217, 'Response': 1}
DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_

In [5]:
DB_NAME = "vehicle-insurance"
COLLECTION_NAME = "insurance"
CONNECTION_URL = os.getenv('MONGODB_URL')  # Load from .env



# above, either remove your credentials or delete the mongoDB resource bofore pushing it to github.

In [6]:
client = pymongo.MongoClient(CONNECTION_URL)
data_base = client[DB_NAME]
collection = data_base[COLLECTION_NAME]

In [7]:
# Test the connection before uploading
try:
    client.admin.command('ping')
    print("Connected to MongoDB successfully!")
except Exception as e:
    print(f"Connection failed: {e}")
    print("Please check your IP whitelist in MongoDB Atlas or your internet connection.")

Connected to MongoDB successfully!


In [8]:
# Uploading data to MongoDB in batches to avoid timeout
batch_size = 10000
total_inserted = 0
for i in range(0, len(data), batch_size):
    batch = data[i:i + batch_size]
    try:
        rec = collection.insert_many(batch)
        total_inserted += len(rec.inserted_ids)
        print(f"Inserted batch {i//batch_size + 1}: {len(rec.inserted_ids)} documents")
    except Exception as e:
        print(f"Error inserting batch {i//batch_size + 1}: {e}")
        break

print(f"Total documents inserted: {total_inserted}")

Inserted batch 1: 10000 documents
Inserted batch 2: 10000 documents
Inserted batch 3: 10000 documents
Inserted batch 4: 10000 documents
Inserted batch 5: 10000 documents
Inserted batch 6: 10000 documents
Inserted batch 7: 10000 documents
Inserted batch 8: 10000 documents
Inserted batch 9: 10000 documents
Inserted batch 10: 10000 documents
Inserted batch 11: 10000 documents
Inserted batch 12: 10000 documents
Inserted batch 13: 10000 documents
Inserted batch 14: 10000 documents
Inserted batch 15: 10000 documents
Inserted batch 16: 10000 documents
Inserted batch 17: 10000 documents
Inserted batch 18: 10000 documents
Inserted batch 19: 10000 documents
Inserted batch 20: 10000 documents
Inserted batch 21: 10000 documents
Inserted batch 22: 10000 documents
Inserted batch 23: 10000 documents
Inserted batch 24: 10000 documents
Inserted batch 25: 10000 documents
Inserted batch 26: 10000 documents
Inserted batch 27: 10000 documents
Inserted batch 28: 10000 documents
Inserted batch 29: 10000 docu

In [9]:
# Verify the data was inserted
document_count = collection.count_documents({})
print(f"Total documents in collection: {document_count}")

# Sample document from MongoDB
sample_doc = collection.find_one()
print(f"Sample document from MongoDB: {sample_doc}")

Total documents in collection: 1062218
Sample document from MongoDB: {'_id': ObjectId('68caf8f815e70966810a954d'), 'id': 6, 'Gender': 'Female', 'Age': 24, 'Driving_License': 1, 'Region_Code': 33.0, 'Previously_Insured': 0, 'Vehicle_Age': '< 1 Year', 'Vehicle_Damage': 'Yes', 'Annual_Premium': 2630.0, 'Policy_Sales_Channel': 160.0, 'Vintage': 176, 'Response': 0}
