In [1]:
import csv
import json

In [2]:
def create_country_code_map(country_file_path):
    """
    Build a lookup of numeric country codes to country names from a CSV file.

    The CSV is read with ``csv.DictReader`` and must have a header row that
    contains the columns ``country_code`` and ``country``.

    Args:
        country_file_path: Path to the country-code CSV file.

    Returns:
        A dict mapping each country code (as ``int``) to the value of the
        ``country`` column, or ``None`` when the file does not exist or the
        required columns are not present in the header.

    Rows whose ``country_code`` cannot be converted to an integer (including
    truncated rows where the cell is absent) are reported and skipped.
    """
    required_columns = ('country_code', 'country')
    country_map = {}
    try:
        # newline='' is what the csv module expects so that it can handle
        # line endings (including ones embedded in quoted fields) itself.
        with open(country_file_path, mode='r', encoding='utf-8', newline='') as csvfile:
            reader = csv.DictReader(csvfile)

            # Validate the header once, up front, instead of relying on a
            # KeyError from the first data row (which never fires when the
            # file has no data rows).
            fieldnames = reader.fieldnames or []
            if any(column not in fieldnames for column in required_columns):
                print(f"Warning: 'country_code' or 'country' column missing in {country_file_path}.")
                return None  # Indicate critical error

            for row in reader:
                raw_code = row['country_code']
                try:
                    country_code = int(raw_code)
                except (ValueError, TypeError):
                    # TypeError covers short rows: DictReader fills the
                    # missing cell with None, and int(None) is a TypeError.
                    print(f"Warning: Could not parse country code in {country_file_path}: {raw_code}")
                    continue
                country_map[country_code] = row['country']
    except FileNotFoundError:
        print(f"Error: Country code file not found at {country_file_path}")
        return None
    return country_map

In [3]:
def transform_data(data_file_path, country_map, output_file_path):
    """
    Convert a restaurant CSV into a JSON Lines file of MongoDB-style documents.

    Each input row becomes one JSON object on its own line in the output
    file. Rows that fail numeric conversion, or that would serialize to
    non-standard JSON (NaN / Infinity), are reported and skipped; the
    remaining rows are still written.

    Args:
        data_file_path: Path to the input CSV (read with ``csv.DictReader``).
        country_map: Dict mapping int country code -> country name. Codes not
            present in the map are written with the name "Unknown Country".
        output_file_path: Path of the JSON Lines file to create/overwrite.

    Returns:
        The number of documents written, or ``None`` when the transformation
        could not run (no country map, input file missing, input file without
        a header row, or an unexpected error).
    """
    if country_map is None:
        print("Error: Country map is not available. Aborting transformation.")
        return None

    transformed_count = 0
    try:
        # newline='' is what the csv module expects so it can handle line
        # endings (including ones embedded in quoted fields) itself.
        with open(data_file_path, mode='r', encoding='utf-8', newline='') as infile:
            reader = csv.DictReader(infile)

            # An empty file (or a blank first line) yields no field names.
            # Stop before the output file is opened so that an existing
            # output is not truncated by a run that cannot produce anything.
            if not reader.fieldnames:
                print(f"Error: Data file {data_file_path} is empty or has no header row.")
                return None

            # Verify expected columns (adjust based on your actual new_dataset.csv headers)
            expected_headers = [
                'restaurant_id', 'restaurant_name', 'country_code', 'city', 'address',
                'locality', 'locality_verbose', 'longitude', 'latitude', 'cuisines',
                'average_cost_for_two', 'currency', 'has_table_booking', 'has_online_delivery',
                'is_delivering_now', 'switch_to_order_menu', 'price_range', 'aggregate_rating',
                'rating_color', 'rating_text', 'votes'
            ]
            missing_headers = [h for h in expected_headers if h not in reader.fieldnames]
            if missing_headers:
                print(f"Warning: The following expected columns are missing from {data_file_path}: {', '.join(missing_headers)}")
                print("Please ensure your CSV column names match. Script will attempt to proceed but may fail or produce incomplete documents.")

            with open(output_file_path, mode='w', encoding='utf-8') as outfile:
                for row in reader:
                    try:
                        mongo_doc = _build_mongo_doc(row, country_map)
                        # allow_nan=False makes json.dumps raise ValueError for
                        # NaN/Infinity instead of emitting tokens that are not
                        # valid JSON; the row is then skipped by the handler below.
                        # Serialization happens before the write, so a failing
                        # row never leaves a partial line in the output.
                        outfile.write(json.dumps(mongo_doc, allow_nan=False) + '\n')
                        transformed_count += 1
                    except ValueError as ve:
                        print(f"Warning: Skipping row due to data conversion error (ValueError): {ve} - Row: {row}")
                    except KeyError as ke:
                        print(f"Warning: Skipping row due to missing key (KeyError): {ke} - Row: {row}")
                    except Exception as e:
                        print(f"Warning: Skipping row due to unexpected error: {e} - Row: {row}")

        print(f"Successfully transformed {transformed_count} documents.")
        return transformed_count

    except FileNotFoundError:
        print(f"Error: Data file not found at {data_file_path}")
    except Exception as e:
        print(f"An unexpected error occurred during transformation: {e}")
    return None


def _build_mongo_doc(row, country_map):
    """Turn one CSV row (a dict of column name -> string) into the nested document.

    Raises ValueError/TypeError when a numeric column cannot be converted;
    the caller is responsible for catching these and skipping the row.
    """
    country_code_val = int(row.get('country_code', 0))

    # Cuisines arrive as one comma-separated string; split it into a clean
    # list and drop empty fragments.
    cuisines_str = row.get('cuisines', '')
    cuisines_list = [c.strip() for c in cuisines_str.split(',') if c.strip()] if cuisines_str else []

    return {
        # The source restaurant id is kept as a regular field; no "_id" is set here.
        "restaurant_id": int(row.get('restaurant_id')),
        "restaurant_name": row.get('restaurant_name', ''),
        "address": {
            "street": row.get('address', ''),
            "city": row.get('city', ''),
            "locality": row.get('locality', ''),
            "locality_verbose": row.get('locality_verbose', ''),
            "country_code": country_code_val,
            "country_name": country_map.get(country_code_val, "Unknown Country")
        },
        "location": {
            "type": "Point",
            # Order is [longitude, latitude].
            "coordinates": [
                float(row.get('longitude', 0.0)),
                float(row.get('latitude', 0.0))
            ]
        },
        "cuisines": cuisines_list,
        "average_cost_for_two": int(row.get('average_cost_for_two', 0)),
        "rating_details": {
            "aggregate_rating": float(row.get('aggregate_rating', 0.0)),
            "rating_color": row.get('rating_color', ''),
            "rating_text": row.get('rating_text', ''),
            "votes": int(row.get('votes', 0)),
            "price_range": int(row.get('price_range', 0)),
            "currency": row.get('currency', '')
        }
        # has_table_booking, has_online_delivery, is_delivering_now and
        # switch_to_order_menu are not copied into the document. Add them
        # here if they should be part of the MongoDB schema.
    }


In [4]:
# Input and output locations, resolved relative to the notebook's working
# directory. Supply full paths here if the files live elsewhere.
country_csv_file = 'Country-Code.csv'
data_csv_file = 'new_dataset.csv'
output_jsonl_file = 'restaurants_mongo.jsonl'

print("Starting data migration process...")

# Step 1: load the country-code lookup used to fill in country names.
country_mapping = create_country_code_map(country_csv_file)

# Step 2: run the transformation only when the lookup loaded with at least
# one entry (both None and an empty dict take the failure branch).
if not country_mapping:
    print("Could not create country mapping. Please check 'Country-Code.csv'.")
else:
    print(f"Country code map created with {len(country_mapping)} entries.")
    transform_data(data_csv_file, country_mapping, output_jsonl_file)
    print(f"Transformation complete. Output written to {output_jsonl_file}")
    print(f"You can now import '{output_jsonl_file}' into MongoDB.")

Starting data migration process...
Country code map created with 15 entries.


Successfully transformed 9551 documents.
Transformation complete. Output written to restaurants_mongo.jsonl
You can now import 'restaurants_mongo.jsonl' into MongoDB.


In [5]:
!ls

comparison_file.sql    optimized_queries.sql	setup_to_submit.sql
Country-Code.csv       playground-1.mongodb.js	slow_queries.sql
Dataset_EDA.ipynb      playground-2.mongodb.js	test.R
Mongo_Migration.ipynb  restaurants_mongo.jsonl	zomato.csv
new_dataset.csv        setup.sql		zomato_mongo_pipelines.js
