In [None]:
!python -m pip install "pymongo[srv]"

In [26]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongodb+srv://ram:2000@cluster0.5dmxi.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0" #mongo_connection_url

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [37]:
import re
from datetime import datetime
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Task 1: Extracting the data (emails and dates)
def extract_emails_and_dates(log_file_path):
    email_regex = r'^From\s+([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,})'
    date_regex = r'^Date:\s+(.+)'  # Matches "Date: Sat, 5 Jan 2008 09:12:18 -0500"
    extracted_data = []

    with open(log_file_path, 'r', encoding='utf-8') as file:
        email = None
        date = None

        for line in file:
            try:
                email_match = re.search(email_regex, line)
                date_match = re.search(date_regex, line)

                if email_match:
                    email = email_match.group(1)  # Extract email

                if date_match:
                    date = date_match.group(1)  # Extract date

                if email and date:
                    extracted_data.append({'email': email, 'date': date})
                    email, date = None, None  # Reset for the next entry
            
            except Exception as e:
                print(f"Error processing line: {line.strip()} - {e}")

    return extracted_data

# Task 2: Transforming the data (standardizing date format)
def transform_data(extracted_data):
    transformed_data = []
    
    for record in extracted_data:
        try:
            date_str = record['date']
            
            # Remove any text inside parentheses (optional extra info)
            date_str = re.sub(r'\(.*?\)', '', date_str).strip()

            # List of possible date formats to try
            date_formats = [
                "%Y-%m-%d %H:%M:%S %z",  # 2008-01-05 09:12:07 -0500
                "%a, %d %b %Y %H:%M:%S %z"  # Sat, 5 Jan 2008 09:12:18 -0500
            ]
            
            # Try parsing with different formats
            for date_format in date_formats:
                try:
                    date = datetime.strptime(date_str, date_format)
                    transformed_data.append({
                        'email': record['email'],
                        'date': date.strftime('%Y-%m-%d %H:%M:%S')  # Standardized format
                    })
                    break  # Stop after successful parsing
                except ValueError:
                    continue  # Try the next format
            
            else:
                # If no format matched, print an error
                print(f"Skipping invalid date: {record['date']} - No valid format found")
        
        except Exception as e:
            print(f"Skipping invalid date: {record['date']} - Error: {e}")
            continue  # Skip invalid date entries
    
    return transformed_data

# Task 3: Save the data to MongoDB
uri = "mongodb+srv://ram:2000@cluster0.5dmxi.mongodb.net/?retryWrites=true&w=majority&appName=Cluster1"  # MongoDB connection URI
# def save_to_mongodb(data, db_name='logs', collection_name='user_history'):
#     client = MongoClient(uri, server_api=ServerApi('1'))
#     db = client[db_name]
#     collection = db[collection_name]
#     collection.insert_many(data)
#     print(f"Inserted {len(data)} records into MongoDB collection '{collection_name}'.")
def save_to_mongodb(data, db_name='test_db', collection_name='user_history'):
    try:
        client = MongoClient(uri, server_api=ServerApi('1'))
        db = client[db_name]  # MongoDB automatically creates the database if it doesn't exist

        # Explicitly create the collection if it doesn't exist
        collection = db.get_collection(collection_name)

        # Now insert the data
        if data:
            result = collection.insert_many(data)
            print(f"Inserted {len(result.inserted_ids)} records into MongoDB collection '{collection_name}'.")
        else:
            print(f"No data to insert into MongoDB collection '{collection_name}'.")

    except Exception as e:
        print(f"Error during MongoDB connection or data insertion: {e}")


# Main execution
log_file_path = r'C:\Users\Ram\Downloads\mbox.txt'  # Replace with your actual log file path

# Step 1: Extract emails and dates from the log file
extracted_data = extract_emails_and_dates(log_file_path)
print(f"Extracted data: {extracted_data}")

# Step 2: Transform the extracted data (standardizing the date format)
transformed_data = transform_data(extracted_data)
print(f"Transformed data: {transformed_data}")

# Step 3: Save the transformed data to MongoDB
save_to_mongodb(transformed_data)


Extracted data: [{'email': 'stephen.marquard@uct.ac.za', 'date': 'Sat, 5 Jan 2008 09:12:18 -0500'}, {'email': 'louis@media.berkeley.edu', 'date': '2008-01-05 09:12:07 -0500 (Sat, 05 Jan 2008)'}, {'email': 'zqian@umich.edu', 'date': '2008-01-04 18:08:50 -0500 (Fri, 04 Jan 2008)'}, {'email': 'rjlowe@iupui.edu', 'date': '2008-01-04 16:09:01 -0500 (Fri, 04 Jan 2008)'}, {'email': 'zqian@umich.edu', 'date': '2008-01-04 15:44:39 -0500 (Fri, 04 Jan 2008)'}, {'email': 'rjlowe@iupui.edu', 'date': '2008-01-04 15:01:37 -0500 (Fri, 04 Jan 2008)'}, {'email': 'cwen@iupui.edu', 'date': '2008-01-04 14:48:37 -0500 (Fri, 04 Jan 2008)'}, {'email': 'cwen@iupui.edu', 'date': '2008-01-04 11:35:25 -0500 (Fri, 04 Jan 2008)'}, {'email': 'gsilver@umich.edu', 'date': '2008-01-04 11:33:05 -0500 (Fri, 04 Jan 2008)'}, {'email': 'gsilver@umich.edu', 'date': '2008-01-04 11:11:00 -0500 (Fri, 04 Jan 2008)'}, {'email': 'zqian@umich.edu', 'date': '2008-01-04 11:10:04 -0500 (Fri, 04 Jan 2008)'}, {'email': 'gsilver@umich.ed

In [None]:
#task 4

from pymongo import MongoClient
import mysql.connector

def fetch_from_mongodb(mongo_uri='mongodb+srv://ram:2000@cluster0.5dmxi.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0', db_name='logs', collection_name='user_history'):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]
    return list(collection.find({}, {'_id': 0}))

def save_to_mysql(data, table_name='user_history'):
    conn = mysql.connector.connect(
        host="localhost",
        port=3306,  # Default MySQL port
        user="root",  
        password="",  
        database="test" 
    )
    cursor = conn.cursor()
    
    # Create table
    cursor.execute(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            email VARCHAR(255) NOT NULL,
            date DATETIME NOT NULL
        );
    """)

    # Insert data
    cursor.executemany(f"""
        INSERT INTO {table_name} (email, date) VALUES (%s, %s);
    """, [(record['email'], record['date']) for record in data])

    conn.commit()
    conn.close()
    print(f"Inserted {len(data)} records into MySQL table '{table_name}'.")

# Test the function
mongodb_data = fetch_from_mongodb()
save_to_mysql(mongodb_data)
