In [23]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime
import zipfile

def log(message, log_file):
    """Log messages to a log file with a timestamp."""
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(log_file, 'a') as f:
        f.write(f"[{timestamp}] {message}\n")

# def unzip_file(zip_path, extract_to):
#     """Unzip the file to the specified directory."""
#     if not os.path.exists(extract_to):
#         os.makedirs(extract_to)  # Create the folder if it doesn't exist
#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#         zip_ref.extractall(extract_to)

def extract_csv(file_path):
    """Extract data from a CSV file."""
    return pd.read_csv(file_path)

def extract_json(file_path):
    """Extract data from a JSON file."""
    return pd.read_json(file_path)

def extract_xml(file_path):
    """Extract data from an XML file."""
    tree = ET.parse(file_path)
    root = tree.getroot()
    all_data = []
    for child in root:
        data = {element.tag: element.text for element in child}
        all_data.append(data)
    return pd.DataFrame(all_data)

def extract_data(data_folder):
    """Extract data from multiple file formats."""
    extracted_data = pd.DataFrame()
    for file in glob.glob(f"{data_folder}/*"):
        if file.endswith('.csv'):
            data = extract_csv(file)
        elif file.endswith('.json'):
            data = extract_json(file)
        elif file.endswith('.xml'):
            data = extract_xml(file)
        else:
            continue
        extracted_data = pd.concat([extracted_data, data], ignore_index=True)
    return extracted_data

def transform_data(data):
    """Transform the data (convert heights and weights)."""
    data['Height'] = data['Height'].astype(float) * 0.0254  # inches to meters
    data['Weight'] = data['Weight'].astype(float) * 0.453592  # pounds to kilograms
    return data

def load_data(data, output_file):
    """Save the transformed data to a CSV file."""
    data.to_csv(output_file, index=False)

def main():
    # Paths
    #zip_file = './source.zip'
    data_folder = r'C:\Users\Ram\Desktop\ME36'
    log_file = r'C:\Users\Ram\Desktop\ME36\log_file.txt'
    output_file = r'C:\Users\Ram\Desktop\ME36/transformed_data3.csv'

    # Clear log file
    if os.path.exists(log_file):
        os.remove(log_file)

    # # Unzipping step
    # log("Unzipping the file.", log_file)
    # unzip_file(zip_file, data_folder)

    log("ETL process started.", log_file)
    
    try:
        # Extraction
        log("Starting data extraction.", log_file)
        extracted_data = extract_data(data_folder)
        log("Data extraction completed.", log_file)
        print(extracted_data)

        # Transformation
        log("Starting data transformation.", log_file)
        transformed_data = transform_data(extracted_data)
        print(transformed_data)
        log("Data transformation completed.", log_file)

        # Loading
        log("Starting data loading.", log_file)
        load_data(transformed_data, output_file)
        log("Data loading completed.", log_file)

        log("ETL process completed successfully.", log_file)

    except Exception as e:
        log(f"ETL process failed: {e}", log_file)

if __name__ == "__main__":
    main()



In [29]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime
import zipfile

def log(message, log_file):
    """Log messages to a log file with a timestamp."""
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(log_file, 'a') as f:
        f.write(f"[{timestamp}] {message}\n")



def extract_csv(file_path):
    """Extract data from a CSV file."""
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading CSV file {file_path}: {e}")
        return pd.DataFrame()

def extract_json(file_path):
    """Extract data from a JSON file."""
    try:
        return pd.read_json(file_path, lines=True)  # Use lines=True for JSON lines format
    except ValueError as e:
        print(f"Error reading JSON file {file_path}: {e}")
        return pd.DataFrame()

def extract_xml(file_path):
    """Extract data from an XML file."""
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        all_data = []
        for child in root:
            data = {element.tag: element.text for element in child}
            all_data.append(data)
        return pd.DataFrame(all_data)
    except ET.ParseError as e:
        print(f"Error reading XML file {file_path}: {e}")
        return pd.DataFrame()

def extract_data(data_folder):
    """Extract data from multiple file formats."""
    extracted_data = pd.DataFrame()
    for file in glob.glob(f"{data_folder}/*"):
        print(f"Processing file: {file}")  # Print each file being processed
        try:
            if file.endswith('.csv'):
                data = extract_csv(file)
            elif file.endswith('.json'):
                data = extract_json(file)
            elif file.endswith('.xml'):
                data = extract_xml(file)
            else:
                continue
            extracted_data = pd.concat([extracted_data, data], ignore_index=True)
        except Exception as e:
            print(f"Error processing file {file}: {e}")
    
    # Normalize column names to lowercase for consistency
    extracted_data.columns = [col.lower() for col in extracted_data.columns]

        # Remove duplicates
    extracted_data = extracted_data.drop_duplicates()

    return extracted_data


def transform_data(data):
    """Transform the data (convert heights and weights)."""
    try:
        data['height'] = data['height'].astype(float) * 0.0254  # inches to meters
        data['weight'] = data['weight'].astype(float) * 0.453592  # pounds to kilograms
    except KeyError as e:
        print(f"Missing column during transformation: {e}")
    except ValueError as e:
        print(f"Data conversion error during transformation: {e}")
    return data

def load_data(data, output_file):
    """Save the transformed data to a CSV file."""
    try:
        data.to_csv(output_file, index=False)
    except Exception as e:
        print(f"Error saving data to {output_file}: {e}")

def main():
    # Paths
   
    data_folder = r'C:\Users\Ram\Desktop\ME36'
    log_file = r'C:\Users\Ram\Desktop\ME36\log_file.txt'
    output_file = r'C:\Users\Ram\Desktop\ME36\transformed_data3.csv'

    # Clear log file
    if os.path.exists(log_file):
        os.remove(log_file)


    log("ETL process started.", log_file)
    
    try:
        # Extraction
        log("Starting data extraction.", log_file)
        extracted_data = extract_data(data_folder)
        log("Data extraction completed.", log_file)
							 

        # Print extracted data for debugging
        print("Extracted Data:")
        print(extracted_data.head())

        # Transformation
        log("Starting data transformation.", log_file)
        transformed_data = transform_data(extracted_data)
							   
        log("Data transformation completed.", log_file)

        # Print transformed data for debugging
        print("Transformed Data:")
        print(transformed_data.head())

        # Loading
        log("Starting data loading.", log_file)
        load_data(transformed_data, output_file)
        log("Data loading completed.", log_file)

        log("ETL process completed successfully.", log_file)

    except Exception as e:
        log(f"ETL process failed: {e}", log_file)
        print(f"ETL process failed: {e}")

if __name__ == "__main__":
    main()



Processing file: C:\Users\Ram\Desktop\ME36\helloworld.ipynb
Processing file: C:\Users\Ram\Desktop\ME36\log_file.txt
Processing file: C:\Users\Ram\Desktop\ME36\source1.csv
Processing file: C:\Users\Ram\Desktop\ME36\source1.json
Processing file: C:\Users\Ram\Desktop\ME36\source1.xml
Processing file: C:\Users\Ram\Desktop\ME36\source2.csv
Processing file: C:\Users\Ram\Desktop\ME36\source2.json
Processing file: C:\Users\Ram\Desktop\ME36\source2.xml
Processing file: C:\Users\Ram\Desktop\ME36\source3.csv
Processing file: C:\Users\Ram\Desktop\ME36\source3.json
Processing file: C:\Users\Ram\Desktop\ME36\source3.xml
Processing file: C:\Users\Ram\Desktop\ME36\test.ipynb
Processing file: C:\Users\Ram\Desktop\ME36\transformed_data2.csv
Processing file: C:\Users\Ram\Desktop\ME36\transformed_data3.csv
Extracted Data:
    name height  weight
0   alex  65.78  112.99
1   ajay  71.52  136.49
2  alice   69.4  153.03
3   ravi  68.22  142.34
4    joe  67.79   144.3
Transformed Data:
    name    height     w

In [None]:
# Task 1: extracting the data and save it in document format
import re

def extract_emails_and_dates(log_file_path):
    #for extracting the specific email and date
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    date_regex = r'\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\b'
    extracted_data = []

    with open(log_file_path, 'r') as file:
        for line in file:
            emails = re.findall(email_regex, line) #cheking exail line by line
            dates = re.findall(date_regex, line) #checking date line by line
            for email in emails:
                for date in dates:
                    extracted_data.append({'email': email, 'date': date})

    return extracted_data

# Test the function
log_file_path = r'C:\Users\Ram\Downloads\mbox.txt'  # Replace with your log file path
data = extract_emails_and_dates(log_file_path)
print(data)

[{'email': 'hu2@iupui.edu', 'date': '2007-12-20 15:25:38'}, {'email': 'hu2@iupui.edu', 'date': '2007-12-20 21:26:28'}, {'email': 'david.horwitz@uct.ac.za', 'date': '2008-01-04 13:05:51'}, {'email': 'josrodri@iupui.edu', 'date': '2007-12-28 23:44:24'}, {'email': 'josrodri@iupui.edu', 'date': '2007-12-12 21:40:33'}, {'email': 'wagnermr@iupui.edu', 'date': '2007-12-17 17:11:08'}, {'email': 'wagnermr@iupui.edu', 'date': '2007-09-12 16:17:59'}, {'email': 'cwen@iupui.edu', 'date': '2007-12-12 15:53:46'}, {'email': 'cwen@iupui.edu', 'date': '2007-12-17 12:16:42'}, {'email': 'cwen@iupui.edu', 'date': '2007-12-15 01:11:33'}, {'email': 'cwen@iupui.edu', 'date': '2007-12-11 14:26:27'}, {'email': 'cwen@iupui.edu', 'date': '2007-11-29 14:35:46'}, {'email': 'cwen@iupui.edu', 'date': '2007-10-31 13:15:28'}, {'email': 'cwen@iupui.edu', 'date': '2007-10-29 14:30:26'}, {'email': 'wagnermr@iupui.edu', 'date': '2007-12-17 15:20:23'}, {'email': 'rjlowe@iupui.edu', 'date': '2007-12-17 10:53:09'}, {'email': 

In [3]:
from datetime import datetime

def transform_data(extracted_data):
    transformed_data = []
    for record in extracted_data:
        try:
            date = datetime.strptime(record['date'], '%Y-%m-%d %H:%M:%S')
            transformed_data.append({
                'email': record['email'],
                'date': date.strftime('%Y-%m-%d %H:%M:%S')
            })
        except ValueError:
            continue  # Skip invalid dates
    return transformed_data

# Test the function
transformed_data = transform_data(data)
print(transformed_data)

[{'email': 'hu2@iupui.edu', 'date': '2007-12-20 15:25:38'}, {'email': 'hu2@iupui.edu', 'date': '2007-12-20 21:26:28'}, {'email': 'david.horwitz@uct.ac.za', 'date': '2008-01-04 13:05:51'}, {'email': 'josrodri@iupui.edu', 'date': '2007-12-28 23:44:24'}, {'email': 'josrodri@iupui.edu', 'date': '2007-12-12 21:40:33'}, {'email': 'wagnermr@iupui.edu', 'date': '2007-12-17 17:11:08'}, {'email': 'wagnermr@iupui.edu', 'date': '2007-09-12 16:17:59'}, {'email': 'cwen@iupui.edu', 'date': '2007-12-12 15:53:46'}, {'email': 'cwen@iupui.edu', 'date': '2007-12-17 12:16:42'}, {'email': 'cwen@iupui.edu', 'date': '2007-12-15 01:11:33'}, {'email': 'cwen@iupui.edu', 'date': '2007-12-11 14:26:27'}, {'email': 'cwen@iupui.edu', 'date': '2007-11-29 14:35:46'}, {'email': 'cwen@iupui.edu', 'date': '2007-10-31 13:15:28'}, {'email': 'cwen@iupui.edu', 'date': '2007-10-29 14:30:26'}, {'email': 'wagnermr@iupui.edu', 'date': '2007-12-17 15:20:23'}, {'email': 'rjlowe@iupui.edu', 'date': '2007-12-17 10:53:09'}, {'email': 

In [10]:
!python -m pip install "pymongo[srv]"

Collecting pymongo[srv]
  Downloading pymongo-4.10.1-cp312-cp312-win_amd64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
Downloading pymongo-4.10.1-cp312-cp312-win_amd64.whl (926 kB)
   ---------------------------------------- 0.0/926.7 kB ? eta -:--:--
   ---------------------------------------- 926.7/926.7 kB 4.7 MB/s eta 0:00:00
Installing collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongodb+srv://ram:2000@cluster0.5dmxi.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [15]:

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongodb+srv://ram:2000@cluster0.5dmxi.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
#mongodb+srv://ram:<db_password>@cluster0.5dmxi.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0
def save_to_mongodb(data, db_name='logs', collection_name='user_history'):
    client = MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    collection.insert_many(data)
    print(f"Inserted {len(data)} records into MongoDB collection '{collection_name}'.")

# Test the function
save_to_mongodb(transformed_data)


Inserted 343 records into MongoDB collection 'user_history'.


In [16]:
!pip install mysql-connector-python




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:

from pymongo import MongoClient
import mysql.connector

def fetch_from_mongodb(mongo_uri='mongodb+srv://ram:2000@cluster0.5dmxi.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0', db_name='logs', collection_name='user_history'):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]
    return list(collection.find({}, {'_id': 0}))

def save_to_mysql(data, table_name='user_history'):
    conn = mysql.connector.connect(
        host="localhost",
        port=3306,  # Default MySQL port
        user="root",  
        password="",  
        database="test" 
    )
    cursor = conn.cursor()
    
    # Create table
    cursor.execute(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            email VARCHAR(255) NOT NULL,
            date DATETIME NOT NULL
        );
    """)

    # Insert data
    cursor.executemany(f"""
        INSERT INTO {table_name} (email, date) VALUES (%s, %s);
    """, [(record['email'], record['date']) for record in data])

    conn.commit()
    conn.close()
    print(f"Inserted {len(data)} records into MySQL table '{table_name}'.")

# Test the function
mongodb_data = fetch_from_mongodb()
save_to_mysql(mongodb_data)


Inserted 343 records into MySQL table 'user_history'.


In [None]:
query = "SELECT DISTINCT email FROM user_history;"
cursor.execute(query)
unique_emails = cursor.fetchall()
print(unique_emails)
