In [1]:
import os
import sqlite3

import pandas as pd

In [2]:
# Output: SQLite database file that receives the transformed rows
sqlite_db_path = '../db/providers.db'
# Input: raw providers spreadsheet exported as CSV
csv_file_path = './providers.csv'

In [3]:
# Settings consumed by the pd.read_csv call that loads the raw file
limit = None  # maximum number of rows to read (nrows); None reads to the end of the file
cursor = 0  # extra leading rows to skip, added on top of CSV_HEADER_OFFSET
CSV_HEADER_OFFSET = 2  # number of rows at the top of the file that are always skipped

In [4]:
# Load the raw CSV: skip CSV_HEADER_OFFSET + cursor leading rows, read at most
# `limit` rows, and read every column with dtype=str so no numeric inference
# happens at load time (parsing is done explicitly later).
df = pd.read_csv(
    csv_file_path,
    skiprows=CSV_HEADER_OFFSET + cursor,
    nrows=limit,
    dtype=str,
)

In [5]:
def parse_int(value):
    """Return the integer spelled by the digit characters found in ``value``.

    Every non-digit character (units, separators, signs, decimal points) is
    discarded before conversion, so ``'16 GB'`` gives ``16`` and ``'1,024'``
    gives ``1024``.

    Returns ``None`` when ``value`` is null or when the remaining characters
    cannot be converted to an int (for example when no digit is present).
    """
    try:
        if not pd.notnull(value):
            return None
        digits_only = ''.join(ch for ch in str(value) if ch.isdigit())
        return int(digits_only)
    except ValueError:
        return None

def parse_float(value):
    """Return the float spelled by the digits and periods found in ``value``.

    All characters other than digits and ``'.'`` are dropped. When more than
    one period survives, only the first is kept as the decimal point and the
    digits after the later periods are appended (``'3.53.9'`` -> ``3.539``).

    Returns ``None`` when ``value`` is null or when the cleaned text is not a
    valid float (for example when it is empty).
    """
    try:
        kept = ''.join(ch for ch in str(value) if ch.isdigit() or ch == '.')
        # Split at the first period; any further periods in the tail are removed.
        whole, point, fraction = kept.partition('.')
        number_text = whole + point + fraction.replace('.', '')
        if not pd.notnull(value):
            return None
        return float(number_text)
    except ValueError:
        return None

def parse_bool(value):
    """Interpret ``value`` as a boolean flag.

    Returns ``None`` for null input. Otherwise returns ``True`` only when the
    text form of ``value`` equals ``'true'``, ignoring case and surrounding
    whitespace; anything else gives ``False``.
    """
    if pd.isnull(value):
        return None
    # Go through str() so a non-string cell (e.g. an actual bool) does not
    # raise AttributeError, and strip so padded text such as ' TRUE ' matches.
    return str(value).strip().lower() == 'true'

In [6]:
def transform_row(row):
    """Map one raw CSV row, addressed by column position, to a providers record.

    Parameters
    ----------
    row : pandas.Series
        One row of the raw frame; every field is read by position via
        ``row.iloc``.

    Returns
    -------
    dict or None
        ``None`` when column 1 (the provider name) is null, so the row is
        skipped. Otherwise a dict whose keys are the ``providers`` table
        column names, with numeric and boolean fields parsed from text.
    """
    if pd.isnull(row.iloc[1]):
        return None

    # gpuMemory combines two source cells as '<col 64>_<col 65>'. When both
    # cells are empty, keep the field null instead of formatting two NaNs into
    # the literal text 'nan_nan'. A single missing cell keeps the original
    # formatting.
    gpu_memory_cells = (row.iloc[64], row.iloc[65])
    if all(pd.isnull(cell) for cell in gpu_memory_cells):
        gpu_memory = None
    else:
        gpu_memory = f"{gpu_memory_cells[0]}_{gpu_memory_cells[1]}"

    return {
        'providerName': row.iloc[1],
        'productName': row.iloc[2],
        'country': row.iloc[3],
        'location': row.iloc[4],
        'cpuCores': parse_int(row.iloc[6]),
        'cpuThreads': parse_int(row.iloc[8]),
        'cpuGHZ': parse_float(row.iloc[9]),
        'hasSGX': parse_bool(row.iloc[10]),
        'ram': parse_int(row.iloc[41]),
        'numberDrives': parse_int(row.iloc[43]),
        'avgSizeDrive': parse_int(row.iloc[44]),
        'storageTotal': parse_int(row.iloc[62]),
        'gpuType': row.iloc[63],
        'gpuMemory': gpu_memory,
        'bandwidthNetwork': parse_int(row.iloc[67]),
        'network': parse_int(row.iloc[68]),
        'priceHour': parse_float(row.iloc[74]),
        'priceMonth': parse_float(row.iloc[77]),
        'availability': row.iloc[79],
        'source': row.iloc[80],
        'unit': row.iloc[82],
    }

In [7]:
# Run transform_row on every CSV row, discard the rows it rejected (None),
# and collect the remaining record dicts into a plain list
transformed_data = (
    df.apply(transform_row, axis=1)
    .dropna()
    .tolist()
)

In [8]:
# Build a tabular frame from the record dicts (one column per dict key)
transformed_df = pd.DataFrame(data=transformed_data)

In [9]:
# Ensure the directory for the database exists.
# A bare filename has an empty dirname and os.makedirs('') raises
# FileNotFoundError, so only create a directory when the path names one.
db_dir = os.path.dirname(sqlite_db_path)
if db_dir:
    os.makedirs(db_dir, exist_ok=True)

In [10]:
# Write the transformed data to the `providers` table of the SQLite database,
# creating the table (with an auto-incrementing id column) if it is missing.
# NOTE: rows are appended on every run, so re-running this cell against the
# same database file inserts the same rows again.
#
# No name called `cursor` is bound here on purpose: `cursor` is the row offset
# used when reading the CSV, and overwriting it with a database cursor would
# break a later re-run of the read cell.
conn = sqlite3.connect(sqlite_db_path)
try:
    # `with conn` only scopes the transaction (commit on success, rollback on
    # error); it does not close the connection, hence the explicit close below.
    with conn:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS providers (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                providerName TEXT,
                productName TEXT,
                country TEXT,
                location TEXT,
                cpuCores INTEGER,
                cpuThreads INTEGER,
                cpuGHZ REAL,
                hasSGX BOOLEAN,
                ram INTEGER,
                numberDrives INTEGER,
                avgSizeDrive INTEGER,
                storageTotal INTEGER,
                gpuType TEXT,
                gpuMemory TEXT,
                bandwidthNetwork INTEGER,
                network INTEGER,
                priceHour REAL,
                priceMonth REAL,
                availability TEXT,
                source TEXT,
                unit TEXT
            )
        ''')

        # Values are bound by position, so the DataFrame column order must
        # match the column list of the INSERT statement.
        conn.executemany('''
            INSERT INTO providers (
                providerName, productName, country, location, cpuCores, cpuThreads, cpuGHZ, hasSGX, ram, numberDrives, avgSizeDrive, storageTotal, gpuType, gpuMemory, bandwidthNetwork, network, priceHour, priceMonth, availability, source, unit
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', transformed_df.itertuples(index=False, name=None))
finally:
    conn.close()