In [None]:
import os
import mysql.connector
from mysql.connector import Error
import pandas as pd
from dotenv import load_dotenv

In [None]:
# Load environment variables from .env file
load_dotenv()

def get_db_config():
    """Build the keyword arguments for the database connection.

    Every value is read from the process environment. DB_HOST, DB_PORT,
    DB_CHARSET and DB_COLLATION fall back to defaults when unset; DB_USER,
    DB_PASSWORD and DB_NAME come back as None when unset.

    Returns:
        dict with the keys host, user, password, database, port (an int),
        charset and collation.
    """
    env = os.getenv
    return dict(
        host=env('DB_HOST', 'localhost'),
        user=env('DB_USER'),
        password=env('DB_PASSWORD'),
        database=env('DB_NAME'),
        port=int(env('DB_PORT', '3306')),
        charset=env('DB_CHARSET', 'utf8mb4'),
        collation=env('DB_COLLATION', 'utf8mb4_general_ci'),
    )

def create_connection():
    """Open a database connection using the settings from get_db_config().

    Progress and failures are reported with print(); on failure the
    settings are echoed with the password left out.

    Returns:
        The open connection, or None when connecting raised a connector
        Error or the new connection does not report itself as connected.
    """
    try:
        settings = get_db_config()
        conn = mysql.connector.connect(**settings)
        if not conn.is_connected():
            return None
        print(f"Successfully connected to MariaDB at {settings['host']}")
        print(f"Connected to server version {conn.get_server_info()}")
        print(f"Database: {settings['database']}")
        return conn
    except Error as e:
        print(f"Error connecting to MariaDB: {e}")
        print("\nConnection details (excluding password):")
        # Re-read the settings and drop the secret before echoing them.
        safe_config = dict(get_db_config())
        safe_config.pop('password', None)
        print(safe_config)
        return None

def list_tables():
    """List all available tables in the database.

    Opens a connection with create_connection(), runs SHOW TABLES, prints a
    numbered list of the table names and closes the connection again.

    Returns:
        A list of table names, or None when the connection failed, the
        database has no tables, or the query raised a connector Error.
    """
    connection = create_connection()
    if not connection:
        print("Failed to connect to database")
        return None

    # Bound before the try block so the finally clause can always test it,
    # even when connection.cursor() itself raises.
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute("SHOW TABLES")
        table_names = [row[0] for row in cursor.fetchall()]

        if not table_names:
            print("No tables found in the database")
            return None

        print("\nAvailable tables:")
        for i, name in enumerate(table_names, 1):
            print(f"{i}. {name}")

        return table_names

    except Error as e:
        print(f"Error listing tables: {e}")
        return None
    finally:
        if cursor is not None:
            cursor.close()
        if connection.is_connected():
            connection.close()
            print("Database connection closed")

def download_table(table_name, output_format='csv'):
    """Download an entire table from MariaDB.

    Args:
        table_name: Name of the table to download.
        output_format: 'csv' writes "<table_name>.csv" to the current
            working directory; any other value (e.g. 'pandas') returns the
            data as a DataFrame.

    Returns:
        DataFrame if output_format is not 'csv'. None when the data was
        written to CSV, the connection failed, the table does not exist,
        the table is empty, or a connector Error occurred.
    """
    connection = create_connection()
    if not connection:
        print("Failed to connect to database")
        return None

    # Bound before the try block so the finally clause can always test it,
    # even when connection.cursor() itself raises.
    cursor = None
    try:
        cursor = connection.cursor(dictionary=True)

        # Check if table exists. The name is passed as a parameter so it is
        # escaped by the driver; fetchall() drains the result set before the
        # next statement is executed on this cursor.
        cursor.execute("SHOW TABLES LIKE %s", (table_name,))
        if not cursor.fetchall():
            print(f"Table '{table_name}' does not exist in the database")
            return None

        # Identifiers cannot be bound as parameters, so quote the name with
        # backticks and double any embedded backtick.
        quoted_table = "`" + str(table_name).replace("`", "``") + "`"

        # Get all data
        cursor.execute(f"SELECT * FROM {quoted_table}")
        rows = cursor.fetchall()

        if not rows:
            print(f"The table '{table_name}' is empty")
            return None

        # Report the number of rows actually fetched.
        print(f"Downloaded {len(rows)} rows from '{table_name}'")

        # Each row is a dict keyed by column name, so the DataFrame picks up
        # the column names from the rows themselves.
        df = pd.DataFrame(rows)

        if output_format == 'csv':
            filename = f"{table_name}.csv"
            df.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
            return None
        return df

    except Error as e:
        print(f"Error downloading table: {e}")
        return None
    finally:
        if cursor is not None:
            cursor.close()
        if connection.is_connected():
            connection.close()
            print("Database connection closed")

In [None]:
# Demo run: the table name is fixed here, so no command-line arguments are read.
if __name__ == "__main__":
    # Step 1: show every table in the configured database.
    print("Listing all tables:")
    tables = list_tables()

    # Step 2: export the 'embeddings' table to a CSV file.
    print("\nDownloading 'embeddings' table:")
    download_table("embeddings", output_format='csv')

# Upload

In [None]:
import os
import mysql.connector
from mysql.connector import Error
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

def get_db_config():
    """Collect the database connection settings from the environment.

    DB_HOST, DB_PORT, DB_CHARSET and DB_COLLATION have defaults; DB_USER,
    DB_PASSWORD and DB_NAME are None when the variable is not set. The
    port is converted to an int.
    """
    config = {}
    config['host'] = os.getenv('DB_HOST', 'localhost')
    config['user'] = os.getenv('DB_USER')
    config['password'] = os.getenv('DB_PASSWORD')
    config['database'] = os.getenv('DB_NAME')
    config['port'] = int(os.getenv('DB_PORT', '3306'))
    config['charset'] = os.getenv('DB_CHARSET', 'utf8mb4')
    config['collation'] = os.getenv('DB_COLLATION', 'utf8mb4_general_ci')
    return config

def create_connection():
    """Connect to the database described by get_db_config().

    Prints the host, server version and database name on success. On a
    connector Error it prints the error followed by the settings with the
    password removed.

    Returns:
        The connection when it reports itself as connected, otherwise None.
    """
    result = None
    try:
        params = get_db_config()
        candidate = mysql.connector.connect(**params)
        if candidate.is_connected():
            print(f"Successfully connected to MariaDB at {params['host']}")
            print(f"Connected to server version {candidate.get_server_info()}")
            print(f"Database: {params['database']}")
            result = candidate
    except Error as e:
        print(f"Error connecting to MariaDB: {e}")
        print("\nConnection details (excluding password):")
        safe_config = {}
        for key, value in get_db_config().items():
            if key == 'password':
                continue  # never echo the secret
            safe_config[key] = value
        print(safe_config)
        result = None
    return result

def _quote_identifier(name):
    """Return *name* as a backtick-quoted identifier, doubling embedded backticks."""
    return "`" + str(name).replace("`", "``") + "`"


def _infer_sql_type(series):
    """Map one column of the CSV sample to the SQL type used in CREATE TABLE."""
    dtype = series.dtype
    if dtype == 'object':
        # Object columns that still parse as numbers are stored as FLOAT;
        # anything else is stored as TEXT.
        try:
            pd.to_numeric(series, errors='raise')
        except (ValueError, TypeError):
            return "TEXT"
        return "FLOAT"
    if pd.api.types.is_integer_dtype(dtype):
        return "INT"
    if pd.api.types.is_float_dtype(dtype):
        return "FLOAT"
    if pd.api.types.is_bool_dtype(dtype):
        return "BOOLEAN"
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "DATETIME"
    return "TEXT"


def _insert_dataframe(connection, cursor, quoted_table, frame, batch_size=1000):
    """Insert every row of *frame* into *quoted_table*, committing per batch."""
    placeholders = ', '.join(['%s'] * len(frame.columns))
    columns = ', '.join(_quote_identifier(col) for col in frame.columns)
    insert_sql = f"INSERT INTO {quoted_table} ({columns}) VALUES ({placeholders})"

    # NaN becomes None so that missing cells are sent as SQL NULL.
    rows = frame.replace({np.nan: None}).values.tolist()

    # Execute in batches to bound the size of each executemany() call.
    for batch_number, start in enumerate(range(0, len(rows), batch_size), 1):
        batch = rows[start:start + batch_size]
        cursor.executemany(insert_sql, batch)
        connection.commit()
        print(f"Inserted batch {batch_number} ({len(batch)} rows)")


def upload_csv_to_db(csv_file_path, table_name, if_exists='replace', chunk_size=None):
    """Upload a CSV file to a MariaDB table.

    Column types are inferred from the first 1000 rows of the file. Column
    names are reduced to their alphanumeric and underscore characters
    before they are used in SQL. Rows are inserted and committed in batches
    of 1000, so a failure part-way through leaves the rows committed so far
    in the table.

    When chunk_size is given the file is streamed: each chunk is read,
    inserted and released before the next one is read, so only one chunk is
    held in memory at a time. In that mode an existing table is dropped
    (if_exists='replace') before the bulk of the file has been parsed.

    Args:
        csv_file_path: Path to the CSV file.
        table_name: Name of the table to create or use.
        if_exists: What to do if the table already exists: 'replace' drops
            and recreates it, 'fail' aborts; any other value (e.g.
            'append') inserts into the existing table.
        chunk_size: Process the CSV in chunks of this many rows (None or 0
            to load the entire file at once).

    Returns:
        bool: True if successful, False otherwise. Errors are printed, not
        raised.
    """
    connection = create_connection()
    if not connection:
        print("Failed to connect to database")
        return False

    cursor = None
    try:
        # Sample the head of the file to learn the columns and their types.
        sample_size = 1000
        sample_df = pd.read_csv(csv_file_path, nrows=sample_size)

        print(f"Detected {len(sample_df.columns)} columns in CSV file")

        # Original header -> SQL-safe column name.
        column_name_mapping = {
            column: ''.join(ch for ch in column if ch.isalnum() or ch == '_')
            for column in sample_df.columns
        }
        sample_df = sample_df.rename(columns=column_name_mapping)

        column_types = {
            column: _infer_sql_type(sample_df[column])
            for column in sample_df.columns
        }

        # Whole-file mode reads everything up front, before the table is
        # touched; chunk mode defers reading to the insert loop below.
        df = None
        if not chunk_size:
            df = pd.read_csv(csv_file_path)
            print(f"Read {len(df)} rows from {csv_file_path}")
            df = df.rename(columns=column_name_mapping)

        # Identifiers cannot be bound as parameters, so the table name is
        # backtick-quoted wherever it appears as an identifier.
        quoted_table = _quote_identifier(table_name)

        cursor = connection.cursor()

        # Check if table exists. The name is bound as a parameter here;
        # fetchall() drains the result set before the next statement.
        # NOTE: LIKE still treats '_' and '%' in the name as wildcards.
        cursor.execute("SHOW TABLES LIKE %s", (table_name,))
        table_exists = bool(cursor.fetchall())

        if table_exists:
            if if_exists == 'fail':
                print(f"Table '{table_name}' already exists. Aborting.")
                return False
            if if_exists == 'replace':
                print(f"Dropping existing table '{table_name}'")
                cursor.execute(f"DROP TABLE {quoted_table}")
                connection.commit()
                table_exists = False

        # Create table if it doesn't exist
        if not table_exists:
            column_defs = []
            for column in sample_df.columns:
                sql_type = column_types.get(column, "TEXT")

                # Special handling for MoveIn column which caused the error
                # in the past: always store it as TEXT.
                if column == 'MoveIn':
                    sql_type = "TEXT"

                column_defs.append(f"{_quote_identifier(column)} {sql_type}")

            create_table_sql = f"CREATE TABLE {quoted_table} ({', '.join(column_defs)})"
            print(f"Creating table with SQL: {create_table_sql}")
            cursor.execute(create_table_sql)
            connection.commit()
            print(f"Table '{table_name}' created successfully")

        # Insert data from chunks or entire dataframe
        if chunk_size:
            total_rows = 0
            reader = pd.read_csv(csv_file_path, chunksize=chunk_size)
            try:
                for chunk_number, chunk in enumerate(reader, 1):
                    chunk = chunk.rename(columns=column_name_mapping)
                    _insert_dataframe(connection, cursor, quoted_table, chunk)
                    total_rows += len(chunk)
                    print(f"Processed chunk {chunk_number} with {len(chunk)} rows")
            finally:
                # Release the file handle held by the chunk reader.
                reader.close()
            print(f"Successfully uploaded {total_rows} rows to table '{table_name}'")
        else:
            _insert_dataframe(connection, cursor, quoted_table, df)
            print(f"Successfully uploaded {len(df)} rows to table '{table_name}'")

        return True

    except Error as e:
        print(f"Error uploading CSV to database: {e}")
        return False
    except Exception as e:
        # Boundary handler: report anything else (bad file, parse error, ...)
        # with a traceback and signal failure through the return value.
        print(f"General error: {e}")
        import traceback
        traceback.print_exc()
        return False
    finally:
        if cursor:
            cursor.close()
        if connection and connection.is_connected():
            connection.close()
            print("Database connection closed")

# Demo run: edit the two values below to point at your own CSV file and table.
if __name__ == "__main__":
    # Source CSV file on disk.
    csv_file = r"d:\OneDrive - Green Energy\Desktop\properties_202503181522.csv"
    # Destination table in the database.
    table_name = "property"
    # Upload in chunks of 50 rows.
    upload_csv_to_db(csv_file_path=csv_file, table_name=table_name, chunk_size=50)