In [33]:
import pandas as pd
import json
import pyodbc
import random

In [50]:
def table_exists(cursor, table_name):
    """Return True if a table called ``table_name`` is listed in INFORMATION_SCHEMA.TABLES.

    Parameters
    ----------
    cursor : DB-API style cursor
        Must support ``execute(sql, params)`` with ``?`` placeholders and ``fetchone()``.
    table_name : str
        Name of the table to look for (matched against the TABLE_NAME column).

    Returns
    -------
    bool
        True when the query returns at least one row, False otherwise.
    """
    # Bind the name as a parameter instead of formatting it into the SQL text:
    # a name containing a quote can no longer break or alter the statement.
    cursor.execute(
        "SELECT 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = ?",
        (table_name,),
    )
    return cursor.fetchone() is not None

# Single-character substitutions applied to raw column labels.
# A value of None deletes the character.
_COLUMN_CHAR_MAP = str.maketrans({
    ' ': '_',
    '(': None,
    ')': None,
    '/': '_',
    '-': '_',
    '%': 'percent',
    '.': '_',
    '?': None,
    "'": None,
    '"': None,
    ',': None,
    '&': 'and',
})


def _sanitize_column_name(column):
    """Turn a raw column label into the identifier used in the CREATE TABLE statement."""
    name = str(column).strip().translate(_COLUMN_CHAR_MAP)
    # A single replace('__', '_') pass leaves 'a___b' as 'a__b'; loop until
    # every run of underscores has been collapsed to one.
    while '__' in name:
        name = name.replace('__', '_')
    return name.strip()


def create_table(cursor, table_name, columns, column_types):
    """Create ``table_name`` with one column per entry of ``columns``.

    Each column label is sanitised first: surrounding whitespace is stripped,
    spaces and ``/ - .`` become underscores, ``( ) ? ' " ,`` are removed,
    ``%`` becomes ``percent``, ``&`` becomes ``and`` and repeated underscores
    are collapsed to a single one.

    Parameters
    ----------
    cursor : DB-API style cursor
        Used to execute the CREATE TABLE statement.
    table_name : str
        Name of the table to create; inserted into the statement as-is.
    columns : iterable
        Raw column labels (non-string labels are converted with ``str``).
    column_types : sequence of str
        SQL type for each column, matched to ``columns`` by position.

    Raises
    ------
    IndexError
        If ``column_types`` has fewer entries than ``columns``.
    """
    clean_names = [_sanitize_column_name(column) for column in columns]
    # Build the "name TYPE, name TYPE, ..." part of the statement.
    column_str = ', '.join(f'{col} {column_types[i]}' for i, col in enumerate(clean_names))
    create_query = f"CREATE TABLE {table_name} ({column_str})"
    cursor.execute(create_query)

def get_column_types(df):
    """Map every column of ``df`` to a SQL Server column type.

    Columns whose dtype equals ``'int64'`` map to ``INT``, those equal to
    ``'float64'`` map to ``FLOAT``, and anything else (object or any other
    dtype) maps to ``NVARCHAR(MAX)``.

    Returns a list of type names in the same order as ``df.columns``.
    """
    # Checked in order; the first matching pandas dtype name wins.
    known_types = (
        ('int64', 'INT'),
        ('float64', 'FLOAT'),
    )

    def to_sql_type(dtype):
        for pandas_name, sql_name in known_types:
            if dtype == pandas_name:
                return sql_name
        # Fallback for object and every other unrecognised dtype.
        return 'NVARCHAR(MAX)'

    return [to_sql_type(df[name].dtype) for name in df.columns]

def split_data(csv_file, json_file, database_name, table_name, json_split=0.3, db_split=0.3, csv_split=0.4,
               csv_output='data/splitted/output.csv', server='LAPTOP-K8C2EPLP\\SQLEXPRESS'):
    """Shuffle a CSV file and distribute its rows over a JSON file, a SQL Server table and a CSV file.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to read.
    json_file : str
        Path of the JSON Lines file that receives the first slice of rows.
    database_name : str
        Database named in the connection string.
    table_name : str
        Table that receives the second slice; created if it does not exist.
    json_split : float
        Fraction of rows written to ``json_file``.
    db_split : float
        Fraction of rows inserted into the database table.
    csv_split : float
        Accepted for backward compatibility but not used: the CSV output
        always receives every row left over after the JSON and database
        slices.
    csv_output : str
        Path of the CSV file that receives the remaining rows.
    server : str
        SQL Server instance named in the connection string.
    """
    # Load data from CSV file using pandas
    df = pd.read_csv(csv_file)

    # Shuffle the rows (fixed seed, so the split is reproducible)
    df = df.sample(frac=1, random_state=42)

    # Calculate the slice boundaries; the CSV slice is whatever remains
    total_rows = len(df)
    json_rows = int(total_rows * json_split)
    db_rows = int(total_rows * db_split)

    # Split the data into JSON, database, and CSV
    json_data = df.iloc[:json_rows]
    db_data = df.iloc[json_rows:json_rows + db_rows]
    csv_data = df.iloc[json_rows + db_rows:]

    # Save the JSON data as one JSON record per line
    json_data.to_json(json_file, orient='records', lines=True)

    # Save the CSV data (header row included, index omitted)
    csv_data.to_csv(csv_output, index=False)

    # Get column types for the database table
    column_types = get_column_types(df)

    # Save the database data to the specified database and table
    connection_string = f'DRIVER=SQL Server;SERVER={server};DATABASE={database_name};Trusted_Connection=yes;'
    with pyodbc.connect(connection_string) as conn:
        with conn.cursor() as cursor:
            # Create the table if it doesn't exist
            if not table_exists(cursor, table_name):
                create_table(cursor, table_name, df.columns, column_types)
                conn.commit()
            # Cast to object before replacing missing values: on numeric
            # columns where(..., None) can leave NaN in place. The cast also
            # turns numpy scalars into plain Python values for the driver.
            db_records = db_data.astype(object).where(db_data.notna(), None)
            # One plain tuple per row, in column order
            values = list(db_records.itertuples(index=False, name=None))
            # One '?' placeholder per column
            placeholders = ', '.join(['?'] * len(df.columns))
            query = f'INSERT INTO {table_name} VALUES ({placeholders})'
            # Skip the insert when the database slice is empty; executemany
            # is not expected to accept an empty parameter list.
            if values:
                cursor.executemany(query, values)
                conn.commit()

        


In [51]:

# Example usage: run the split on the electric-vehicle population CSV
split_data(
    csv_file='data/Electric_Vehicle_Population_Size_History_By_County.csv',
    json_file='data/splitted/json.json',
    database_name='splitted',
    table_name='Vehicules',
    json_split=0.3,
    db_split=0.3,
    csv_split=0.4,
)
