In [1]:
!source elsds/bin/activate
!pip install elasticsearch pandas numpy
from elasticsearch import Elasticsearch
import pandas as pd
import ast
import numpy as np
import json



In [3]:
# --------------------------------------------
# Read the raw records from actual_data.csv
# --------------------------------------------
df = pd.read_csv(filepath_or_buffer="actual_data.csv")

# --------------------------------------------
# Step 1: Build a normalized counterpart of the DataFrame
# --------------------------------------------

# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_norm = df.copy(deep=True)

# Function to normalize numeric data (min-max scaling).
def normalize_numeric(series):
    """Min-max scale a Series into the [0, 1] range.

    Parameters
    ----------
    series : pandas.Series
        Numeric or boolean data. Boolean input is converted to 0/1 integers
        before scaling.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)`` on the same index. When every value
        is identical (``max - min == 0``) a Series of ``0.0`` is returned, so
        the result is always a Series rather than sometimes a bare scalar.
    """
    # If the series is boolean, convert it to integer.
    if pd.api.types.is_bool_dtype(series):
        series = series.astype(int)
    min_val = series.min()
    max_val = series.max()
    span = max_val - min_val
    # Avoid division by zero: a constant column carries no spread to scale.
    if span != 0:
        return (series - min_val) / span
    # Every position (including missing ones) becomes 0.0, which matches what
    # assigning the scalar 0.0 to a DataFrame column produced before.
    return pd.Series(0.0, index=series.index, name=series.name)

# Create a dictionary to hold mapping information for each column.
# Keys are column names. Each value holds what is needed to reverse the
# normalization:
#   {"type": "numeric" | "datetime", "min": ..., "max": ...}
#   {"type": "categorical", "mapping": {str(normalized_value): category}}
mapping_dict = {}


def _to_native(value):
    """Return a plain Python scalar for a NumPy scalar; pass anything else through."""
    return value.item() if isinstance(value, np.generic) else value


# Iterate over each column and transform its values.
for col in df_norm.columns:
    # If column contains list values, convert them to strings.
    if df_norm[col].apply(lambda x: isinstance(x, list)).any():
        df_norm[col] = df_norm[col].apply(lambda x: str(x) if isinstance(x, list) else x)

    # Process datetime columns: convert to integer timestamps then normalize.
    # NOTE(review): `df` is loaded with pd.read_csv without `parse_dates`, so
    # date-like columns presumably arrive as strings and are handled by the
    # categorical branch below instead -- confirm whether that is intended.
    if pd.api.types.is_datetime64_any_dtype(df_norm[col]):
        series_int = df_norm[col].astype('int64')
        min_val = series_int.min()
        max_val = series_int.max()
        # Save mapping for reverse normalization.
        mapping_dict[col] = {"type": "datetime", "min": int(min_val), "max": int(max_val)}
        df_norm[col] = normalize_numeric(series_int)

    # Process numeric columns.
    elif pd.api.types.is_numeric_dtype(df_norm[col]):
        min_val = df_norm[col].min()
        max_val = df_norm[col].max()
        # Store native Python scalars (as the datetime branch does) so that
        # mapping_dict stays JSON-serializable; NumPy scalars are not.
        mapping_dict[col] = {
            "type": "numeric",
            "min": _to_native(min_val),
            "max": _to_native(max_val),
        }
        df_norm[col] = normalize_numeric(df_norm[col])

    # Process categorical/object columns.
    else:
        # NOTE(review): this relies on astype(str) turning missing values into
        # a literal string so that factorize never emits the sentinel code -1
        # (which would normalize to a negative number) -- confirm on the
        # pandas version in use.
        df_norm[col] = df_norm[col].astype(str)
        codes, uniques = pd.factorize(df_norm[col])
        n = len(uniques)
        # Categories are spread evenly over [0, 1] in order of first appearance.
        # With a single category (or none) every row is 0.0; dividing by 1
        # yields that without a separate special case.
        denom = n - 1 if n > 1 else 1
        # Keys are the string form of each normalized value so the mapping can
        # be serialized to JSON.
        mapping = {str(i / denom): val for i, val in enumerate(uniques)}
        mapping_dict[col] = {"type": "categorical", "mapping": mapping}
        df_norm[col] = codes / denom

# --------------------------------------------
# Step 2: Persist the normalized frame as CSV
# --------------------------------------------
normalized_csv = "normalized_data.csv"
# Row labels are left out so the file contains only the normalized columns.
df_norm.to_csv(path_or_buf=normalized_csv, encoding="utf-8", index=False)
print(f"Normalized data exported to {normalized_csv}")

# --------------------------------------------
# Step 3: Persist the reverse-normalization mapping as CSV
# --------------------------------------------
# One row per column of the normalized frame, holding what is required to undo
# the scaling: min/max for numeric and datetime columns, or a JSON-encoded
# lookup table for categorical columns.
mapping_data = []
for col, info in mapping_dict.items():
    kind = info["type"]
    record = {"column": col, "type": kind}
    if kind == "categorical":
        # The lookup table is nested, so flatten it to a JSON string for the CSV cell.
        record["mapping"] = json.dumps(info["mapping"])
    elif kind in ("numeric", "datetime"):
        record["min"] = info["min"]
        record["max"] = info["max"]
    else:
        # Entries of any other type are not exported.
        continue
    mapping_data.append(record)

mapping_csv = "mapping_data.csv"
mapping_df = pd.DataFrame(mapping_data)
mapping_df.to_csv(path_or_buf=mapping_csv, encoding="utf-8", index=False)
print(f"Mapping data exported to {mapping_csv}")


Normalized data exported to normalized_data.csv
Mapping data exported to mapping_data.csv
