In [40]:
# Importing standard libraries
import os

# Importing third party libraries
import yaml
import pandas as pd

# Importing custom libraries
from tools.sql_tools import write_to_database
from tools.logs import log_wrap

In [41]:
#-----------FOR THE BOIS----------------------

#--------LIBRARIES FOR THE BOIS (THE ONES I USED, THE USUAL ONES)-----------

#COOL LIBRARIES
import pandas as pd
import numpy as np
import os
from tools.logs import log_wrap
import logging



# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Create logger instance
logger = logging.getLogger(__name__)




# SET PATHS OF 3 COOL CSVs
FILENAME_sales_phases_funnel_df = os.path.join(os.getcwd(), r'data/sale_phases_funnel.csv')
FILENAME_zipcode_df = os.path.join(os.getcwd(), r'data/zipcode_eae.csv')
FILENAME_meteo_df = os.path.join(os.getcwd(), r'data/meteo_eae.csv')


#SETTING TYPES
#Sales
SALES_TYPES = {'LEAD_ID':'str','FINANCING_TYPE':'str',
                    'CURRENT_PHASE':'str','PHASE_PRE_KO':'str',
                    'IS_MODIFIED':'bool','ZIPCODE':'str', 
                    'VISITING_COMPANY': 'str', 'KO_REASON': 'str', 
                    'INSTALLATION_PEAK_POWER_KW': 'float64', 
                    'INSTALLATION_PRICE': 'float', 
                    'N_PANELS': 'int', 'CUSOMER_TYPE': 'str' }

#Zipcosdes
ZIPCODE_TYPES = {'ZIPCODE':'str','ZC_LATITUDE':'float64',
                    'ZC_LONGITUDE':'float64','AUTONOMOUS_COMMUNITY':'str',
                    'AUTONOMOUS_COMMUNITY_NK':'str','PROVINCE':'str'}

#Meteo
METEO_TYPES = {'temperature': 'float', 'relative_humidity': 'float', 
            'precipitation_rate': 'float', 'wind_speed': 'float', 
            'zipcode': 'str' 
}





#----------FUNCTIONS FOR THE BOIS-----------

#CREATE 3 COOL DATAFRAMES FUNCTION
#Creates 3 super cool dataframes from the CSVs with the data types set from the start.
def dataFrameCreate():

    #SALES FUNNEL DATAFRAME

    #Dictionary with data types
   

    #Reading CSV to create dataframe with datatypes implemented from dictionary and additional date time datatypes.
    sales_phases_funnel_df = pd.read_csv(
        FILENAME_sales_phases_funnel_df, 
        delimiter=';', 
        dtype=SALES_TYPES,
        parse_dates=['OFFER_SENT_DATE', 'CONTRACT_1_DISPATCH_DATE', 
                    'CONTRACT_2_DISPATCH_DATE', 
                    'CONTRACT_1_SIGNATURE_DATE', 
                    'CONTRACT_2_SIGNATURE_DATE',
                    'VISIT_DATE',
                    'TECHNICAL_REVIEW_DATE',
                    'PROJECT_VALIDATION_DATE',
                    'SALE_DISMISSAL_DATE',
                    'KO_DATE'],
                    
        dayfirst=True  # This replaces the dayfirst=True in your to_datetime call
    )

    logger.info('sales_phases_funnel_df created')



    #ZIPCODE DATAFRAME

    # Reading CSV to create dataframe with datatypes implemented from dictionary
    zipcode_df = pd.read_csv(FILENAME_zipcode_df, delimiter=',', dtype=ZIPCODE_TYPES)


    logger.info('zipcodedf created')




    #METEO DATAFRAME

    # Reading CSV to create dataframe with datatypes implemented from dictionary and 
    # additional date time datatype formatted to match the ones from the sales dataframe.
    meteo_df = pd.read_csv(FILENAME_meteo_df, delimiter=';',
        dtype=METEO_TYPES, parse_dates=['date'],  # Replace with actual column name
        date_format='%Y/%m/%d %H:%M:%S.%f'  # This matches your input format
    )

    logger.info('meteo_df created')
    list_of_dfs = [sales_phases_funnel_df, zipcode_df, meteo_df]

    return list_of_dfs
    



list_of_dfs = dataFrameCreate()




#--GLOBAL CLEANING FUNCTION--


#DROPPING DUPLICATES FOR ALL DATAFRAMES

#creating the drop duplicate function
def dropDupli(dfs):
    #log
    logger.info(f'There are {dfs[0].duplicated().sum()} duplicate rows in sales_funnel_df before duplicate cleaning') 
    logger.info(f'There are {dfs[1].duplicated().sum()} duplicate rows in zipcode_df before duplicate cleaning')
    logger.info(f'There are {dfs[2].duplicated().sum()} duplicate rows in meteo_df before duplicate cleaning')  
    #DroppingDupli
    dfs[0].drop_duplicates(inplace=True)
    dfs[1].drop_duplicates(inplace=True)
    dfs[2].drop_duplicates(inplace=True)
    # Log after
    logger.info(f'There are {dfs[0].duplicated().sum()} duplicate rows in sales_funnel_df after duplicate cleaning') 
    logger.info(f'There are {dfs[1].duplicated().sum()} duplicate rows in zipcode_df after duplicate cleaning')
    logger.info(f'There are {dfs[2].duplicated().sum()} duplicate rows in meteo_df after duplicate cleaning')

    return dfs  # Return dfs instead of undefined variables



list_of_dfs = dropDupli(list_of_dfs)



# --SALES FUNNEL DATAFRAME CLEANING FUCTIONS--


#DELETE UNUSABLE LEADS FUNCTION 

# Drop rows where KO_REASON is "Unreachable"
def delete_unreachable_leads(dfs):
    dfs[0] = dfs[0][~((dfs[0]['CURRENT_PHASE'] == 'KO') & (dfs[0]['KO_REASON'] == 'Unreachable'))]
    # Reset the index of the updated DataFrame
    dfs[0].reset_index(drop=True, inplace=True)
    logger.info('Unreachable leads deleted')
    return dfs

list_of_dfs = delete_unreachable_leads(list_of_dfs)




# REMOVE OUTLIERS FUNCTION


def delete_outliers(dfs):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = dfs[0]['INSTALLATION_PRICE'].quantile(0.25)
    Q3 = dfs[0]['INSTALLATION_PRICE'].quantile(0.75)
    

    # Calculate the Interquartile Range (IQR)
    IQR = Q3 - Q1

    # Identify outliers inside a new Data Frame
    outliers_df = dfs[0][(dfs[0]['INSTALLATION_PRICE'] < (Q1 - 1.5 * IQR)) | 
                                     (dfs[0]['INSTALLATION_PRICE'] > (Q3 + 1.5 * IQR))]

    # Print the number of outliers
    logger.info('outliers_df dataframe created')
    logger.info(f'Number of outliers: {len(outliers_df)}')
    
    # Update sales_phases_funnel_df to exclude the outliers
    sales_phases_funnel_df = dfs[0][~((dfs[0]['INSTALLATION_PRICE'] < (Q1 - 1.5 * IQR)) | 
                                                  (dfs[0]['INSTALLATION_PRICE'] > (Q3 + 1.5 * IQR)))]
    logger.info('outliers removed from sales_phases_funnel_df')
    dfs.append(outliers_df)
    logger.info(f'outliers_df dataframe added to list_of_dfs')
    return dfs
    


list_of_dfs = delete_outliers(list_of_dfs)


#This is how you would use it in th final etl:
'''
from tools.cleaning import dataFrameCreate, dropDupli, delete_unreachable_leads, delete_outliers



list_of_dfs = dataFrameCreate()
list_of_dfs = dropDupli(list_of_dfs)
list_of_dfs = delete_unreachable_leads(list_of_dfs) #this one we may exclude
list_of_dfs = delete_outliers(list_of_dfs)

print(list_of_dfs)
'''

2025-03-08 06:57:14,429 - __main__ - INFO - sales_phases_funnel_df created
2025-03-08 06:57:14,438 - __main__ - INFO - zipcodedf created
2025-03-08 06:57:16,225 - __main__ - INFO - meteo_df created
2025-03-08 06:57:16,273 - __main__ - INFO - There are 0 duplicate rows in sales_funnel_df before duplicate cleaning
2025-03-08 06:57:16,276 - __main__ - INFO - There are 0 duplicate rows in zipcode_df before duplicate cleaning
2025-03-08 06:57:17,232 - __main__ - INFO - There are 0 duplicate rows in meteo_df before duplicate cleaning
2025-03-08 06:57:18,247 - __main__ - INFO - There are 0 duplicate rows in sales_funnel_df after duplicate cleaning
2025-03-08 06:57:18,251 - __main__ - INFO - There are 0 duplicate rows in zipcode_df after duplicate cleaning
2025-03-08 06:57:19,211 - __main__ - INFO - There are 0 duplicate rows in meteo_df after duplicate cleaning
2025-03-08 06:57:19,220 - __main__ - INFO - Unreachable leads deleted
2025-03-08 06:57:19,222 - __main__ - INFO - outliers_df datafra

'\nfrom tools.cleaning import dataFrameCreate, dropDupli, delete_unreachable_leads, delete_outliers\n\n\n\nlist_of_dfs = dataFrameCreate()\nlist_of_dfs = dropDupli(list_of_dfs)\nlist_of_dfs = delete_unreachable_leads(list_of_dfs) #this one we may exclude\nlist_of_dfs = delete_outliers(list_of_dfs)\n\nprint(list_of_dfs)\n'

In [42]:
# Final Column Names
FINAL_NAMES_WEATHER = {'date':'year','temperature': 'avg_temperature', 'relative_humidity': 'avg_relative_humidity',
                                               'precipitation_rate':'avg_precipitation_rate','wind_speed':'avg_wind_speed'}

# Final Columns
FINAL_COLS_SALES = ['sales_id','zipcode_id','lead_id','financing_type','current_phase','phase_pre_ko',
              'is_modified','offer_sent_date','contract_1_dispatch_date','contract_2_dispatch_date','contract_1_signature_date',
              'contract_2_signature_date','most_recent_contract_signature','visit_date','technical_review_date',
              'project_validation_date','sale_dismissal_date','ko_date','visiting_company','ko_reason',
              'installation_peak_power_kw','installation_price','n_panels','cusomer_type']
FINAL_COLS_WEATHER=['weather_id','zipcode_id','year','avg_temperature','avg_relative_humidity','avg_precipitation_rate',
                    'avg_wind_speed']

In [43]:
@log_wrap
def transform_data(data: list, logger) -> list:
    '''
    Takes a list of dfs as arguments of size 3 and returns a list of transformed dataframes
    Order:
    [0] = Sales
    [1] = Zipcode
    [2] = Weather
    '''
    try:
        logger.info('Reading Dataframes...')
        sales_fact_df_raw = data[0]
        logger.info(f'Sales Data has {len(sales_fact_df_raw)} records.')
        zipcode_dim_df_raw = data[1]
        logger.info(f'Zipcode Data has {len(zipcode_dim_df_raw)} records.')
        weather_dim_df_raw = data[2]
        logger.info(f'Weather Data has {len(weather_dim_df_raw)} records.')
        
        logger.info(f'Processing Transformations...')
        sales_fact_df_raw.columns = sales_fact_df_raw.columns.str.lower()
        zipcode_dim_df_raw.columns = zipcode_dim_df_raw.columns.str.lower()
        weather_dim_df_raw.columns = weather_dim_df_raw.columns.str.lower()
        
        logger.info(f'Creating a PK in zipcode_dim_df_raw...')
        zipcode_dim_df_raw.insert(0,'zipcode_id',range(1, len(zipcode_dim_df_raw) + 1))
        zipcode_dim_df_raw['zipcode_id'] = zipcode_dim_df_raw['zipcode_id'].astype('int32')
        zipcode_dim_df = zipcode_dim_df_raw
        
        logger.info(f'Grouping weather_dim_df_raw...')
        weather_dim_df_raw['date'] = weather_dim_df_raw['date'].dt.year
        weather_dim_df_raw = weather_dim_df_raw.groupby(['date','zipcode']).mean().reset_index()
        
        logger.info(f'Adding FK zipcode_id in weather table...')
        weather_dim_df = pd.merge(weather_dim_df_raw,zipcode_dim_df_raw,on= 'zipcode', how='left')
        
        logger.info(f'Dropping null zipcode_id from weather table...')
        weather_dim_df = weather_dim_df.dropna()
        
        logger.info(f'Creating a PK in weather_dim_df_raw...')
        weather_dim_df.insert(0,'weather_id',range(1, len(weather_dim_df) + 1))
        weather_dim_df['weather_id'] = weather_dim_df['weather_id'].astype('int32')
        
        logger.info(f'Creating a PK in sales_fact_df_raw...')
        sales_fact_df_raw.insert(0,'sales_id',range(1, len(sales_fact_df_raw) + 1))
        sales_fact_df_raw['sales_id'] = sales_fact_df_raw['sales_id'].astype('int32')
        
        logger.info(f'Adding calculated column most_recent_contract_signature to sales_fact_df...')
        sales_fact_df_raw.insert(16,'most_recent_contract_signature', \
            sales_fact_df_raw[['contract_1_signature_date', 'contract_2_signature_date']].max(axis=1))

        logger.info(f'Adding FK zipcode_id in sales table...')
        sales_fact_df = pd.merge(sales_fact_df_raw, zipcode_dim_df_raw, on='zipcode', how='left')
        
        logger.info(f'Handling column types, names and selection...')
        weather_dim_df = weather_dim_df.rename(columns=FINAL_NAMES_WEATHER)
        weather_dim_df = weather_dim_df[FINAL_COLS_WEATHER]
        sales_fact_df = sales_fact_df[FINAL_COLS_SALES]        
        
        logger.info(f'Packing data for loading...')
        list_of_transformed_dfs = [zipcode_dim_df,weather_dim_df, sales_fact_df]
        
        return list_of_transformed_dfs
    
    except Exception as e:
        logger.error(f'Transformation error: {e}', exc_info=True)
        raise


In [44]:
transform_data(list_of_dfs)

2025-03-08 06:57:19,248 - transform_data - INFO - Executing function: transform_data
2025-03-08 06:57:19,248 - transform_data - INFO - Reading Dataframes...
2025-03-08 06:57:19,249 - transform_data - INFO - Sales Data has 28864 records.
2025-03-08 06:57:19,250 - transform_data - INFO - Zipcode Data has 11407 records.
2025-03-08 06:57:19,251 - transform_data - INFO - Weather Data has 4114206 records.
2025-03-08 06:57:19,251 - transform_data - INFO - Processing Transformations...
2025-03-08 06:57:19,253 - transform_data - INFO - Creating a PK in zipcode_dim_df_raw...
2025-03-08 06:57:19,255 - transform_data - INFO - Grouping weather_dim_df_raw...
2025-03-08 06:57:19,513 - transform_data - INFO - Adding FK zipcode_id in weather table...
2025-03-08 06:57:19,519 - transform_data - INFO - Dropping null zipcode_id from weather table...
2025-03-08 06:57:19,521 - transform_data - INFO - Creating a PK in weather_dim_df_raw...
2025-03-08 06:57:19,521 - transform_data - INFO - Creating a PK in sal

[       zipcode_id zipcode  zc_latitude  zc_longitude autonomous_community  \
 0               1   01001      42.8500       -2.6667           Pais Vasco   
 1               2   07119      39.6833        2.7000             Baleares   
 2               3   07110      39.6833        2.7000             Baleares   
 3               4   07109      39.7833        2.7333             Baleares   
 4               5   07108      39.8000        2.6833             Baleares   
 ...           ...     ...          ...           ...                  ...   
 11402       11403   49543      41.6667       -6.0333      Castilla - Leon   
 11403       11404   49542      41.7167       -6.1500      Castilla - Leon   
 11404       11405   49541      41.6833       -6.0833      Castilla - Leon   
 11405       11406   49540      41.7500       -5.9833      Castilla - Leon   
 11406       11407   49539      41.6167       -5.5333      Castilla - Leon   
 
       autonomous_community_nk       province  
 0            

In [45]:
# Importing standard libraries
import os

# Importing third party libraries
import yaml
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine

In [46]:
FILENAME = os.path.join(os.getcwd(), 'creds.yaml')

In [47]:
with open(FILENAME, "r") as file:
    creds = yaml.safe_load(file)

In [48]:
QUERY_CREATE_ZIPCODE_DIM_TABLE ='''
CREATE TABLE IF NOT EXISTS zipcode_dim (
  zipcode_id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
  zipcode varchar(50) NOT NULL,
  zc_latitude float NOT NULL,
  zc_longitude float NOT NULL,
  autonomous_community varchar(50) NOT NULL,
  autonomous_community_nk varchar(50) NOT NULL,
  province varchar(50) NOT NULL
);
'''

In [49]:
QUERY_CREATE_WEATHER_DIM_TABLE = '''
CREATE TABLE IF NOT EXISTS weather_dim (
  weather_id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
  zipcode_id int NOT NULL,
  year int NOT NULL,
  avg_temperature float NOT NULL,
  avg_relative_humidity float NOT NULL,
  avg_precipitation_rate float NOT NULL,
  avg_wind_speed float NOT NULL,
  Foreign Key (zipcode_id) references zipcode_dim(zipcode_id)
);
'''

In [50]:
QUERY_CREATE_SALES_FT_TABLE = '''
CREATE TABLE IF NOT EXISTS sales_fact (
  sales_id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
  zipcode_id int NOT NULL,
  lead_id varchar(50) NOT NULL,
  financing_type varchar(50) NOT NULL,
  current_phase varchar(50) NOT NULL,
  phase_pre_ko varchar(50) NOT NULL,
  is_modified int,
  offer_sent_date date NOT NULL,
  contract_1_dispatch_date date NOT NULL,
  contract_2_dispatch_date date NOT NULL,
  contract_1_signature_date date NOT NULL,
  contract_2_signature_date date NOT NULL,
  most_recent_contract_signature_date date NOT NULL,
  visit_date date NOT NULL,
  technical_review_date date NOT NULL,
  project_validation_date date NOT NULL,
  sale_dismissal_date date NOT NULL,
  ko_date date NOT NULL,
  visiting_company varchar(50) NOT NULL,
  ko_reason varchar(50) NOT NULL,
  installation_peak_power_kwf float NOT NULL,
  installation_price float NOT NULL,
  n_panels smallint,
  customer_type varchar(50) NOT NULL,
  Foreign Key (zipcode_id) references zipcode_dim(zipcode_id)
);
'''						

In [51]:
def create_table():
    connection = mysql.connector.connect(
        user = creds['mysql-db']['username'],
        password = creds['mysql-db']['password'],
        host = creds['mysql-db']['host'],
        database = creds['mysql-db']['database'],
    )
    cursor = connection.cursor()
    
    cursor.execute(QUERY_CREATE_ZIPCODE_DIM_TABLE)
    cursor.execute(QUERY_CREATE_WEATHER_DIM_TABLE)
    cursor.execute(QUERY_CREATE_SALES_FT_TABLE)
    connection.commit()
    print("Table structures created successfully.")
    
    cursor.close()
    connection.close()

In [52]:
dfs_dict = {
        "zipcode_dim": list_of_transformed_dfs[0],
        "weather_dim": list_of_transformed_dfs[1],
        "sales_fact": list_of_transformed_dfs[2]
    }

NameError: name 'list_of_transformed_dfs' is not defined

In [37]:
def write_to_database(dfs_dict, if_exists='append'):
    """
    Write a dataframe into a MySql table.

    Args:
        dfs_dict: The list of tables to load to along with the dfs to insert
        if_exists (str): Default 'append'
    """

    _db_user = creds['username']
    _db_password = creds['password']
    _db_host = creds['host']
    _db_name = creds['database']
    engine = create_engine(f"mysql+pymysql://{_db_user}:{_db_password}@{_db_host}:3306/{_db_name}")
    with engine.connect() as connection:
        for table_name, df in dfs_dict.items():
            if isinstance(df, pd.DataFrame): 
                df.to_sql(table_name, con=connection, if_exists=if_exists, index=False)
                print(f"Data successfully inserted into {table_name}")
            else:
                print(f"Skipping {table_name}: Not a valid DataFrame")
    
    # return logger.info("Completed uploading all data..")

In [38]:
if __name__ == "__main__":
    create_table()
    write_to_database(dfs_dict)

Table structures created successfully.


NameError: name 'dfs_dict' is not defined

In [54]:
#-----------FOR THE BOIS----------------------

#--------LIBRARIES FOR THE BOIS (THE ONES I USED, THE USUAL ONES)-----------

#COOL LIBRARIES
import pandas as pd
import numpy as np
import os
from tools.logs import log_wrap
import logging



# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Create logger instance
logger = logging.getLogger(__name__)




# SET PATHS OF 3 COOL CSVs
FILENAME_sales_phases_funnel_df = os.path.join(os.getcwd(), r'data/sale_phases_funnel.csv')
FILENAME_zipcode_df = os.path.join(os.getcwd(), r'data/zipcode_eae.csv')
FILENAME_meteo_df = os.path.join(os.getcwd(), r'data/meteo_eae.csv')


#SETTING TYPES
#Sales
SALES_TYPES = {'LEAD_ID':'str','FINANCING_TYPE':'str',
                    'CURRENT_PHASE':'str','PHASE_PRE_KO':'str',
                    'IS_MODIFIED':'bool','ZIPCODE':'str', 
                    'VISITING_COMPANY': 'str', 'KO_REASON': 'str', 
                    'INSTALLATION_PEAK_POWER_KW': 'float64', 
                    'INSTALLATION_PRICE': 'float', 
                    'N_PANELS': 'int', 'CUSOMER_TYPE': 'str' }

#Zipcosdes
ZIPCODE_TYPES = {'ZIPCODE':'str','ZC_LATITUDE':'float64',
                    'ZC_LONGITUDE':'float64','AUTONOMOUS_COMMUNITY':'str',
                    'AUTONOMOUS_COMMUNITY_NK':'str','PROVINCE':'str'}

#Meteo
METEO_TYPES = {'temperature': 'float', 'relative_humidity': 'float', 
            'precipitation_rate': 'float', 'wind_speed': 'float', 
            'zipcode': 'str' 
}





#----------FUNCTIONS FOR THE BOIS-----------

#CREATE 3 COOL DATAFRAMES FUNCTION
#Creates 3 super cool dataframes from the CSVs with the data types set from the start.
def dataFrameCreate():

    #SALES FUNNEL DATAFRAME

    #Dictionary with data types
   

    #Reading CSV to create dataframe with datatypes implemented from dictionary and additional date time datatypes.
    sales_phases_funnel_df = pd.read_csv(
        FILENAME_sales_phases_funnel_df, 
        delimiter=';', 
        dtype=SALES_TYPES,
        parse_dates=['OFFER_SENT_DATE', 'CONTRACT_1_DISPATCH_DATE', 
                    'CONTRACT_2_DISPATCH_DATE', 
                    'CONTRACT_1_SIGNATURE_DATE', 
                    'CONTRACT_2_SIGNATURE_DATE',
                    'VISIT_DATE',
                    'TECHNICAL_REVIEW_DATE',
                    'PROJECT_VALIDATION_DATE',
                    'SALE_DISMISSAL_DATE',
                    'KO_DATE'],
                    
        dayfirst=True  # This replaces the dayfirst=True in your to_datetime call
    )

    logger.info('sales_phases_funnel_df created')



    #ZIPCODE DATAFRAME

    # Reading CSV to create dataframe with datatypes implemented from dictionary
    zipcode_df = pd.read_csv(FILENAME_zipcode_df, delimiter=',', dtype=ZIPCODE_TYPES)


    logger.info('zipcodedf created')




    #METEO DATAFRAME

    # Reading CSV to create dataframe with datatypes implemented from dictionary and 
    # additional date time datatype formatted to match the ones from the sales dataframe.
    meteo_df = pd.read_csv(FILENAME_meteo_df, delimiter=';',
        dtype=METEO_TYPES, parse_dates=['date'],  # Replace with actual column name
        date_format='%Y/%m/%d %H:%M:%S.%f'  # This matches your input format
    )

    logger.info('meteo_df created')
    list_of_dfs = [sales_phases_funnel_df, zipcode_df, meteo_df]

    return list_of_dfs
    



list_of_dfs = dataFrameCreate()




#--GLOBAL CLEANING FUNCTION--


#DROPPING DUPLICATES FOR ALL DATAFRAMES

#creating the drop duplicate function
def dropDupli(dfs):
    #log
    logger.info(f'There are {dfs[0].duplicated().sum()} duplicate rows in sales_funnel_df before duplicate cleaning') 
    logger.info(f'There are {dfs[1].duplicated().sum()} duplicate rows in zipcode_df before duplicate cleaning')
    logger.info(f'There are {dfs[2].duplicated().sum()} duplicate rows in meteo_df before duplicate cleaning')  
    #DroppingDupli
    dfs[0].drop_duplicates(inplace=True)
    dfs[1].drop_duplicates(inplace=True)
    dfs[2].drop_duplicates(inplace=True)
    # Log after
    logger.info(f'There are {dfs[0].duplicated().sum()} duplicate rows in sales_funnel_df after duplicate cleaning') 
    logger.info(f'There are {dfs[1].duplicated().sum()} duplicate rows in zipcode_df after duplicate cleaning')
    logger.info(f'There are {dfs[2].duplicated().sum()} duplicate rows in meteo_df after duplicate cleaning')

    return dfs  # Return dfs instead of undefined variables



list_of_dfs = dropDupli(list_of_dfs)



# --SALES FUNNEL DATAFRAME CLEANING FUCTIONS--


#DELETE UNUSABLE LEADS FUNCTION 

# Drop rows where KO_REASON is "Unreachable"
def delete_unreachable_leads(dfs):
    dfs[0] = dfs[0][~((dfs[0]['CURRENT_PHASE'] == 'KO') & (dfs[0]['KO_REASON'] == 'Unreachable'))]
    # Reset the index of the updated DataFrame
    dfs[0].reset_index(drop=True, inplace=True)
    logger.info('Unreachable leads deleted')
    return dfs

list_of_dfs = delete_unreachable_leads(list_of_dfs)




# REMOVE OUTLIERS FUNCTION


def delete_outliers(dfs):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = dfs[0]['INSTALLATION_PRICE'].quantile(0.25)
    Q3 = dfs[0]['INSTALLATION_PRICE'].quantile(0.75)
    

    # Calculate the Interquartile Range (IQR)
    IQR = Q3 - Q1

    # Identify outliers inside a new Data Frame
    outliers_df = dfs[0][(dfs[0]['INSTALLATION_PRICE'] < (Q1 - 1.5 * IQR)) | 
                                     (dfs[0]['INSTALLATION_PRICE'] > (Q3 + 1.5 * IQR))]

    # Print the number of outliers
    logger.info('outliers_df dataframe created')
    logger.info(f'Number of outliers: {len(outliers_df)}')
    
    # Update sales_phases_funnel_df to exclude the outliers
    sales_phases_funnel_df = dfs[0][~((dfs[0]['INSTALLATION_PRICE'] < (Q1 - 1.5 * IQR)) | 
                                                  (dfs[0]['INSTALLATION_PRICE'] > (Q3 + 1.5 * IQR)))]
    logger.info('outliers removed from sales_phases_funnel_df')
    dfs.append(outliers_df)
    logger.info(f'outliers_df dataframe added to list_of_dfs')
    return dfs
    


list_of_dfs = delete_outliers(list_of_dfs)


#This is how you would use it in th final etl:
'''
from tools.cleaning import dataFrameCreate, dropDupli, delete_unreachable_leads, delete_outliers



list_of_dfs = dataFrameCreate()
list_of_dfs = dropDupli(list_of_dfs)
list_of_dfs = delete_unreachable_leads(list_of_dfs) #this one we may exclude
list_of_dfs = delete_outliers(list_of_dfs)

print(list_of_dfs)
'''




@log_wrap
def transform_data(data: list, logger) -> list:
    '''
    Takes a list of dfs as arguments of size 3 and returns a list of transformed dataframes
    Order:
    [0] = Sales
    [1] = Zipcode
    [2] = Weather
    '''
    try:
        logger.info('Reading Dataframes...')
        sales_fact_df_raw = data[0]
        logger.info(f'Sales Data has {len(sales_fact_df_raw)} records.')
        zipcode_dim_df_raw = data[1]
        logger.info(f'Zipcode Data has {len(zipcode_dim_df_raw)} records.')
        weather_dim_df_raw = data[2]
        logger.info(f'Weather Data has {len(weather_dim_df_raw)} records.')
        
        logger.info(f'Processing Transformations...')
        sales_fact_df_raw.columns = sales_fact_df_raw.columns.str.lower()
        zipcode_dim_df_raw.columns = zipcode_dim_df_raw.columns.str.lower()
        weather_dim_df_raw.columns = weather_dim_df_raw.columns.str.lower()
        
        logger.info(f'Creating a PK in zipcode_dim_df_raw...')
        zipcode_dim_df_raw.insert(0,'zipcode_id',range(1, len(zipcode_dim_df_raw) + 1))
        zipcode_dim_df_raw['zipcode_id'] = zipcode_dim_df_raw['zipcode_id'].astype('int32')
        zipcode_dim_df = zipcode_dim_df_raw
        
        logger.info(f'Grouping weather_dim_df_raw...')
        weather_dim_df_raw['date'] = weather_dim_df_raw['date'].dt.year
        weather_dim_df_raw = weather_dim_df_raw.groupby(['date','zipcode']).mean().reset_index()
        
        logger.info(f'Adding FK zipcode_id in weather table...')
        weather_dim_df = pd.merge(weather_dim_df_raw,zipcode_dim_df_raw,on= 'zipcode', how='left')
        
        logger.info(f'Dropping null zipcode_id from weather table...')
        weather_dim_df = weather_dim_df.dropna()
        
        logger.info(f'Creating a PK in weather_dim_df_raw...')
        weather_dim_df.insert(0,'weather_id',range(1, len(weather_dim_df) + 1))
        weather_dim_df['weather_id'] = weather_dim_df['weather_id'].astype('int32')
        
        logger.info(f'Creating a PK in sales_fact_df_raw...')
        sales_fact_df_raw.insert(0,'sales_id',range(1, len(sales_fact_df_raw) + 1))
        sales_fact_df_raw['sales_id'] = sales_fact_df_raw['sales_id'].astype('int32')
        
        logger.info(f'Adding calculated column most_recent_contract_signature to sales_fact_df...')
        sales_fact_df_raw.insert(16,'most_recent_contract_signature', \
            sales_fact_df_raw[['contract_1_signature_date', 'contract_2_signature_date']].max(axis=1))

        logger.info(f'Adding FK zipcode_id in sales table...')
        sales_fact_df = pd.merge(sales_fact_df_raw, zipcode_dim_df_raw, on='zipcode', how='left')
        
        logger.info(f'Handling column types, names and selection...')
        weather_dim_df = weather_dim_df.rename(columns=FINAL_NAMES_WEATHER)
        weather_dim_df = weather_dim_df[FINAL_COLS_WEATHER]
        sales_fact_df = sales_fact_df[FINAL_COLS_SALES]        
        
        logger.info(f'Packing data for loading...')
        list_of_transformed_dfs = [zipcode_dim_df,weather_dim_df, sales_fact_df]
        
        return list_of_transformed_dfs
    
    except Exception as e:
        logger.error(f'Transformation error: {e}', exc_info=True)
        raise



def create_table():
    connection = mysql.connector.connect(
        user = creds['mysql-db']['username'],
        password = creds['mysql-db']['password'],
        host = creds['mysql-db']['host'],
        database = creds['mysql-db']['database'],
    )
    cursor = connection.cursor()
    
    cursor.execute(QUERY_CREATE_ZIPCODE_DIM_TABLE)
    cursor.execute(QUERY_CREATE_WEATHER_DIM_TABLE)
    cursor.execute(QUERY_CREATE_SALES_FT_TABLE)
    connection.commit()
    print("Table structures created successfully.")
    
    cursor.close()
    connection.close()


dfs_dict = {
        "zipcode_dim": list_of_transformed_dfs[0],
        "weather_dim": list_of_transformed_dfs[1],
        "sales_fact": list_of_transformed_dfs[2]
    }


def write_to_database(dfs_dict, if_exists='append'):
    """
    Write a dataframe into a MySql table.

    Args:
        dfs_dict: The list of tables to load to along with the dfs to insert
        if_exists (str): Default 'append'
    """

    _db_user = creds['username']
    _db_password = creds['password']
    _db_host = creds['host']
    _db_name = creds['database']
    engine = create_engine(f"mysql+pymysql://{_db_user}:{_db_password}@{_db_host}:3306/{_db_name}")
    with engine.connect() as connection:
        for table_name, df in dfs_dict.items():
            if isinstance(df, pd.DataFrame): 
                df.to_sql(table_name, con=connection, if_exists=if_exists, index=False)
                print(f"Data successfully inserted into {table_name}")
            else:
                print(f"Skipping {table_name}: Not a valid DataFrame")
    
    # return logger.info("Completed uploading all data..")


create_table()
write_to_database(dfs_dict)

2025-03-08 07:00:40,878 - __main__ - INFO - sales_phases_funnel_df created
2025-03-08 07:00:40,887 - __main__ - INFO - zipcodedf created
2025-03-08 07:00:42,655 - __main__ - INFO - meteo_df created
2025-03-08 07:00:42,693 - __main__ - INFO - There are 0 duplicate rows in sales_funnel_df before duplicate cleaning
2025-03-08 07:00:42,698 - __main__ - INFO - There are 0 duplicate rows in zipcode_df before duplicate cleaning
2025-03-08 07:00:43,661 - __main__ - INFO - There are 0 duplicate rows in meteo_df before duplicate cleaning
2025-03-08 07:00:44,649 - __main__ - INFO - There are 0 duplicate rows in sales_funnel_df after duplicate cleaning
2025-03-08 07:00:44,651 - __main__ - INFO - There are 0 duplicate rows in zipcode_df after duplicate cleaning
2025-03-08 07:00:45,594 - __main__ - INFO - There are 0 duplicate rows in meteo_df after duplicate cleaning
2025-03-08 07:00:45,601 - __main__ - INFO - Unreachable leads deleted
2025-03-08 07:00:45,604 - __main__ - INFO - outliers_df datafra

NameError: name 'list_of_transformed_dfs' is not defined