In [33]:
## Insert Data Cleaning Part from Juanjo

#-----------FOR THE BOIS----------------------

#--------LIBRARIES FOR THE BOIS (THE ONES I USED, THE USUAL ONES)-----------

#COOL LIBRARIES
import pandas as pd
import numpy  as np
import os




#----------FUNCTIONS FOR THE BOIS-----------

#CREATE DATAFRAMES FUNCTION
#Creates 3 super cool dataframes from the CSVs with the data types set from the start.
def dataFrameCreate():
    global sales_phases_funnel_df, zipcode_df, meteo_df

    # SET PATHS OF 3 COOL CSVs
    FILENAME_sales_phases_funnel_df = os.path.join(os.getcwd(),'data', r'sale_phases_funnel.csv')
    FILENAME_zipcode_df = os.path.join(os.getcwd(),'data', r'zipcode_eae_v2.csv')
    FILENAME_meteo_df = os.path.join(os.getcwd(),'data', r'meteo_eae.csv')


    #SALES FUNNEL DATAFRAME

    #Dictionary with data types
    SALES_TYPES = {'LEAD_ID':'str','FINANCING_TYPE':'str',
                    'CURRENT_PHASE':'str','PHASE_PRE_KO':'str',
                    'IS_MODIFIED':'bool','ZIPCODE':'str', 
                    'VISITING_COMPANY': 'str', 'KO_REASON': 'str', 
                    'INSTALLATION_PEAK_POWER_KW': 'float64', 
                    'INSTALLATION_PRICE': 'float', 
                    'N_PANELS': 'int', 'CUSOMER_TYPE': 'str' }

    # Reading CSV to create dataframe with datatypes implemented from dictionary and additional date time datatypes.
    sales_phases_funnel_df = pd.read_csv(
        FILENAME_sales_phases_funnel_df, 
        delimiter=';', 
        dtype=SALES_TYPES,
        parse_dates=['OFFER_SENT_DATE', 'CONTRACT_1_DISPATCH_DATE', 
                    'CONTRACT_2_DISPATCH_DATE', 
                    'CONTRACT_1_SIGNATURE_DATE', 
                    'CONTRACT_2_SIGNATURE_DATE',
                    'VISIT_DATE',
                    'TECHNICAL_REVIEW_DATE',
                    'PROJECT_VALIDATION_DATE',
                    'SALE_DISMISSAL_DATE',
                    'KO_DATE'],
                    
        dayfirst=True  # This replaces the dayfirst=True in your to_datetime call
    )

    print('sales_phases_funnel_df created')



    #ZIPCODE DATAFRAME

    #Dictionary with data types
    ZIPCODE_TYPES = {'ZIPCODE':'str','ZC_LATITUDE':'float64',
                    'ZC_LONGITUDE':'float64','AUTONOMOUS_COMMUNITY':'str',
                    'AUTONOMOUS_COMMUNITY_NK':'str','PROVINCE':'str'}


    # Reading CSV to create dataframe with datatypes implemented from dictionary
    zipcode_df = pd.read_csv(FILENAME_zipcode_df, delimiter=',', dtype=ZIPCODE_TYPES)


    print('zipcodedf created')





    #METEO DATAFRAME

    #Dictionary with data types
    METEO_TYPES = {'temperature': 'float', 'relative_humidity': 'float', 
                'precipitation_rate': 'float', 'wind_speed': 'float', 
                'zipcode': 'str' 
    }

    # Reading CSV to create dataframe with datatypes implemented from dictionary and 
    # additional date time datatype formatted to match the ones from the sales dataframe.
    meteo_df = pd.read_csv(FILENAME_meteo_df, delimiter=';',
        dtype=METEO_TYPES, parse_dates=['date'],  # Replace with actual column name
        date_format='%Y/%m/%d %H:%M:%S.%f'  # This matches your input format
    )

    print('meteo_df created')
    



dataFrameCreate()




#--GLOBAL CLEANING FUNCTION--


#DROPPING DUPLICATES FOR ALL DATAFRAMES

#creating the drop duplicate function
def dropDupli():
    global sales_phases_funnel_df, zipcode_df, meteo_df
    print('There are ', sales_phases_funnel_df.duplicated().sum(), ' duplicate rows in sales_funnel_df before duplicate cleaning') 
    print('There are ', zipcode_df.duplicated().sum(), ' duplicate rows in zipcode_df before duplicate cleaning')
    print('There are ', meteo_df.duplicated().sum(), ' duplicate rows in meteo_df before duplicate cleaning') 
    sales_phases_funnel_df.drop_duplicates(inplace=True)
    zipcode_df.drop_duplicates(inplace=True)
    meteo_df.drop_duplicates(inplace=True)
    print('There are ', sales_phases_funnel_df.duplicated().sum(), ' duplicate rows in sales_funnel_df after duplicate cleaning') 
    print('There are ', zipcode_df.duplicated().sum(), ' duplicate rows in zipcode_df after duplicate cleaning')
    print('There are ', meteo_df.duplicated().sum(), ' duplicate rows in meteo_df after duplicate cleaning')

dropDupli()





# --SALES FUNNEL DATAFRAME CLEANING FUCTIONS--


#DELETE UNUSABLE LEADS FUNCTION 

# Drop rows where KO_REASON is "Unreachable"
def delete_unreachable_leads():
    global sales_phases_funnel_df
    sales_phases_funnel_df = sales_phases_funnel_df[~((sales_phases_funnel_df['CURRENT_PHASE'] == 'KO') & (sales_phases_funnel_df['KO_REASON'] == 'Unreachable'))]
    # Reset the index of the updated DataFrame
    sales_phases_funnel_df.reset_index(drop=True, inplace=True)

# Verify the changes
#print(sales_phases_funnel_df.isnull().sum())
#print(sales_phases_funnel_df.head())

delete_unreachable_leads()




# REMOVE OUTLIERS FUNCTION


def delete_outliers():
    global sales_phases_funnel_df, outliers_df
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = sales_phases_funnel_df['INSTALLATION_PRICE'].quantile(0.25)
    Q3 = sales_phases_funnel_df['INSTALLATION_PRICE'].quantile(0.75)

    # Calculate the Interquartile Range (IQR)
    IQR = Q3 - Q1

    # Identify outliers inside a new Data Frame
    outliers_df = sales_phases_funnel_df[(sales_phases_funnel_df['INSTALLATION_PRICE'] < (Q1 - 1.5 * IQR)) | 
                                     (sales_phases_funnel_df['INSTALLATION_PRICE'] > (Q3 + 1.5 * IQR))]

    # Print the number of outliers
    print('outliers_df created')
    print(f'Number of outliers: {len(outliers_df)}')
    
    # Update sales_phases_funnel_df to exclude the outliers
    sales_phases_funnel_df = sales_phases_funnel_df[~((sales_phases_funnel_df['INSTALLATION_PRICE'] < (Q1 - 1.5 * IQR)) | 
                                                  (sales_phases_funnel_df['INSTALLATION_PRICE'] > (Q3 + 1.5 * IQR)))]
    print('outliers removed from sales_phases_funnel_df')
    


delete_outliers()






#-------------------------DATAFRAMES FOR THE BOIS-------


#This is the final list of dataframes
list_of_dfs = [sales_phases_funnel_df, zipcode_df, meteo_df, outliers_df]

sales_phases_funnel_df created
zipcodedf created
meteo_df created
There are  0  duplicate rows in sales_funnel_df before duplicate cleaning
There are  0  duplicate rows in zipcode_df before duplicate cleaning
There are  0  duplicate rows in meteo_df before duplicate cleaning
There are  0  duplicate rows in sales_funnel_df after duplicate cleaning
There are  0  duplicate rows in zipcode_df after duplicate cleaning
There are  0  duplicate rows in meteo_df after duplicate cleaning
outliers_df created
Number of outliers: 1096
outliers removed from sales_phases_funnel_df


Max Code

In [56]:
## -------Transformations---------- ##

# Rename Data Frames

sales_fact_df=list_of_dfs[0].copy()
zipcode_dim_df=list_of_dfs[1].copy()
weather_dim_df=list_of_dfs[2].copy()

# All column names in lower case letters

sales_fact_df.columns = sales_fact_df.columns.str.lower()
zipcode_dim_df.columns = zipcode_dim_df.columns.str.lower()
weather_dim_df.columns = weather_dim_df.columns.str.lower()



In [57]:
# Create zipcode_id in zipcode_dim_df

zipcode_dim_df.insert(0,"zipcode_id",range(1, len(zipcode_dim_df) + 1))
zipcode_dim_df["zipcode_id"] = zipcode_dim_df["zipcode_id"].astype("int32")


In [58]:
weather_dim_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4114206 entries, 0 to 4114205
Data columns (total 6 columns):
 #   Column              Dtype         
---  ------              -----         
 0   date                datetime64[ns]
 1   temperature         float64       
 2   relative_humidity   float64       
 3   precipitation_rate  float64       
 4   wind_speed          float64       
 5   zipcode             object        
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 188.3+ MB


In [59]:
weather_dim_df['date'] = weather_dim_df['date'].dt.year


In [60]:
weather_dim_df = weather_dim_df.groupby(['date','zipcode']).mean().reset_index()

In [61]:
weather_dim_df

Unnamed: 0,date,zipcode,temperature,relative_humidity,precipitation_rate,wind_speed
0,2024,00005,16.174498,67.347143,0.000022,2.008925
1,2024,00040,11.475667,65.450714,0.000020,3.008473
2,2024,00041,19.238992,59.822617,0.000018,3.540069
3,2024,00042,11.367275,67.177426,0.000019,3.396050
4,2024,00043,16.512345,61.318704,0.000015,4.992865
...,...,...,...,...,...,...
11236,2024,80154,18.515036,61.136868,0.000014,3.134549
11237,2024,80338,17.454745,66.066340,0.000027,2.781973
11238,2024,90007,16.341548,59.216941,0.000015,5.131758
11239,2024,91319,13.748843,80.145519,0.000035,2.436620


In [62]:
# Merge zipcode_dim and weather_dim

weather_dim_df = pd.merge(weather_dim_df,zipcode_dim_df,on= "zipcode", how="left")
weather_dim_df

Unnamed: 0,date,zipcode,temperature,relative_humidity,precipitation_rate,wind_speed,zipcode_id,zc_latitude,zc_longitude,autonomous_community,autonomous_community_nk,province
0,2024,00005,16.174498,67.347143,0.000022,2.008925,,,,,,
1,2024,00040,11.475667,65.450714,0.000020,3.008473,,,,,,
2,2024,00041,19.238992,59.822617,0.000018,3.540069,,,,,,
3,2024,00042,11.367275,67.177426,0.000019,3.396050,,,,,,
4,2024,00043,16.512345,61.318704,0.000015,4.992865,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
11236,2024,80154,18.515036,61.136868,0.000014,3.134549,,,,,,
11237,2024,80338,17.454745,66.066340,0.000027,2.781973,,,,,,
11238,2024,90007,16.341548,59.216941,0.000015,5.131758,,,,,,
11239,2024,91319,13.748843,80.145519,0.000035,2.436620,,,,,,


In [63]:
# removing temps on missing zipcodes

weather_dim_df = weather_dim_df.dropna()

In [64]:
weather_dim_df

Unnamed: 0,date,zipcode,temperature,relative_humidity,precipitation_rate,wind_speed,zipcode_id,zc_latitude,zc_longitude,autonomous_community,autonomous_community_nk,province
14,2024,01001,11.562535,78.391780,0.000033,2.933424,1.0,42.8500,-2.6667,Pais Vasco,PV,Álava
15,2024,01002,11.562535,78.391780,0.000033,2.933424,1000.0,42.8500,-2.6667,Pais Vasco,PV,Álava
16,2024,01003,11.562535,78.391780,0.000033,2.933424,999.0,42.8500,-2.6667,Pais Vasco,PV,Álava
17,2024,01004,11.562535,78.391780,0.000033,2.933424,998.0,42.8500,-2.6667,Pais Vasco,PV,Álava
18,2024,01005,11.562535,78.391780,0.000033,2.933424,997.0,42.8435,-2.6748,Pais Vasco,PV,Álava
...,...,...,...,...,...,...,...,...,...,...,...,...
11216,2024,50810,15.899365,60.551156,0.000017,4.743960,11024.0,41.7958,-0.8000,Aragon,AR,Zaragoza
11217,2024,50820,16.341548,59.216941,0.000015,5.131758,11023.0,41.7167,-0.8333,Aragon,AR,Zaragoza
11218,2024,50830,16.304705,59.403024,0.000015,4.889607,11022.0,41.7667,-0.8167,Aragon,AR,Zaragoza
11219,2024,50840,16.036900,60.103512,0.000016,4.762858,11021.0,41.8333,-0.7667,Aragon,AR,Zaragoza


In [65]:
# Merge zipcode_dim and sales_fact

sales_fact_df = pd.merge(sales_fact_df, zipcode_dim_df, on="zipcode", how="left")
sales_fact_df

Unnamed: 0,lead_id,financing_type,current_phase,phase_pre_ko,is_modified,offer_sent_date,contract_1_dispatch_date,contract_2_dispatch_date,contract_1_signature_date,contract_2_signature_date,...,installation_peak_power_kw,installation_price,n_panels,cusomer_type,zipcode_id,zc_latitude,zc_longitude,autonomous_community,autonomous_community_nk,province
0,C8877823,cash,Validated project,Validated project,False,2024-01-01,2024-01-03,NaT,2024-01-09,NaT,...,4.800,9562.77,12,Individual household,,,,,,
1,C2068654,cash,KO,Initial Offer,False,2024-01-01,NaT,NaT,NaT,NaT,...,1.600,4650.92,4,Individual household,6673.0,37.0111,-4.0333,Andalucia,AN,Málaga
2,C1925058,cash,KO,Commercial visit,False,2024-01-01,NaT,NaT,NaT,NaT,...,3.600,7438.55,9,Community of owners,4678.0,37.7667,-3.7833,Andalucia,AN,Jaén
3,C3357155,cash,KO,Initial Offer,False,2024-01-01,NaT,NaT,NaT,NaT,...,3.200,6580.33,8,SME,3828.0,41.9912,2.8216,Cataluna,CT,Girona
4,C5104785,cash,KO,Initial Offer,False,2024-01-01,NaT,NaT,NaT,NaT,...,2.800,5937.57,7,Individual household,10747.0,39.4333,-0.4333,Comunidad Valenciana,VC,Valencia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27763,C2986619,cash,KO,Winback,False,2024-12-30,NaT,NaT,NaT,NaT,...,1.515,5944.00,3,Individual household,9302.0,41.1200,0.9367,Cataluna,CT,Tarragona
27764,C1071478,cash,KO,Initial Offer,False,2024-12-30,NaT,NaT,NaT,NaT,...,3.535,8522.83,7,SME,,,,,,
27765,C8350262,financed,Validated project,Validated project,False,2024-12-30,2025-01-08,2025-01-10,2025-01-08,2025-01-10,...,6.060,11457.01,12,Individual household,10423.0,41.3500,-4.5833,Castilla - Leon,CL,Valladolid
27766,C5247567,cash,KO,Initial Offer,False,2024-12-30,NaT,NaT,NaT,NaT,...,2.020,6743.50,4,SME,3489.0,37.1667,-3.9667,Andalucia,AN,Granada


In [66]:
# Add weather_id in weather_dim

weather_dim_df.insert(0,"weather_id",range(1, len(weather_dim_df) + 1))

In [67]:
# Add sales_id to sales_fact

sales_fact_df.insert(0,"sales_id",range(1, len(sales_fact_df) + 1))

In [76]:
# Add calculated column to sales_fact_df
sales_fact_df.insert(16,"most_recent_contract_signature",sales_fact_df[["contract_1_signature_date", "contract_2_signature_date"]].max(axis=1))

In [77]:
sales_fact_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27768 entries, 0 to 27767
Data columns (total 30 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   sales_id                        27768 non-null  int64         
 1   lead_id                         27768 non-null  object        
 2   financing_type                  27768 non-null  object        
 3   current_phase                   27768 non-null  object        
 4   phase_pre_ko                    27768 non-null  object        
 5   is_modified                     27768 non-null  bool          
 6   offer_sent_date                 27768 non-null  datetime64[ns]
 7   contract_1_dispatch_date        9736 non-null   datetime64[ns]
 8   contract_2_dispatch_date        943 non-null    datetime64[ns]
 9   contract_1_signature_date       2081 non-null   datetime64[ns]
 10  contract_2_signature_date       878 non-null    datetime64[ns]
 11  vi

In [68]:
# Final Arrangement of columns for sales_fact_df

FINAL_COLS_SALES = ["sales_id","zipcode_id","lead_id","financing_type","current_phase","phase_pre_ko",
              "is_modified","offer_sent_date","contract_1_dispatch_date","contract_2_dispatch_date","contract_1_signature_date",
              "contract_2_signature_date","most_recent_contract_signature","visit_date","technical_review_date",
              "project_validation_date","sale_dismissal_date","ko_date","visiting_company","ko_reason",
              "installation_peak_power_kw","installation_price","n_panels","cusomer_type"]

In [69]:
# Change date to year, rename columns in and final arrangement of columns in weather_dim_df


weather_dim_df= weather_dim_df.rename(columns={"date":"year",'temperature': 'avg_temperature', 'relative_humidity': 'avg_relative_humidity',
                                               "precipitation_rate":"avg_precipitation_rate","wind_speed":"avg_wind_speed"})



FINAL_COLS_WEATHER=["weather_id","zipcode_id","year","avg_temperature","avg_relative_humidity","avg_precipitation_rate",
                    "avg_wind_speed"]



In [70]:
zipcode_dim_df

Unnamed: 0,zipcode_id,zipcode,zc_latitude,zc_longitude,autonomous_community,autonomous_community_nk,province
0,1,01001,42.8500,-2.6667,Pais Vasco,PV,Álava
1,2,07119,39.6833,2.7000,Baleares,IB,Illes Balears
2,3,07110,39.6833,2.7000,Baleares,IB,Illes Balears
3,4,07109,39.7833,2.7333,Baleares,IB,Illes Balears
4,5,07108,39.8000,2.6833,Baleares,IB,Illes Balears
...,...,...,...,...,...,...,...
11402,11403,49543,41.6667,-6.0333,Castilla - Leon,CL,Zamora
11403,11404,49542,41.7167,-6.1500,Castilla - Leon,CL,Zamora
11404,11405,49541,41.6833,-6.0833,Castilla - Leon,CL,Zamora
11405,11406,49540,41.7500,-5.9833,Castilla - Leon,CL,Zamora


In [71]:
weather_dim_df=weather_dim_df[FINAL_COLS_WEATHER]

weather_dim_df

Unnamed: 0,weather_id,zipcode_id,year,avg_temperature,avg_relative_humidity,avg_precipitation_rate,avg_wind_speed
14,1,1.0,2024,11.562535,78.391780,0.000033,2.933424
15,2,1000.0,2024,11.562535,78.391780,0.000033,2.933424
16,3,999.0,2024,11.562535,78.391780,0.000033,2.933424
17,4,998.0,2024,11.562535,78.391780,0.000033,2.933424
18,5,997.0,2024,11.562535,78.391780,0.000033,2.933424
...,...,...,...,...,...,...,...
11216,10465,11024.0,2024,15.899365,60.551156,0.000017,4.743960
11217,10466,11023.0,2024,16.341548,59.216941,0.000015,5.131758
11218,10467,11022.0,2024,16.304705,59.403024,0.000015,4.889607
11219,10468,11021.0,2024,16.036900,60.103512,0.000016,4.762858


In [78]:
sales_fact_df=sales_fact_df[FINAL_COLS_SALES]
sales_fact_df

Unnamed: 0,sales_id,zipcode_id,lead_id,financing_type,current_phase,phase_pre_ko,is_modified,offer_sent_date,contract_1_dispatch_date,contract_2_dispatch_date,...,technical_review_date,project_validation_date,sale_dismissal_date,ko_date,visiting_company,ko_reason,installation_peak_power_kw,installation_price,n_panels,cusomer_type
0,1,,C8877823,cash,Validated project,Validated project,False,2024-01-01,2024-01-03,NaT,...,2024-01-10,2024-02-22,NaT,NaT,Internal,,4.800,9562.77,12,Individual household
1,2,6673.0,C2068654,cash,KO,Initial Offer,False,2024-01-01,NaT,NaT,...,NaT,NaT,NaT,2024-01-02,Internal,Useless Contact,1.600,4650.92,4,Individual household
2,3,4678.0,C1925058,cash,KO,Commercial visit,False,2024-01-01,NaT,NaT,...,NaT,NaT,NaT,2024-01-03,Internal,Product. Vertical Building / Condominiums,3.600,7438.55,9,Community of owners
3,4,3828.0,C3357155,cash,KO,Initial Offer,False,2024-01-01,NaT,NaT,...,NaT,NaT,NaT,2024-01-04,Internal,Useless Contact,3.200,6580.33,8,SME
4,5,10747.0,C5104785,cash,KO,Initial Offer,False,2024-01-01,NaT,NaT,...,NaT,NaT,NaT,2024-01-04,Internal,Useless Contact,2.800,5937.57,7,Individual household
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27763,27764,9302.0,C2986619,cash,KO,Winback,False,2024-12-30,NaT,NaT,...,NaT,NaT,NaT,2025-03-20,Internal,Useless Contact. Not Interested,1.515,5944.00,3,Individual household
27764,27765,,C1071478,cash,KO,Initial Offer,False,2024-12-30,NaT,NaT,...,NaT,NaT,NaT,2025-01-08,Internal,Useless Contact. Not Interested,3.535,8522.83,7,SME
27765,27766,10423.0,C8350262,financed,Validated project,Validated project,False,2024-12-30,2025-01-08,2025-01-10,...,2025-01-22,2025-01-22,NaT,NaT,Internal,,6.060,11457.01,12,Individual household
27766,27767,3489.0,C5247567,cash,KO,Initial Offer,False,2024-12-30,NaT,NaT,...,NaT,NaT,NaT,2025-01-02,Internal,Useless Contact. Not Interested,2.020,6743.50,4,SME


In [None]:
# To Do:

# Group weather table by zipcode id 
# Create zipcode id in zipcode_dim
# Merge zipcode and weather 
# merge zipcode and fact
# add weather id in weather_dim done 
# add sales id to sales done
# columns

