In [None]:
# Import necessary libraries
import os
import sys

# Make the project's `src` package importable: resolve the parent of the
# current working directory to an absolute path and append it to sys.path.
# This depends on the kernel's working directory at the time the cell runs.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../'))
sys.path.append(project_root)
# Import custom functions
from src.data_quality_check import data_quality_check, summarize_quality_issues
from src.data_processing import clean_data, data_loader

In [6]:
# Load the three site datasets from the raw-data folder, in the order
# Togo, Sierra Leone, Benin.
# NOTE(review): the Togo file name carries a `_qc` suffix while the other two
# do not - confirm all three files are at a comparable processing stage.
togo_df, sierraleone_df, benin_df = map(
    data_loader,
    (
        '../data/raw/togo-dapaong_qc.csv',
        '../data/raw/sierraleone-bumbuna.csv',
        '../data/raw/benin-malanville.csv',
    ),
)

In [7]:
# Columns passed to data_quality_check for every dataset.
# Presumably irradiance, module-sensor and wind readings - confirm against
# the dataset's data dictionary.
columns_to_check = [
    'GHI',
    'DNI',
    'DHI',
    'ModA',
    'ModB',
    'WS',
    'WSgust',
]

In [8]:
# Run the same quality check, over the same column list, on each dataset.
# Results are unpacked in the order Togo, Sierra Leone, Benin.
togo_issues, sierraleone_issues, benin_issues = [
    data_quality_check(frame, columns_to_check)
    for frame in (togo_df, sierraleone_df, benin_df)
]

In [9]:
# Print the results
# Emits a header for the Togo dataset, then the per-column summary
# (missing values, negative values, outliers) built from `togo_issues`.
# NOTE(review): the large negative count for GHI is presumably night-time
# sensor readings rather than corrupt data - confirm before treating as errors.
print("TOGO Dataset:")
summarize_quality_issues(togo_issues)

TOGO Dataset:
Data Quality Check Summary:
Column: GHI
  Missing Values: 0
  Negative Values: 257385
  Outliers: 1303
----------------------------------------
Column: DNI
  Missing Values: 0
  Negative Values: 0
  Outliers: 50586
----------------------------------------
Column: DHI
  Missing Values: 0
  Negative Values: 0
  Outliers: 7905
----------------------------------------
Column: ModA
  Missing Values: 0
  Negative Values: 0
  Outliers: 1237
----------------------------------------
Column: ModB
  Missing Values: 0
  Negative Values: 0
  Outliers: 1537
----------------------------------------
Column: WS
  Missing Values: 0
  Negative Values: 0
  Outliers: 8708
----------------------------------------
Column: WSgust
  Missing Values: 0
  Negative Values: 0
  Outliers: 7377
----------------------------------------


In [10]:
# Emits a header for the Sierra Leone dataset (preceded by a blank line),
# then the per-column summary (missing values, negative values, outliers)
# built from `sierraleone_issues`.
print("\nSIERRA LEONE Dataset:")
summarize_quality_issues(sierraleone_issues)


SIERRA LEONE Dataset:
Data Quality Check Summary:
Column: GHI
  Missing Values: 0
  Negative Values: 261135
  Outliers: 15853
----------------------------------------
Column: DNI
  Missing Values: 0
  Negative Values: 266352
  Outliers: 101688
----------------------------------------
Column: DHI
  Missing Values: 0
  Negative Values: 263128
  Outliers: 4407
----------------------------------------
Column: ModA
  Missing Values: 0
  Negative Values: 0
  Outliers: 21017
----------------------------------------
Column: ModB
  Missing Values: 0
  Negative Values: 0
  Outliers: 20613
----------------------------------------
Column: WS
  Missing Values: 0
  Negative Values: 0
  Outliers: 3169
----------------------------------------
Column: WSgust
  Missing Values: 0
  Negative Values: 0
  Outliers: 3665
----------------------------------------


In [11]:
# Emits a header for the Benin dataset (preceded by a blank line), then the
# per-column summary (missing values, negative values, outliers) built from
# `benin_issues`.
print("\nBENIN Dataset:")
summarize_quality_issues(benin_issues)



BENIN Dataset:
Data Quality Check Summary:
Column: GHI
  Missing Values: 0
  Negative Values: 258847
  Outliers: 145
----------------------------------------
Column: DNI
  Missing Values: 0
  Negative Values: 275987
  Outliers: 15196
----------------------------------------
Column: DHI
  Missing Values: 0
  Negative Values: 259182
  Outliers: 8733
----------------------------------------
Column: ModA
  Missing Values: 0
  Negative Values: 0
  Outliers: 98
----------------------------------------
Column: ModB
  Missing Values: 0
  Negative Values: 0
  Outliers: 240
----------------------------------------
Column: WS
  Missing Values: 0
  Negative Values: 0
  Outliers: 6717
----------------------------------------
Column: WSgust
  Missing Values: 0
  Negative Values: 0
  Outliers: 5368
----------------------------------------


In [17]:
# Clean each dataset with the shared clean_data routine.
# The *_df names are rebound to the cleaned results, so the frames loaded
# earlier are no longer reachable under these names after this cell runs;
# re-run the load cell first if this cell needs to be executed again.
togo_df, sierraleone_df, benin_df = [
    clean_data(frame) for frame in (togo_df, sierraleone_df, benin_df)
]

In [None]:
# Save cleaned data
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs('../data/processed', exist_ok=True)
# index=False keeps the DataFrame index out of the written file.
togo_df.to_csv('../data/processed/cleaned_togo.csv', index=False)

In [None]:
# Save cleaned data
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs("../data/processed", exist_ok=True)
# index=False keeps the DataFrame index out of the written file.
sierraleone_df.to_csv("../data/processed/cleaned_sierraleone.csv", index=False)

In [None]:
# Save cleaned data
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs("../data/processed", exist_ok=True)
# index=False keeps the DataFrame index out of the written file.
benin_df.to_csv("../data/processed/cleaned_benin.csv", index=False)