# Exploratory data analysis for sierraleone-bumbuna

## Summary Statistics

### Reading data

In [15]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw Sierra Leone (Bumbuna) CSV into a DataFrame; the path is
# relative to the notebook's working directory.
df = pd.read_csv('../data/sierraleone-bumbuna.csv')

### Inspect data

In [3]:
# Print a structural overview: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525600 entries, 0 to 525599
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Timestamp      525600 non-null  object 
 1   GHI            525600 non-null  float64
 2   DNI            525600 non-null  float64
 3   DHI            525600 non-null  float64
 4   ModA           525600 non-null  float64
 5   ModB           525600 non-null  float64
 6   Tamb           525600 non-null  float64
 7   RH             525600 non-null  float64
 8   WS             525600 non-null  float64
 9   WSgust         525600 non-null  float64
 10  WSstdev        525600 non-null  float64
 11  WD             525600 non-null  float64
 12  WDstdev        525600 non-null  float64
 13  BP             525600 non-null  int64  
 14  Cleaning       525600 non-null  int64  
 15  Precipitation  525600 non-null  float64
 16  TModA          525600 non-null  float64
 17  TModB          525600 non-nul

In [4]:
# (number of rows, number of columns)
df.shape

(525600, 19)

In [5]:
df.head()   # Preview the first five rows (index labels 0-4)

Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,2021-10-30 00:01,-0.7,-0.1,-0.8,0.0,0.0,21.9,99.1,0.0,0.0,0.0,0.0,0.0,1002,0,0.0,22.3,22.6,
1,2021-10-30 00:02,-0.7,-0.1,-0.8,0.0,0.0,21.9,99.2,0.0,0.0,0.0,0.0,0.0,1002,0,0.0,22.3,22.6,
2,2021-10-30 00:03,-0.7,-0.1,-0.8,0.0,0.0,21.9,99.2,0.0,0.0,0.0,0.0,0.0,1002,0,0.0,22.3,22.6,
3,2021-10-30 00:04,-0.7,0.0,-0.8,0.0,0.0,21.9,99.3,0.0,0.0,0.0,0.0,0.0,1002,0,0.1,22.3,22.6,
4,2021-10-30 00:05,-0.7,-0.1,-0.8,0.0,0.0,21.9,99.3,0.0,0.0,0.0,0.0,0.0,1002,0,0.0,22.3,22.6,


In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary = df.describe()

In [7]:
# Display the summary statistics table.
summary

Unnamed: 0,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
count,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,0.0
mean,201.957515,116.376337,113.720571,206.643095,198.114691,26.319394,79.448857,1.146113,1.691606,0.363823,133.044668,7.17222,999.876469,0.000967,0.004806,32.504263,32.593091,
std,298.49515,218.652659,158.946032,300.896893,288.889073,4.398605,20.520775,1.239248,1.617053,0.295,114.284792,7.535093,2.104419,0.031074,0.047556,12.434899,12.009161,
min,-19.5,-7.8,-17.9,0.0,0.0,12.3,9.9,0.0,0.0,0.0,0.0,0.0,993.0,0.0,0.0,10.7,11.1,
25%,-2.8,-0.3,-3.8,0.0,0.0,23.1,68.7,0.0,0.0,0.0,0.0,0.0,999.0,0.0,0.0,23.5,23.8,
50%,0.3,-0.1,-0.1,3.6,3.4,25.3,85.4,0.8,1.6,0.4,161.5,6.2,1000.0,0.0,0.0,26.6,26.9,
75%,362.4,107.0,224.7,359.5,345.4,29.4,96.7,2.0,2.6,0.6,234.1,12.0,1001.0,0.0,0.0,40.9,41.3,
max,1499.0,946.0,892.0,1507.0,1473.0,39.9,100.0,19.2,23.9,4.1,360.0,98.4,1006.0,1.0,2.4,72.8,70.4,


## Data Quality Check

* Check for missing values:

In [8]:
# Count null entries per column.
missing_values = df.isnull().sum()

In [9]:
# Display the per-column null counts.
missing_values

Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64

* Check for Incorrect Entries

In [10]:
# Select every row where at least one irradiance column (GHI, DNI or DHI) is negative.
# NOTE(review): the recorded output shows small negative readings at midnight
# timestamps (e.g. 00:01, 23:56) -- presumably sensor offset rather than bad
# records; confirm whether these should be flagged or clipped to zero.
invalid_entries = df[(df['GHI'] < 0) | (df['DNI'] < 0) | (df['DHI'] < 0)]
print(f"Invalid Entries Found:\n{invalid_entries}")


Invalid Entries Found:
               Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb     RH   WS  WSgust  \
0       2021-10-30 00:01 -0.7 -0.1 -0.8   0.0   0.0  21.9   99.1  0.0     0.0   
1       2021-10-30 00:02 -0.7 -0.1 -0.8   0.0   0.0  21.9   99.2  0.0     0.0   
2       2021-10-30 00:03 -0.7 -0.1 -0.8   0.0   0.0  21.9   99.2  0.0     0.0   
3       2021-10-30 00:04 -0.7  0.0 -0.8   0.0   0.0  21.9   99.3  0.0     0.0   
4       2021-10-30 00:05 -0.7 -0.1 -0.8   0.0   0.0  21.9   99.3  0.0     0.0   
...                  ...  ...  ...  ...   ...   ...   ...    ...  ...     ...   
525595  2022-10-29 23:56 -1.6 -0.1 -2.9   0.0   0.0  24.0  100.0  0.0     0.0   
525596  2022-10-29 23:57 -1.7 -0.1 -3.0   0.0   0.0  24.0  100.0  0.0     0.0   
525597  2022-10-29 23:58 -1.7 -0.1 -3.1   0.0   0.0  24.0  100.0  0.0     0.0   
525598  2022-10-29 23:59 -1.7 -0.2 -3.3   0.0   0.0  23.9  100.0  0.0     0.0   
525599  2022-10-30 00:00 -1.7 -0.1 -3.4   0.0   0.0  23.9  100.0  0.0     0.0   

    

* Invalid Ranges: For columns like wind direction (WD), ensure values are within the valid range (0–360 degrees):

In [11]:
# Select rows whose wind direction falls outside the closed range [0, 360];
# an empty result means every WD value is within range.
invalid_wd = df[(df['WD'] < 0) | (df['WD'] > 360)]
print(f"Invalid Wind Direction Entries:\n{invalid_wd}")



Invalid Wind Direction Entries:
Empty DataFrame
Columns: [Timestamp, GHI, DNI, DHI, ModA, ModB, Tamb, RH, WS, WSgust, WSstdev, WD, WDstdev, BP, Cleaning, Precipitation, TModA, TModB, Comments]
Index: []


* Check Outliers

In [16]:
def detect_outliers_zscore(series, threshold=3):
  """
  Flag the values of a pandas Series whose absolute Z-score exceeds a threshold.

  Each value is standardized with the Series' own mean() and std()
  (pandas' std() defaults to the sample standard deviation, ddof=1).

  Args:
    series: The pandas Series holding the values to screen.
    threshold: Absolute Z-score above which a value counts as an outlier
      (default: 3). The comparison is strict (>).

  Returns:
    A boolean Series aligned with `series`: True where the value is an outlier.
  """
  mean_value = series.mean()
  std_value = series.std()
  # Signed distance from the mean, expressed in standard deviations.
  standardized = (series - mean_value) / std_value
  return np.abs(standardized) > threshold

In [17]:
# Columns to screen for outliers with the Z-score rule.
columns_to_check = ['ModA', 'ModB', 'WS', 'WSgust']

for col in columns_to_check:
  # Boolean mask over df's rows: True where |z| exceeds the default threshold of 3
  outliers_zscore = detect_outliers_zscore(df[col]) 
  print(f"Outliers in {col} (Z-score):")
  # Prints all columns of the flagged rows, not only `col`
  print(df[outliers_zscore])

Outliers in ModA (Z-score):
               Timestamp     GHI    DNI    DHI    ModA    ModB  Tamb    RH  \
662     2021-10-30 11:03  1071.0  616.0  539.1  1142.0  1116.0  26.4  82.5   
663     2021-10-30 11:04  1119.0  673.5  538.4  1193.0  1167.0  26.3  83.1   
670     2021-10-30 11:11  1092.0  667.1  507.6  1163.0  1137.0  27.0  83.4   
671     2021-10-30 11:12  1154.0  755.7  491.3  1237.0  1208.0  27.0  82.4   
672     2021-10-30 11:13  1063.0  665.2  478.0  1132.0  1105.0  27.1  80.1   
...                  ...     ...    ...    ...     ...     ...   ...   ...   
517751  2022-10-24 13:12  1091.0  657.7  471.9  1123.0  1098.0  30.2  75.8   
517808  2022-10-24 14:09  1085.0  737.7  447.3  1114.0  1092.0  30.4  74.1   
517811  2022-10-24 14:12  1087.0  745.7  445.6  1112.0  1089.0  30.7  73.8   
517823  2022-10-24 14:24  1109.0  762.0  470.3  1130.0  1109.0  30.6  73.1   
520614  2022-10-26 12:55  1068.0  731.3  377.6  1124.0  1100.0  31.4  70.2   

         WS  WSgust  WSstdev     WD