In [2]:
# Import libraries
import pandas as pd
import numpy as np
import datetime as dt
import os
import math

Naming convention for dataframes:

* sd - Secchi disk depth

* t - turbidity

* tss - total suspended solids

* c - chlorophyll

* tcc - total cell count

    * tcc_c - total cell count (composite)
    
    * tcc_s - total cell count (surface)

* tb - total biovolume

    * tb_c - total biovolume (composite)
    
    * tb_s - total biovolume (surface)

* ccc - cyano cell count

    * ccc_c - cyano cell count (composite)
    
    * ccc_s - cyano cell count (surface)

* cb - cyano biovolume

    * cb_c - cyano biovolume (composite)
    
    * cb_s - cyano biovolume (surface)

In [3]:
# Locate folder with csv for each variable
data_folder = "../../data/processed/regression/"

# os.listdir() returns entries in arbitrary order, but the `names` list defined
# further down is matched to `files` purely by position, so the listing is sorted
# to make that pairing deterministic. Non-CSV entries (e.g. hidden system files)
# are dropped so they cannot shift the alignment.
files = sorted(
    entry for entry in os.listdir(data_folder) if entry.lower().endswith(".csv")
)

In [4]:
# Check CSV files in the folder; the `names` list defined in the next cell
# is matched to this listing by position, so the order shown here matters
files

['chlorophyll_final.csv',
 'cyano_biovolume_composite_final.csv',
 'cyano_biovolume_final.csv',
 'cyano_biovolume_surface_final.csv',
 'cyano_cell_count_composite_final.csv',
 'cyano_cell_count_final.csv',
 'cyano_cell_count_surface_final.csv',
 'secchi_depth_final.csv',
 'total_biovolume_composite_final.csv',
 'total_biovolume_final.csv',
 'total_biovolume_surface_final.csv',
 'total_cell_count_composite_final.csv',
 'total_cell_count_final.csv',
 'total_cell_count_surface_final.csv',
 'total_suspended_solids_final.csv',
 'turbidity_final.csv',
 'tweets_daily_final.csv']

In [5]:
# Container for every loaded dataset, keyed by its short name
df = dict()

# Short names for the datasets, written in the same order as the entries of
# `files` (the trailing comment on each line is the file it is paired with)
names = [
    "c",       # chlorophyll_final.csv
    "cb_c",    # cyano_biovolume_composite_final.csv
    "cb",      # cyano_biovolume_final.csv
    "cb_s",    # cyano_biovolume_surface_final.csv
    "ccc_c",   # cyano_cell_count_composite_final.csv
    "ccc",     # cyano_cell_count_final.csv
    "ccc_s",   # cyano_cell_count_surface_final.csv
    "sd",      # secchi_depth_final.csv
    "tb_c",    # total_biovolume_composite_final.csv
    "tb",      # total_biovolume_final.csv
    "tb_s",    # total_biovolume_surface_final.csv
    "tcc_c",   # total_cell_count_composite_final.csv
    "tcc",     # total_cell_count_final.csv
    "tcc_s",   # total_cell_count_surface_final.csv
    "tss",     # total_suspended_solids_final.csv
    "t",       # turbidity_final.csv
    "tweets",  # tweets_daily_final.csv
]

In [6]:
# Import data
# `names` and `files` are paired by position. zip() would silently stop at the
# shorter of the two, leaving datasets missing or attached to the wrong name,
# so fail loudly when the folder does not hold exactly the expected files.
if len(names) != len(files):
    raise ValueError(
        f"Expected {len(names)} files in {data_folder} but found {len(files)}: {files}"
    )

for name, file in zip(names, files):
    df[name] = pd.read_csv(f'{data_folder}{file}')

In [10]:
# Convert the date column to datetime and reduce each water-quality table to its X75 column
for key in df.keys():
    # Separate branch for the tweets dataframe because its date column has a different name
    if key == "tweets":
        df[key]['time'] = pd.to_datetime(df[key]['time'])
    elif isinstance(df[key], pd.DataFrame):
        # The isinstance guard keeps this cell re-runnable: once an entry has been
        # replaced by its X75 Series it no longer has these columns to convert.
        df[key]["OBJECTID_1"] = pd.to_datetime(df[key]["OBJECTID_1"])
        # regex=False: the dot must be removed literally rather than being
        # interpreted as the regex wildcard (the default differs across pandas versions)
        df[key].columns = df[key].columns.str.replace(".", "", regex=False)
        # Keep only the X75 column; the entry becomes a Series and the
        # converted date column above is discarded with the rest of the frame
        df[key] = df[key]['X75']
    

In [13]:
# Inspect the result of the previous cell: every entry except 'tweets'
# has been replaced by its X75 column (a Series)
df

{'c': 0      19.578767
 1      22.789905
 2       4.606094
 3      11.701630
 4      82.581816
 5      73.952105
 6      12.704681
 7      45.556070
 8       8.866609
 9      10.038378
 10     26.316612
 11    305.672119
 12     45.265713
 13    100.194262
 14     81.669596
 15     40.521667
 16     53.001339
 17     23.921853
 18     50.543127
 19     16.098199
 20     11.004787
 21     33.521324
 22      8.969109
 23     20.375231
 24      3.086706
 25     23.119865
 26     61.140789
 27      4.850000
 28     14.083227
 29     10.909076
 30     24.943311
 31    123.766667
 32     13.560000
 33     85.200000
 34    218.000000
 35     34.956667
 36     22.280000
 Name: X75, dtype: float64, 'cb_c': 0     1.802942e+05
 1     6.181867e+03
 2     5.925051e+05
 3     7.752958e+07
 4     2.397550e+06
 5     2.813747e+07
 6     7.661239e+06
 7     1.129411e+07
 8     1.555847e+08
 9     5.050182e+08
 10    1.958967e+07
 11    3.141613e+06
 12    1.140460e+07
 13    2.152113e+07
 14    1.14585

In [None]:
# number of observations, min, 25th percentile, median, 75th percentile, max, mean, range, variance

In [16]:
# Each dataset key is paired with the row label it receives in the statistics
# table, so the two stay aligned by construction
_stat_rows = [
    ('sd', 'Secchi disk depth'),
    ('t', 'Turbidity'),
    ('c', 'Chlorophyll a'),
    ('tss', 'Total suspended solids'),
    ('tcc_c', 'Phytoplankton cell count (Composite)'),
    ('tcc_s', 'Phytoplankton cell count (Surface)'),
    ('tb_c', 'Phytoplankton biovolume (Composite)'),
    ('tb_s', 'Phytoplankton biovolume (Surface)'),
    ('ccc_c', 'Cyanobacteria cell count (Composite)'),
    ('ccc_s', 'Cyanobacteria cell count (Surface)'),
    ('cb_c', 'Cyanobacteria biovolume (Composite)'),
    ('cb_s', 'Cyanobacteria biovolume (Surface)'),
]

# Keys into `df`, in the same order as the rows of `desc_stats`
y_names = [_key for _key, _label in _stat_rows]

# One column per descriptive statistic
_stat_columns = ['Minimum', '25th Percentile', 'Median', '75th Percentile',
                 'Maximum', 'Mean', 'Range', 'Variance']

# Empty table (all cells missing) that a later cell fills in row by row
desc_stats = pd.DataFrame(
    index=[_label for _key, _label in _stat_rows],
    columns=_stat_columns,
)

In [30]:
# Fill one row of the statistics table per variable.
# Row i of `desc_stats` corresponds to the dataset df[y_names[i]].
for i in range(len(desc_stats)):
    values = df[y_names[i]]

    # pandas Series methods are used for every statistic so that missing values
    # are treated the same way throughout (skipped), instead of mixing Python's
    # built-in min()/max() and np.percentile, which do not skip NaN.
    lowest = values.min()
    highest = values.max()

    # Values are listed in the column order of `desc_stats`:
    # Minimum, 25th Percentile, Median, 75th Percentile, Maximum, Mean, Range, Variance
    desc_stats.iloc[i] = [
        lowest,
        values.quantile(0.25),
        values.median(),
        values.quantile(0.75),
        highest,
        values.mean(),
        highest - lowest,
        # ddof=0 is the population variance, matching np.var's default.
        # NOTE(review): confirm population (not sample, ddof=1) variance is intended.
        values.var(ddof=0),
    ]
    

In [31]:
desc_stats

Unnamed: 0,Minimum,25th Percentile,Median,75th Percentile,Maximum,Mean,Range,Variance
Secchi disk depth,0.1,0.2,0.247276,0.305438,0.854711,0.277585,0.754711,0.0210185
Turbidity,6.4,35.7716,55.4755,79.2495,169.559,60.1588,163.159,1093.23
Chlorophyll a,3.08671,12.7047,23.9219,53.0013,305.672,47.2797,302.585,3588.36
Total suspended solids,7.0,35.1624,61.275,96.8559,334.364,74.0852,327.364,3317.69
Phytoplankton cell count (Composite),0.0,28883.1,61214.6,289725.0,45202300.0,1517860.0,45202300.0,42923500000000.0
Phytoplankton cell count (Surface),888.795,110795.0,1089090.0,3729990.0,49118200.0,4665590.0,49117400.0,87021800000000.0
Phytoplankton biovolume (Composite),899563.0,9200910.0,13758100.0,30327400.0,521405000.0,47555900.0,520505000.0,8451080000000000.0
Phytoplankton biovolume (Surface),12004400.0,112529000.0,1042060000.0,2513630000.0,9168870000.0,1998820000.0,9156860000.0,6.17888e+18
Cyanobacteria cell count (Composite),159.157,11035.8,46144.6,240951.0,45202300.0,1253990.0,45202200.0,34996900000000.0
Cyanobacteria cell count (Surface),41.8926,57899.9,726344.0,3852800.0,103588000.0,6035320.0,103588000.0,234554000000000.0


In [33]:
# Export
output_path = "../../output/descriptive stats/wq_descriptive_stats.xlsx"

# to_excel() does not create missing folders, so make sure the target
# directory exists (e.g. on a fresh checkout) before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)
desc_stats.to_excel(output_path)