This notebook explores the cleaned limnology data: it performs exploratory data analysis (EDA) to understand the data's patterns, trends and potential issues.

In [1]:
#Importing lib's
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [2]:
#Input and Output paths
#input_file   : cleaned limnology workbook that is read by the load cell
#output_dir   : folder that receives every plot written by this notebook
#summary_file : workbook that receives the summary statistics / outlier counts
input_file = r"S:\Sem_4\Output\cleaned_limnology_data.xlsx"
output_dir = r"S:\Sem_4\Output\eda_plots"
summary_file = r"S:\Sem_4\Output\limnology_eda_summary.xlsx"

#Creating the directory for plots (including missing parents).
#exist_ok=True makes this a no-op when the folder is already there, so the cell
#can be re-run safely and there is no gap between "check" and "create".
os.makedirs(output_dir, exist_ok=True)

In [3]:
#Loading the cleaned Limnology data
#Reads the Excel workbook at input_file into the DataFrame df used by the later cells.
#No sheet_name is passed, so pandas reads the first sheet (the read_excel default).
df = pd.read_excel(input_file)

In [4]:
#Summary statistics
#include='all' describes every column: count/unique/top/freq for text columns,
#mean/std/quartiles for numeric ones (entries that do not apply show up as NaN).
summary_stats = df.describe(include='all')
print("Summary stats:", summary_stats, sep="\n")

Summary stats:
              date            lake_name   site_name    latitude   longitude  \
count          328                  328         328  307.000000  310.000000   
unique         124                   47          82         NaN         NaN   
top     2024-10-10  kashagawigamog lake  STOC-WQ-01         NaN         NaN   
freq             8                   23          10         NaN         NaN   
mean           NaN                  NaN         NaN   45.074833  -75.432422   
std            NaN                  NaN         NaN    0.162100   21.686530   
min            NaN                  NaN         NaN   44.012210  -78.805910   
25%            NaN                  NaN         NaN   44.992145  -78.664856   
50%            NaN                  NaN         NaN   45.063610  -78.435995   
75%            NaN                  NaN         NaN   45.165075  -78.346655   
max            NaN                  NaN         NaN   45.704940   78.806530   

        lake_depth_(in_metres)    ai

In [5]:
#Detecting outliers in the numeric columns with the 1.5 * IQR fence rule.
#latitude / longitude are skipped. Each column that has at least one outlier adds
#one record to outliers_report (column name, count, and the offending rows as a dict)
#and is printed together with the lake each value belongs to.
coordinate_cols = ['latitude', 'longitude']
outliers_report = []
for col in df.select_dtypes(include=["float64","int64"]).columns:
    if col in coordinate_cols:
        continue

    #Quartiles and the fences placed 1.5 * IQR outside them
    q_low = df[col].quantile(0.25)
    q_high = df[col].quantile(0.75)
    fence = 1.5 * (q_high - q_low)
    lower_bound = q_low - fence
    upper_bound = q_high + fence

    #Missing values compare False on both sides, so they are never flagged
    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    outliers = df.loc[is_outlier, ["lake_name", col]]
    if outliers.empty:
        continue

    record = {
        "Column": col,
        "Outlier_Count": len(outliers),
        "Outlier_Details": outliers.to_dict()
    }
    outliers_report.append(record)
    print(f"\nOutlier in {col}:")
    print(outliers)


Outlier in lake_depth_(in_metres):
            lake_name  lake_depth_(in_metres)
116  little hawk lake                    88.0
122  little hawk lake                    88.7
132        halls lake                    72.0
134  little hawk lake                    86.4
136        halls lake                    72.1
139        halls lake                    75.0

Outlier in air_temp:
                lake_name  air_temp
4                bob lake      -2.0
19             cedar lake     -15.0
30            spruce lake      -2.0
44              drag lake     -12.0
45            spruce lake       4.0
59            glamor lake     -18.0
64         gooderham lake     -18.0
72           growler lake     -14.0
80              gull lake      -9.0
100       haliburton lake      34.4
102       haliburton lake      35.9
108       haliburton lake      -5.0
149            allen lake     -18.0
150  little straggle lake      -8.0
151     big straggle lake      -8.0
188   kashagawigamog lake      -9.0
205     

In [6]:
#Histograms for numeric columns

#Column names may contain characters that are not allowed in a file name; this helper makes them safe to use in one
def clean_col_name(col):
    """Return a file-name-safe version of a column label.

    Every character that is reserved in file names on Windows
    (backslash, forward slash, colon, asterisk, question mark, double quote,
    angle brackets and pipe) is replaced by an underscore. All other characters,
    including spaces and parentheses, are left untouched, so labels that only
    contained '/' are cleaned exactly as before.

    Parameters
    ----------
    col : column label; non-string labels (e.g. integers) are converted with str().

    Returns
    -------
    str : the cleaned label, same length as str(col).
    """
    #One character class instead of only '/', so ':' '?' '*' etc. cannot break savefig paths
    return re.sub(r'[\\/:*?"<>|]', '_', str(col))

#Write one histogram (30 bins, KDE overlay) per numeric column into the eda_plots folder.
#latitude / longitude are left out; each figure is closed after saving so figures do not pile up in memory.
skip_cols = {'latitude', 'longitude'}
hist_cols = [
    name
    for name in df.select_dtypes(include=["float64","int64"]).columns
    if name not in skip_cols
]
for col in hist_cols:
    fig, ax = plt.subplots(figsize=(8,8))
    sns.histplot(df[col], bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    #clean_col_name keeps the column label usable as part of a file name
    target_png = os.path.join(output_dir, f"{clean_col_name(col)}_histogram.png")
    fig.savefig(target_png)
    plt.close(fig)

In [7]:
#Create and save all the box plots in the eda_plot folder: one figure per numeric
#column, with one box per lake (lake_name on the x axis).
#The loop only iterates numeric columns, so the text column lake_name can never
#appear in it; the former `col != "lake_name"` check was always true and has been
#removed. The set of plots is unchanged (latitude and longitude are still plotted here).
for col in df.select_dtypes(include=["float64", "int64"]).columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x="lake_name", y=col, data=df, ax=ax)
    ax.set_title(f"{col} by Lake")
    #Lake names are long, so the tick labels are rotated to stay readable
    ax.tick_params(axis="x", rotation=90)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, f"{clean_col_name(col)}_boxplot_by_lake.png"))
    #Close explicitly so figures do not accumulate across iterations
    plt.close(fig)

In [8]:
#Save the EDA results to one Excel workbook:
#  - sheet "Summary_Stats"    : the describe() table
#  - sheet "Outliers_Summary" : column name + outlier count (written only when outliers were found)
with pd.ExcelWriter(summary_file) as writer:
    summary_stats.to_excel(writer, sheet_name="Summary_Stats")
    if outliers_report:
        #Keep only the two scalar fields; the per-row details stay out of the workbook
        outliers_df = pd.DataFrame(outliers_report, columns=["Column", "Outlier_Count"])
        outliers_df.to_excel(writer, sheet_name="Outliers_Summary", index=False)

print(
    f"EDA summary saved as {summary_file}",
    f"Plots saved in {output_dir}",
    sep="\n",
)

EDA summary saved as S:\Sem_4\Output\limnology_eda_summary.xlsx
Plots saved in S:\Sem_4\Output\eda_plots
