# First Name: Soumyadeep 
# Last Name: Sarkar

# Import Libraries  

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import Data

In [None]:
# Load the bottle measurements and configure how floats are displayed.
pd.set_option("display.float_format", lambda value: "%f" % value)  # Fixed-point display with six decimals.
bottle = pd.read_csv("bottle.csv", low_memory=False)  # low_memory=False: do not parse the file in chunks.
len(bottle)  # Number of rows loaded.

# Data management 

In [None]:
# Coerce the measurement columns to a numeric dtype; values that cannot be
# parsed become NaN (errors="coerce").
numeric_columns = [
    "Depthm",
    "PO4uM",
    "SiO3uM",
    "T_degC",
    "NO2uM",
    "NO3uM",
    "NH3uM",
    "STheta",
]
for column in numeric_columns:
    bottle[column] = pd.to_numeric(bottle[column], errors="coerce")
bottle["PO4uM"].describe()  # Summary statistics of PO4uM after conversion.

In [None]:
# Replace NaN with 0 in the columns listed below. The subset filters further
# down test these columns with `!= 0` to mean "a reading was recorded".
zero_filled_columns = ["SiO3uM", "Oxy_µmol/Kg", "NO2uM", "NO3uM", "NH3uM", "STheta"]
bottle[zero_filled_columns] = bottle[zero_filled_columns].fillna(0)

# Box plot

In [None]:
# Keep rows where SiO3uM is non-zero (i.e. recorded, since missing values were
# replaced with 0) and Depthm is at most 1000. The original author notes that
# there are no SiO3uM recordings past this depth.
silicate_recorded = bottle["SiO3uM"].ne(0)
within_1000m = bottle["Depthm"].le(1000)
sub1 = bottle.loc[silicate_recorded & within_1000m].copy()

In [None]:
# Create a new variable, DEPTHCATEGORY, by cutting Depthm into 5 bins of 200 m.
# pd.cut bins are right-closed by default, so the first bin would be (0, 200]
# and a sample taken at exactly 0 m would get no category (NaN) and silently
# drop out of the box plot. include_lowest=True makes the first bin include 0.
depth_bin_edges = [0, 200, 400, 600, 800, 1000]
sub1["DEPTHCATEGORY"] = pd.cut(sub1["Depthm"], depth_bin_edges, include_lowest=True)
# pd.cut already returns a categorical column, so no astype("category") is needed.
sub1["DEPTHCATEGORY"].value_counts()  # Number of rows per depth bin.

In [None]:
%matplotlib inline
# Box plot of SiO3uM for each depth category of sub1.
ax = sns.boxplot(data=sub1, x="DEPTHCATEGORY", y="SiO3uM")
ax.set_xlabel("Depth (m)")  # x-axis label.
ax.set_ylabel("Silicate Concentration (µmol/L)")  # y-axis label.
ax.set_title("Silicate Concentration of Water at Different Depths")  # Plot title.

# Histogram 

In [None]:
# Keep only the rows whose Depthm is at most 50 (rows with NaN depth are excluded).
shallow_rows = bottle["Depthm"].le(50)
sub6 = bottle.loc[shallow_rows].copy()

In [None]:
# Summary statistics of T_degC, computed over every row of `bottle`.
bottle.loc[:, "T_degC"].describe()

In [None]:
%matplotlib inline
# Histogram of T_degC for the Depthm <= 50 subset; missing values are dropped first.
plt.figure(figsize=(10, 10))  # 10 x 10 inch figure.
temperatures = sub6["T_degC"].dropna()
ax = sns.histplot(temperatures, kde=False)
ax.set_xlabel("Temperature in Degrees C")  # x-axis label.
ax.set_title("Temperature of Water for Depth up to 50m");

# Line chart 

In [None]:
# Keep rows where Oxy_µmol/Kg is non-zero (i.e. recorded, since missing values
# were replaced with 0) and Depthm is at most 500.
oxygen_recorded = bottle["Oxy_µmol/Kg"].ne(0)
within_500m = bottle["Depthm"].le(500)
sub2 = bottle.loc[oxygen_recorded & within_500m].copy()
sub2["Oxy_µmol/Kg"].value_counts(sort=False, dropna=False)  # Frequency of each distinct reading, NaN included.

In [None]:
# Mean Oxy_µmol/Kg at each distinct Depthm value of sub2.
var1 = sub2["Oxy_µmol/Kg"].groupby(sub2["Depthm"]).mean()
var1

In [None]:
%matplotlib inline
# Line chart of the mean oxygen reading (var1) against depth.
plt.figure(figsize=(10, 10))  # 10 x 10 inch figure.
var1.plot(kind="line")
plt.xlabel("Depth in m")  # x-axis label.
plt.ylabel("Oxygen Concentration in µmol/Kg")  # y-axis label (spelling fixed: was "Contentration").
plt.title("Oxygen Concentration Recorded at Different Depths of Water")  # Plot title.

# Bubble chart 

In [None]:
# Subset for the bubble chart: NO2uM below 1.5 (the author's outlier cap),
# NO3uM and NH3uM non-zero (i.e. recorded), and Depthm at most 50.
# NOTE(review): NO2uM is zero-filled earlier, so rows with a missing NO2uM
# reading also pass the "< 1.5" test -- confirm whether a `!= 0` check is
# wanted here as well.
# .copy() matches the other subsets (sub1, sub2, sub4, sub6) and keeps sub3
# independent of `bottle` if it is modified later.
sub3 = bottle[
    (bottle["NO2uM"] < 1.5)
    & (bottle["NO3uM"] != 0)
    & (bottle["NH3uM"] != 0)
    & (bottle["Depthm"] <= 50)
].copy()

In [None]:
%matplotlib inline
# Bubble chart: NO2uM on the x-axis, NO3uM on the y-axis, marker size from NH3uM.
plt.figure(figsize=(10, 10))  # 10 x 10 inch figure.
plt.scatter(sub3["NO2uM"], sub3["NO3uM"], s=sub3["NH3uM"])
# NOTE(review): the axis labels say µmol/Kg while the columns are named "...uM"
# and the silicate plot uses µmol/L -- confirm the intended unit.
plt.xlabel("Nitrite Concentration (µmol/Kg)")  # x-axis label.
plt.ylabel("Nitrate Concentration (µmol/Kg)")  # y-axis label (spelling fixed: was "Nirtate").
plt.title("Comparison of Nitrite, Nitrate, and Ammonium Concentration \nfor Depth up to 50m (Bubble size == Ammonium)");  # Plot title.

# Select Plot

In [None]:
# Keep rows where PO4uM is actually recorded and Depthm is at most 250.
# Unlike the other nutrient columns, PO4uM is not zero-filled, so missing
# readings are still NaN here. NaN != 0 evaluates to True, which would let
# unrecorded rows through and later count them as "not greater than mean";
# notna() excludes them explicitly.
phosphate_recorded = bottle["PO4uM"].notna() & (bottle["PO4uM"] != 0)
sub4 = bottle[phosphate_recorded & (bottle["Depthm"] <= 250)].copy()

In [None]:
# Create a new variable, DEPTHCATEGORY, by cutting Depthm into 5 bins of 50 m.
# pd.cut bins are right-closed by default, so the first bin would be (0, 50]
# and a sample taken at exactly 0 m would get no category (NaN) and silently
# drop out of the pivot table. include_lowest=True makes the first bin include 0.
shallow_bin_edges = [0, 50, 100, 150, 200, 250]
sub4["DEPTHCATEGORY"] = pd.cut(sub4["Depthm"], shallow_bin_edges, include_lowest=True)
# pd.cut already returns a categorical column, so no astype("category") is needed.
sub4["DEPTHCATEGORY"].value_counts()  # Number of rows per depth bin.

In [None]:
# Split phosphate concentration readings into whether or not they were greater than mean value.
def greater_than_mean(mean, row):
    """Return 1 if the row's "PO4uM" value is strictly above ``mean``, else 0.

    ``row`` is anything indexable by the key "PO4uM" (e.g. a DataFrame row).
    A value for which the comparison is false -- equal, smaller, or NaN --
    yields 0.
    """
    return 1 if row["PO4uM"] > mean else 0
phosphate_mean = sub4["PO4uM"].mean()  # Mean PO4uM of the subset (mean() skips NaN).
# Flag each reading: 1 if strictly above the mean, otherwise 0. The vectorised
# comparison gives the same 0/1 values as calling the row-wise helper through
# DataFrame.apply(axis=1) (NaN compares False, hence 0), without a Python-level
# call per row.
sub4["POSPHATEGREATERTHANMEAN"] = (sub4["PO4uM"] > phosphate_mean).astype(int)
sub4["POSPHATEGREATERTHANMEAN"].value_counts()  # How many readings fall in each group.

In [None]:
# Narrow sub4 down to the two columns the pivot table needs.
pivot_columns = ["DEPTHCATEGORY", "POSPHATEGREATERTHANMEAN"]
sub5 = sub4.loc[:, pivot_columns].copy()

In [None]:
# Create pivot table: number of rows for each (depth category, above-mean flag) pair.
# The string "size" requests the groupby size directly. Passing np.size relied
# on pandas silently substituting its own size method for the NumPy callable,
# which newer pandas versions warn about and plan to stop doing.
# fill_value=0 keeps a missing combination as an integer 0 instead of NaN, so
# the counts stay integers for the heatmap's fmt="d" annotations.
table = pd.pivot_table(
    sub5,
    index=["DEPTHCATEGORY"],
    columns=["POSPHATEGREATERTHANMEAN"],
    aggfunc="size",
    fill_value=0,
)
print(table)

In [None]:
%matplotlib inline
# Heat map of the pivot-table counts, annotated with the integer in each cell.
fig = plt.figure(figsize=(10, 10))  # 10 x 10 inch figure.
ax = sns.heatmap(table, annot=True, fmt="d")
ax.set_xlabel("Whether or not greater than mean")  # x-axis label.
ax.set_ylabel("Depth")  # y-axis label.
ax.set_title("Counts of How Many Phosphate Concentration Readings Were \nGreater Than Mean Value for Water Depths up to 250m.");  # Plot title.