# First Name: Soumyadeep
# Last Name: Sarkar

# Import Libraries  

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi 
import matplotlib.pyplot as plt

# Import Data

In [None]:
# Read the bottle data set from bottle.csv in the working directory.
bottle = pd.read_csv("bottle.csv", low_memory=False)

# Render floats in fixed-point notation ('%f' gives six decimal places).
pd.options.display.float_format = lambda value: "%f" % value

# Preview the first ten rows; the trailing semicolon keeps the notebook from
# echoing the result.
bottle.head(n=10);

# Data management 

In [None]:
# Depth (Depthm) and phosphate concentration (PO4uM) are the two variables
# analysed below; convert both columns to a numeric dtype.
for column in ("Depthm", "PO4uM"):
    bottle[column] = pd.to_numeric(bottle[column])

In [None]:
# Initial histogram of depth.
# sns.distplot is deprecated in current seaborn releases; sns.histplot is the
# documented replacement for distplot(..., kde=False). bins=50 keeps the bar
# count in line with distplot, which capped its automatic bin count at 50.
sns.histplot(bottle["Depthm"], bins=50);

In [None]:
# Histogram of phosphate concentration.
# sns.distplot is deprecated in current seaborn releases; sns.histplot is the
# documented replacement for distplot(..., kde=False). bins=50 keeps the bar
# count in line with distplot, which capped its automatic bin count at 50.
sns.histplot(bottle["PO4uM"], bins=50);

In [None]:
# Very few samples exist at 1000 m and beyond, so keep only the rows whose
# depth is strictly below 1000 m. The result is copied because a new column
# is added to this subset later on.
is_shallower_than_1000m = bottle["Depthm"] < 1000
sub1 = bottle.loc[is_shallower_than_1000m].copy()

In [None]:
# Histogram of depth after limiting the data to depths below 1000 m.
# sns.distplot is deprecated in current seaborn releases; sns.histplot is the
# documented replacement for distplot(..., kde=False). bins=50 keeps the bar
# count in line with distplot, which capped its automatic bin count at 50.
sns.histplot(sub1["Depthm"], bins=50);

In [None]:
# Split depth into 4 categories: shallow, kinda shallow, kinda deep, and deep.
# include_lowest=True closes the first bin on the left so that surface samples
# (Depthm == 0) are labelled "shallow". With the default half-open bins
# (0, 200] those rows became NaN and were silently removed by the later
# dropna().
# NOTE: statistics quoted in comments further down were recorded before this
# fix and should be re-checked after re-running the notebook.
depth_edges = [0, 200, 400, 600, 1000]
depth_labels = ["shallow", "kinda shallow", "kinda deep", "deep"]
sub1["DEPTH_CAT"] = pd.cut(
    sub1["Depthm"], bins=depth_edges, labels=depth_labels, include_lowest=True
)

# Show the number of samples per category, in bin order.
sub1["DEPTH_CAT"].value_counts(sort=False)

In [None]:
# Keep just the response (PO4uM) and the grouping factor (DEPTH_CAT), then
# discard every row in which either of them is missing.
analysis_columns = ["PO4uM", "DEPTH_CAT"]
sub2 = sub1.loc[:, analysis_columns].dropna()

# Box Plot

In [None]:
# Plot phosphate concentration vs water depth to visualize relationship.
# The graph shows an increasing relationship between depth and phosphate concentration.
%matplotlib inline

sns.boxplot(x="DEPTH_CAT", y="PO4uM", data=sub2)
plt.xlabel("DEPTH_CAT")
plt.ylabel("PO4uM")
plt.title("Box plot of water depth vs phosphate concentration");

# ANOVA

In [None]:
# One-way ANOVA of phosphate concentration (PO4uM) across the depth
# categories, fitted as an OLS model with DEPTH_CAT as a categorical factor.
# Results recorded from the author's run:
#   R-squared:   0.65
#   F-statistic: 2.411e+05
#   p-value:     0.0
# The p-value is below 0.05, so the null hypothesis is rejected.
anova_spec = smf.ols(formula="PO4uM ~ C(DEPTH_CAT)", data=sub2)
model1 = anova_spec.fit()
print(model1.summary())

In [None]:
# Mean phosphate concentration within each depth category.
depth_groups = sub2.groupby("DEPTH_CAT")
m2 = depth_groups.mean()
print("means for PO4uM by depth category:")
print(m2)

In [None]:
# Standard deviation of phosphate concentration within each depth category.
# The printed label previously said "means" although .std() is computed.
# NOTE: the result is still stored in m2 (the same name the means cell uses)
# so that any later code relying on that name keeps working.
print("standard deviations for PO4uM by depth category:")
m2 = sub2.groupby("DEPTH_CAT").std()
print(m2)

In [None]:
# Tukey HSD post hoc test: compare every pair of depth categories to see
# which ones differ in phosphate concentration.
# Four categories give 6 pairwise comparisons; the author's adjusted
# threshold is 0.05 / 6 = 0.0083.
# NOTE(review): tukeyhsd() is called with its default alpha, and the summary's
# "reject" column presumably reflects that family-wise level rather than the
# 0.0083 threshold above -- confirm against the statsmodels documentation.
mc1 = multi.MultiComparison(data=sub2["PO4uM"], groups=sub2["DEPTH_CAT"])
res1 = mc1.tukeyhsd()
print(res1.summary())