# First Name: Soumyadeep
# Last Name: Sarkar

# Import Libraries  

In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns

# Import Data

In [None]:
def _fixed_point(value):
    """Format a float in fixed-point notation with six decimals."""
    return '%f' % value


# Display floats in fixed-point form rather than pandas' default float display.
pd.set_option('display.float_format', _fixed_point)

# low_memory=False makes pandas parse the file in one pass instead of in chunks.
bottle = pd.read_csv('bottle.csv', low_memory=False)

# Data management 

In [None]:
# Water temperature (T_degC) and salinity (Salnty) are the two variables analysed.
# Convert both to numeric; entries that cannot be parsed become NaN (errors="coerce").
for column in ("T_degC", "Salnty"):
    bottle[column] = pd.to_numeric(bottle[column], errors="coerce")

In [None]:
# Summary statistics for water temperature, followed by its histogram.
# Missing values are dropped before plotting and the KDE curve is switched off.
temperature = bottle["T_degC"]
print(temperature.describe())
sns.distplot(temperature.dropna(), kde=False);

In [None]:
# Summary statistics for salinity, followed by its histogram.
# Missing values are dropped before plotting and the KDE curve is switched off.
salinity = bottle["Salnty"]
print(salinity.describe())
sns.distplot(salinity.dropna(), kde=False);

In [None]:
# Keep only the rows whose water temperature is strictly below 5 degrees.
# We are only interested in the relationship to salinity for very cold water.
is_cold = bottle["T_degC"] < 5
sub1 = bottle[is_cold].copy()

# Histogram of the temperatures that remain in the subset.
cold_temperature = sub1["T_degC"].dropna()
sns.distplot(cold_temperature, kde=False);

In [None]:
# Reduce the cold-water subset to the two variables of interest and
# discard every row in which either of them is missing.
needed_columns = ["T_degC", "Salnty"]
sub2 = sub1[needed_columns].dropna(how="any")

In [None]:
%matplotlib inline
# Line plot of salinity against water temperature for the cold-water subset.
# Author's observation from the original run: salinity decreases as temperature rises.
ax = sns.lineplot(data=sub2, x="T_degC", y="Salnty")
ax.set_xlabel("Temperature")
ax.set_ylabel("Salinity")
ax.set_title("Temperature of Water vs Salt Content");

In [None]:
# Bin water temperature into 4 categories labelled 1 to 4.
# As there are no temperature values < 1, the 0-1 and 1-2 bins are merged into one.
# pd.cut closes intervals on the right by default: (0, 2], (2, 3], (3, 4], (4, 5].
temp_edges = [0, 2, 3, 4, 5]
temp_labels = [1, 2, 3, 4]
sub2["TEMP_CAT"] = pd.cut(sub2["T_degC"], bins=temp_edges, labels=temp_labels)

# Count per category in bin order, including any values that fell outside the bins.
sub2["TEMP_CAT"].value_counts(sort=False, dropna=False)

In [None]:
# Split salinity into 2 categories around a single threshold:
# np.digitize yields 0 for values below 34.5 and 1 for values of 34.5 or more.
salinity_threshold = [34.5]
sub2["SAL_CAT"] = np.digitize(sub2["Salnty"], bins=salinity_threshold)

# Count of readings in each salinity category.
sub2["SAL_CAT"].value_counts(sort=False, dropna=False)

# Chart/Plot

In [None]:
%matplotlib inline
# Visualize the relationship between temperature and salinity with a bar plot.
# SAL_CAT is coded 0/1, so each bar (the mean of SAL_CAT within a temperature
# category) is the proportion of salinity readings at or above 34.5.
# Author's observation from the original run: that proportion decreases as
# temperature increases.
# catplot is the current name of the deprecated factorplot; the arguments are unchanged.
sns.catplot(x="TEMP_CAT", y="SAL_CAT", data=sub2, kind="bar", ci=None);

# Chi-Squared Test

In [None]:
# Contingency table of observed counts: salinity category in the rows,
# temperature category in the columns.
ct1 = pd.crosstab(index=sub2["SAL_CAT"], columns=sub2["TEMP_CAT"])
print(ct1)

In [None]:
# Express each cell as a fraction of its column total, so that every
# temperature category's column sums to 1.
colsum = ct1.sum(axis=0)
colpct = ct1.div(colsum, axis="columns")
print(colpct)

In [None]:
# Perform the chi-squared test of independence on the salinity x temperature table.
# Results recorded from the original run:
#   Chi-squared value: 15988.2675
#   p-value: 0.0
# p-value is < 0.05, reject null hypothesis.
# chi2_contingency returns four values (statistic, p-value, degrees of freedom,
# expected frequencies), so the header names all four in that order.
print('chi-square value, p value, degrees of freedom, expected counts')
cs1 = scipy.stats.chi2_contingency(ct1)
print(cs1)

In [None]:
# Post-hoc analysis: repeat the chi-squared test for every pair of temperature
# categories. Four categories give 6 pairwise comparisons, so the
# Bonferroni-adjusted significance threshold is 0.05 / 6 = 0.0083.
# Result reported from the original run: all p-values are < 0.0083, therefore
# there is a significant difference in salinity readings above 34.5 between
# all temperature categories.
sub3 = sub2.copy()
cat = [1, 2, 3, 4]

# combinations() yields each unordered pair once, in the order
# (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4).
for first, second in itertools.combinations(cat, 2):
    # Map only the two categories being compared onto themselves; every other
    # category becomes NaN and therefore does not appear in the cross-table.
    recode = {first: first, second: second}
    sub3['temp'] = sub3['TEMP_CAT'].map(recode)
    cont = pd.crosstab(sub3['SAL_CAT'], sub3['temp'])
    cs = scipy.stats.chi2_contingency(cont)
    # cs[0] is the chi-squared statistic, cs[1] the p-value.
    print("\n", first, " versus ", second,
          "Chi value: ", cs[0], "\t\tp value: ", cs[1])
