In [None]:
# Imports

# Add necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Data retrieval

# Load the measurements from the CSV export into a DataFrame
df = pd.read_csv('bop_data2.csv.txt')

# Pull out the two columns under study: turbidity (NTU) and dissolved oxygen (mg/L)
tbd, o2 = df['turb_NTU'], df['DO_mgL']

# The 9 sampling sites, one per line for easier diffing
sites = [
    'Bush Terminal Park',
    'Brooklyn Bridge Park',
    'Coney Island Creek',
    'Lemon Creek Lagoon',
    'Paerdegat Basin',
    "Governor's Island",
    'Lemon Creek Nursery',
    'Great Kills Harbor',
    'Brooklyn Navy Yard',
]

In [None]:
# Data processing and graphing
# NOTE: sumArr and linear_Regression are defined in the "Functions" cell further
# down, so that cell has to be executed before this one.

# Pool the turbidity / dissolved-oxygen readings of every site into two lists
# (per-site plots are switched off here)
tbd_total, o2_total = sumArr(tbd, o2, sites, bool=False)

# Fit and draw one regression over the pooled data from all sites
linear_Regression(
    tbd_total,
    o2_total,
    name='Turbidity vs Dissolved Oxygen (All Sites)',
)

In [None]:
# Functions

# Tell whether an entry holds a real value rather than NaN
def checkNotNaN(val):
  """Return the result of comparing ``val`` with itself.

  NaN is unequal to itself, so the self-comparison is False for NaN and
  True for ordinary numbers and strings.
  """
  equals_itself = (val == val)
  return equals_itself

# Create a simple list of entries of a particular site name
def findSite(arr, name, site_col=None):
  """Return the non-missing entries of ``arr`` recorded at site ``name``.

  Rows are matched by position: entry ``k`` of ``arr`` belongs to entry ``k``
  of the site column. This avoids the label-based ``arr[i]`` lookups, which
  only work when the pandas index happens to be 0..n-1, and replaces the
  per-row Python loop with a single boolean mask.

  Parameters:
    arr: sequence (list, array or Series) of measurements, one per row.
    name: site name to select.
    site_col: optional sequence of site names, one per row. Defaults to the
      module-level ``df['Site']`` column.

  Returns:
    A plain list with the selected values, in their original order.
    Missing entries (NaN/None, as judged by ``pd.notna``) are dropped.
  """
  if site_col is None:
    site_col = df['Site']

  # dtype=object keeps the comparison element-wise even if the column mixes
  # strings with NaN placeholders
  at_site = np.asarray(site_col, dtype=object) == name
  values = np.asarray(arr)
  present = pd.notna(values)
  return values[at_site & present].tolist()

# Draw the points as red circles with labelled axes and a title
def graph(x, y, x_axis, y_axis, name, bool):
  """Scatter ``y`` against ``x`` on the current matplotlib axes.

  Parameters:
    x, y: data to plot, drawn with the 'ro' (red circle) format.
    x_axis, y_axis: axis label texts.
    name: plot title.
    bool: when truthy the figure is shown immediately; otherwise it is left
      open so the caller can keep drawing on it.
  """
  axes = plt.gca()
  axes.plot(x, y, 'ro')
  axes.set_xlabel(x_axis)
  axes.set_ylabel(y_axis)
  axes.set_title(name)
  if not bool:
    return
  plt.show()

# Sum up the lists for each site and graph if necessary
def sumArr(tbd, o2, sites, bool, site_col=None):
  """Collect paired turbidity / dissolved-oxygen readings for the given sites.

  A row is kept only when BOTH readings are present. Filtering the two
  columns independently would shift one list against the other whenever a row
  is missing just one of the two values, pairing readings from different
  rows (or yielding lists of different lengths).

  Parameters:
    tbd: sequence of turbidity readings, one per row.
    o2: sequence of dissolved-oxygen readings, one per row.
    sites: iterable of site names to include, processed in the given order.
    bool: when truthy, a regression plot is drawn for every site via
      ``linear_Regression``.
    site_col: optional sequence of site names, one per row. Defaults to the
      module-level ``df['Site']`` column.

  Returns:
    ``(tbd_total, o2_total)``: two equally long lists whose k-th entries come
    from the same row, grouped site by site.

  Raises:
    ValueError: if the three per-row sequences differ in length.
  """
  if site_col is None:
    site_col = df['Site']

  # Work positionally on plain arrays so row k of every input lines up
  site_arr = np.asarray(site_col, dtype=object)
  tbd_arr = np.asarray(tbd, dtype=float)
  o2_arr = np.asarray(o2, dtype=float)
  if not (len(site_arr) == len(tbd_arr) == len(o2_arr)):
    raise ValueError('tbd, o2 and the site column must have the same length')

  # Loop-invariant: rows where both measurements exist
  both_present = pd.notna(tbd_arr) & pd.notna(o2_arr)

  tbd_total, o2_total = [], []
  for site in sites:
    keep = (site_arr == site) & both_present
    tbd_site = tbd_arr[keep].tolist()
    o2_site = o2_arr[keep].tolist()
    tbd_total += tbd_site
    o2_total += o2_site
    if bool:
      linear_Regression(tbd_site, o2_site, site)
  return tbd_total, o2_total

# Use linear regression to create a graph
def linear_Regression(tbd_total, o2_total, name, test_size = 0.3, random_state = 101):
  """Fit dissolved oxygen against turbidity and plot the held-out data with the fit.

  The samples are split into train and test sets, an ordinary least-squares
  line is fitted on the training part, and the test points (red circles) are
  drawn together with the predicted line (blue).

  Parameters:
    tbd_total: sequence of turbidity readings (the single feature).
    o2_total: sequence of dissolved-oxygen readings (the target), paired
      element-by-element with ``tbd_total``.
    name: title of the plot.
    test_size: fraction of samples held out for the plot (default 0.3).
    random_state: seed for the train/test split, for reproducible plots
      (default 101).
  """
  # The `normalize` argument was removed from LinearRegression in
  # scikit-learn 1.2 and now raises TypeError. For a least-squares fit with an
  # intercept the predictions do not depend on feature scaling, so it is
  # simply dropped.
  model = LinearRegression()
  tbd_np = np.array(tbd_total)
  o2_np = np.array(o2_total)
  tbd_train, tbd_test, o2_train, o2_test = train_test_split(
      tbd_np, o2_np, test_size = test_size, random_state = random_state)
  # scikit-learn expects a 2-D (n_samples, n_features) feature matrix
  tbd_train = tbd_train.reshape(-1, 1)
  o2_train = o2_train.reshape(-1, 1)
  tbd_test = tbd_test.reshape(-1, 1)
  model.fit(tbd_train, o2_train)
  o2_predict = model.predict(tbd_test)
  # Draw the held-out points without showing, then overlay the fitted line
  graph(tbd_test, o2_test, 'Turbidity (NTU)', 'Dissolved Oxygen (mg/L)', name, False)
  plt.plot(tbd_test, o2_predict, color = 'blue', linewidth = 2)
  plt.show()