In [1]:
#Import the relevant Python modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [2]:
#Import the relevant data
# Handle on the ZooPlanktonPerryData workbook, kept so sheets can be parsed
# from it by name.
xl_file = pd.ExcelFile('Data/ZooPlanktonPerryData.xlsx')
# The sheet at position 5 of that same workbook.
Salmon = pd.read_excel("Data/ZooPlanktonPerryData.xlsx", sheet_name=5)

# Delimited text sources; the sea-level file is semicolon-separated.
oldSSTs = pd.read_csv("Data/SSTs")
Salinity = pd.read_csv("Data/Entrance_Island_-_Average_Monthly_Sea_Surface_Salinities_1936-2023.csv")
SeaLevel = pd.read_csv("Data/SeaLevel_Cherry_annualMean.txt", sep=";")

# First sheet of the recreational catch workbook.
Catches = pd.read_excel("Data/Copy of recreational reported fishery catch, 1953-2012.xlsx")


In [3]:
# Give the salmon table the same "Year" key name the other tables use.
Salmon = Salmon.rename(columns={"Ocean Entry Year":"Year"})

#For each year I want a list with the SSTs for the months in ascending order.
# Each row is [year, sst, sst, ...], with the SST values taken in the order
# they appear in oldSSTs for that year.
# NOTE(review): no sort is applied here, so this presumably relies on oldSSTs
# already being in month order within each year -- TODO confirm.
SSTsList = [
    [yr, *oldSSTs.loc[oldSSTs["Year"] == yr, "SST"].tolist()]
    for yr in oldSSTs["Year"].unique()
]

sst_column_names = [
    "Year",
    "SST Jan.", "SST Feb.", "SST Mar.", "SST Apr.",
    "SST May", "SST Jun.", "SST Jul.", "SST Aug",
    "SST Sep.", "SST Oct.", "SST Nov.", "SST Dec.",
]
SSTs = pd.DataFrame(SSTsList, columns=sst_column_names)

#Extract the zoo plankton anomalies from the excel sheet
# Parse every sheet of the workbook into a {sheet name: DataFrame} mapping.
Original_Salmon_Sheets = {sheet_name: xl_file.parse(sheet_name)
          for sheet_name in xl_file.sheet_names}

# Skip the first six rows of the anomalies sheet.
anom = Original_Salmon_Sheets['3. Zooplankton anomalies']
anom = anom[6:]

# Keep the columns from the fourth one onward. The explicit .copy() makes
# zoos an independent frame, so assigning "Year" below no longer writes to a
# slice of anom (which is what raised pandas' SettingWithCopyWarning).
zoos = anom[anom.columns[3:]].copy()
zoos["Year"] = anom["Year"].astype(int)

#Grab the relevant years from Salinity
# Skip the first 55 rows, relabel the long title column as "Year", then keep
# that column as integers.
# NOTE(review): after this statement Salinity is a single "Year" Series --
# every other column of the file is discarded here, so no salinity values are
# carried forward under this name. TODO confirm whether that is intended.
Salinity = (
    Salinity[55:]
    .rename(columns={"ENTRANCE ISLAND LIGHTSTATION: AVERAGE MONTHLY SEA SURFACE SALINITIES (PSU)":"Year"})
    ["Year"]
    .astype(int)
)

#Next is Catches
# Restrict to 1990 onward, then split out the two species of interest.
ModernCatches = Catches[Catches["YEAR"] >= 1990]
species = ModernCatches["SPECIES_DESC"]
CohoCatches = ModernCatches[species == "COHO SALMON"]
ChinookCatches = ModernCatches[species == "CHINOOK SALMON"]

# Years are taken from the coho rows, in order of first appearance; both
# species are totalled over that same list of years.
yearList = list(CohoCatches["YEAR"].unique())
cohoAnnualCatchList = [
    CohoCatches.loc[CohoCatches["YEAR"] == yr, "PIECES"].sum()
    for yr in yearList
]
chinookAnnualCatchList = [
    ChinookCatches.loc[ChinookCatches["YEAR"] == yr, "PIECES"].sum()
    for yr in yearList
]

# One row per year holding the summed PIECES for each species.
Catches = pd.DataFrame({
    "Coho": cohoAnnualCatchList,
    "Chinook": chinookAnnualCatchList,
    "Year": yearList,
})
#Sealevel
# Drop the 'Y' and '000' columns, give the remaining two columns readable
# names, then skip the first four rows.
# NOTE(review): the labels ' 1985' and '  6945' look like a first data row
# that was read as the header -- TODO confirm against the raw file.
# errors="raise" makes the rename fail loudly if those labels are absent.
SeaLevel = (
    SeaLevel
    .drop(columns=['Y', '000'])
    .rename(columns={' 1985': "Year",'  6945': "Sea Level"}, errors="raise")
)
SeaLevel = SeaLevel[4:]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zoos["Year"] = anom["Year"].astype(int)


In [4]:
#Combine the relevant data into a single data frame
# Outer-join each source onto the running result by "Year", so a year that
# appears in any source is kept.
newFrame1 = Salmon.merge(SSTs, on="Year", how="outer")
newFrame2 = newFrame1.merge(zoos, on="Year", how="outer")
newFrame3 = newFrame2.merge(Salinity, on="Year", how="outer")
newFrame4 = newFrame3.merge(Catches, on="Year", how="outer")
AllData = newFrame4.merge(SeaLevel, on="Year", how="outer")

# Missing values in the three Chinook columns are replaced with 0.
chinook_columns = ["Cowichan Chinook", "Harrison Chinook", "Puntledge Chinook"]
AllData[chinook_columns] = AllData[chinook_columns].fillna(0)

#Drop the years we have no viability data for
# Rows are selected by index labels 28 through 33 of the merged frame.
# NOTE(review): these labels depend on the row order the merges produce --
# re-check them whenever an input file changes.
AllData = AllData.drop(index=list(range(28, 34)))

# Only rows with no missing value in any column.
AllDataNoNull = AllData.dropna()

In [5]:
# Display the rows of AllData that have no missing values (AllData.dropna()).
AllDataNoNull

Unnamed: 0,Year,Cowichan Chinook,Harrison Chinook,Puntledge Chinook,Big Qualicum Coho,SST Jan.,SST Feb.,SST Mar.,SST Apr.,SST May,...,Natantia,NonCalCops,Ostracoda,PolychaetPelagic,Pteropods,Repantia,Siphonophorae,Coho,Chinook,Sea Level
6,1996,0.008811,0.010038,0.0,0.014246,10.860983,12.025331,11.728321,11.119489,9.9483,...,0.102015,0.24233,0.297214,0.255998,0.019095,-0.604888,0.391865,217199.0,114942.0,6988.0
7,1997,0.008294,0.023246,0.004286,0.004293,11.910243,12.317101,12.166986,11.192175,10.135995,...,0.163304,0.248062,0.278687,-0.063418,-0.024925,0.211126,0.247333,213163.0,154268.0,7081.0
8,1998,0.010999,0.008297,0.017847,0.013055,12.17785,12.481543,12.182488,11.857747,10.362404,...,0.159563,0.238723,0.374939,0.089059,0.701012,0.770283,0.089499,16335.0,160780.0,7065.0
9,1999,0.012922,0.00915,0.010559,0.012809,10.745245,11.234034,11.984316,11.041389,9.501446,...,-0.270029,0.370312,0.158727,-0.22534,1.165355,-1.34075,0.33314,47758.0,185093.0,7008.0
10,2000,0.009417,0.020958,0.004856,0.01229,11.760833,12.119185,11.596674,10.351773,9.346248,...,0.320934,0.229351,0.202458,0.399923,0.690691,0.541262,0.352759,36707.0,138164.0,6982.0
11,2001,0.00467,0.01435,0.006632,0.01049,11.766296,12.703541,12.857022,12.055732,11.124836,...,0.416704,-0.346593,0.014837,0.771314,0.049555,-0.267489,-0.139811,213302.0,218707.0,6965.0
12,2002,0.005766,0.024074,0.00532,0.007603,11.647489,12.359512,12.708898,12.127488,11.316425,...,-0.092913,-0.302982,0.023336,-0.015653,-1.322044,0.707707,-1.126729,183885.0,350788.0,6990.0
13,2003,0.003321,0.008604,0.009984,0.014171,12.302134,12.725002,12.487629,11.188088,10.113362,...,-1.347999,-0.119151,-0.015787,0.311688,-0.056666,-0.777562,-0.536512,345421.0,494653.0,7034.0
14,2004,0.00463,0.013713,0.009279,0.001227,11.854964,12.150301,11.676899,10.889526,9.744645,...,-0.172336,0.027338,0.018535,0.339549,-0.675904,-0.018153,0.276182,299931.0,606127.0,7024.0
15,2005,0.0,0.0,0.003218,0.000853,11.416472,12.057631,12.157208,11.479593,10.933879,...,-0.05762,-0.239464,0.066092,0.160682,0.07272,-1.031958,-0.17875,283672.0,541544.0,7034.0


In [19]:
#Fit our lasso models
# Imported once here instead of on every pass through the species loop.
from sklearn.linear_model import Lasso

# Candidate regularisation strengths, in ascending order. They do not depend
# on the species, so they are defined once outside the loop.
alphas = [0.00001,0.0001,0.001, .0025, .005,0.01, .05 ,0.1,1,10,100,1000]

# Largest number of non-zero coefficients a fit may keep and still be chosen.
MAX_FEATURES = 5

# Maps each species column name to the predictors kept by its chosen fit.
features = {}

for fishName in ["Cowichan Chinook", "Harrison Chinook", "Puntledge Chinook", "Big Qualicum Coho"]:
    # The target is this species' column; every other column of
    # AllDataNoNull is a candidate predictor.
    y = AllDataNoNull[fishName]
    x = AllDataNoNull.loc[:, AllDataNoNull.columns != fishName]

    # Number of predictors. Taken from x so the coefficient table always
    # matches the data rather than relying on a hard-coded column count.
    n = x.shape[1]

    # One row of coefficient estimates per alpha value.
    lasso_coefs = np.empty((len(alphas), n))

    for i, alpha in enumerate(alphas):
        # Standardise the predictors, then fit the lasso at this alpha.
        lasso_pipe = Pipeline([('scale', StandardScaler()),
                               ('lasso', Lasso(alpha=alpha, max_iter=5000000))])
        lasso_pipe.fit(x, y)

        # record the coefficients
        lasso_coefs[i, :] = lasso_pipe['lasso'].coef_

    # Coefficients rounded to 8 decimals: rows are alphas, columns predictors.
    res = pd.DataFrame(np.round(lasso_coefs, 8),
                       columns=x.columns,
                       index=["alpha=" + str(a) for a in alphas])

    # Walk the alphas from smallest to largest and take the first fit that
    # keeps at most MAX_FEATURES non-zero coefficients. featureList is reset
    # for every species so a result from a previous species can never be
    # reused by accident.
    featureList = None
    for j in range(len(alphas)):
        kept = res.iloc[j][res.iloc[j] != 0]
        if len(kept) <= MAX_FEATURES:
            featureList = list(kept.index)
            break

    if featureList is None:
        raise ValueError(
            "No alpha left at most " + str(MAX_FEATURES)
            + " non-zero coefficients for " + fishName
        )

    features[fishName] = featureList

In [21]:
#This is a dictionary of the most important features for each species.
# Keys are the species column names; each value lists the predictor columns
# whose rounded lasso coefficient is non-zero at the first alpha (in ascending
# order) that leaves at most five non-zero coefficients.
features

{'Cowichan Chinook': ['Big Qualicum Coho', 'TotBiom', 'CalCops.smal'],
 'Harrison Chinook': ['SST Feb.',
  'SST Mar.',
  'SST Nov.',
  'Ctenophora',
  'Fish'],
 'Puntledge Chinook': ['SST Oct.',
  'AmphiGam',
  'BenthicLarv',
  'Larvacea',
  'Sea Level'],
 'Big Qualicum Coho': ['Cowichan Chinook',
  'TotBiom',
  'CalCops.larg',
  'Natantia']}