In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from scipy import special
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

sns.set_style("whitegrid")

# Years

In [2]:
# First and last calendar year of the study window.
# The data-loading cells below compare years against these bounds inclusively (>= / <=).
start_date = 1990
end_date = 2017

# Target Data Reading 

## Salmon viability

In [3]:
# Import the salmon data: the sheet at position 5 of the zooplankton workbook.
Salmon_file = pd.read_excel("Data/ZooPlanktonPerryData.xlsx", sheet_name=5)

# Keep ocean-entry years up to and including end_date (the original note says
# year 2018 has only NaN). Using end_date instead of a literal 2017 keeps this
# cell in step with the study window used by every other loading cell.
Salmon = Salmon_file[Salmon_file["Ocean Entry Year"] <= end_date]
# Rename so the frame can be merged with the other tables on 'Year'.
Salmon = Salmon.rename(columns={"Ocean Entry Year": "Year"})

In [4]:
# Every column of Salmon other than the year column is treated as one salmon type.
year_column = 'Year'
salmonTypes_List = Salmon.columns[Salmon.columns != year_column].tolist()

print("Salmon types are:", salmonTypes_List)

Salmon types are: ['Cowichan Chinook', 'Harrison Chinook', 'Puntledge Chinook', 'Big Qualicum Coho']


# Data Reading

## Abiotic Factors

### SST

In [5]:
oldSSTs = pd.read_csv("Data/SSTs")

# Reshape from one row per (year, month) to one row per year:
# [year, SST of each month in the order the rows appear in the file].
sst_column_names = ["Year", "SST Jan.", "SST Feb.", "SST Mar.", "SST Apr.", "SST May", "SST Jun.",
                    "SST Jul.", "SST Aug", "SST Sep.", "SST Oct.", "SST Nov.", "SST Dec."]
sst_rows = [
    [year, *oldSSTs.loc[oldSSTs["Year"] == year, "SST"].tolist()]
    for year in oldSSTs["Year"].unique()
]

# NOTE(review): the first 28 rows are kept by position, which presumably corresponds
# to start_date..end_date only if the file begins at start_date - confirm.
SSTs = pd.DataFrame(sst_rows, columns=sst_column_names).iloc[:28]

### Sea Level

In [6]:
## Read the annual mean sea levels of four ports.
# Each file is ';'-delimited with no header row: column 0 is the year, column 1 the
# sea level measurement; columns 2 and 3 are discarded.
# Non-existent data is entered as -99999, so only non-negative measurements are kept.


def _load_annual_sea_level(file_path, column_name):
    """Load one port's annual sea levels restricted to the study window.

    Parameters
    ----------
    file_path : str
        Path of the ';'-delimited, header-less text file.
    column_name : str
        Name given to the sea level measurement column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year' and `column_name`, with rows limited to
        start_date..end_date (inclusive) and to non-negative measurements.
    """
    raw = pd.read_csv(file_path, header=None, delimiter=';')
    keep = (raw[0] >= start_date) & (raw[0] <= end_date) & (raw[1] >= 0)
    return raw[keep].drop(columns=[2, 3]).rename(columns={0: 'Year', 1: column_name})


# missing year 1997
seaLevelPointA_clean = _load_annual_sea_level("Data/SeaLevel_Point_Atkinson193.txt",
                                              'Sea Level Point Atkinson')
# no missing year
seaLevelPortA_clean = _load_annual_sea_level("Data/SeaLevel_Port_Angeles2127.txt",
                                             'Sea Level Port Angeles')
# missing year 1995 & 1996
seaLevelCampbellR_clean = _load_annual_sea_level("Data/SeaLevel_Campbell_River1323.txt",
                                                 'Sea Level Campbell River')
# missing year 1994
seaLevelCherryP_clean = _load_annual_sea_level("Data/SeaLevel_Cherry_annualMean.txt",
                                               'Sea Level Cherry')


# Merge the four ports on 'Year'; the outer join keeps years missing from a port as NaN.
merged_df = pd.merge(seaLevelPortA_clean, seaLevelPointA_clean, on='Year', how='outer')
merged_df = pd.merge(merged_df, seaLevelCampbellR_clean, on='Year', how='outer')
seaLevels = pd.merge(merged_df, seaLevelCherryP_clean, on='Year', how='outer')

# Average over the ports available in each year (missing values are ignored).
port_columns = [column for column in seaLevels.columns if column != 'Year']
seaLevels['av_SeaLevels'] = seaLevels[port_columns].mean(axis=1, skipna=True)

# Fill each port's missing years with that year's average of the other ports.
# This is done by label instead of hard-coded iloc[row, col] positions, so it
# stays correct if the row order or the set of missing years changes.
for column in port_columns:
    seaLevels[column] = seaLevels[column].fillna(seaLevels['av_SeaLevels'])

### Salinity

In [7]:
Salinity_file = pd.read_excel('Data/Entrance_island_salinity.xlsx')

# Sea salinity is provided monthly; the yearly value is the mean over every
# column except 'YEAR'. Non-existent data is marked by 999.99 and turned into NaN
# first so that it does not enter the mean.
df_salinity = Salinity_file.replace(999.99, np.nan)
salinity_measurements = df_salinity.drop(columns='YEAR')
df_sea_salinity = pd.DataFrame({
    'Year': df_salinity['YEAR'],
    'Avg Sea Salinity': salinity_measurements.mean(axis=1),
})

# Restrict to the study window (both bounds inclusive) and renumber the rows.
inside_window = df_sea_salinity['Year'].between(start_date, end_date)
df_sea_salinity = df_sea_salinity[inside_window].reset_index(drop=True)

### NPGO climate index  

In [8]:
# Load the monthly values of the NPGO climatological index.
NPGO_monthly = pd.read_csv('Data/NPGOMonthlyAvg copy.txt')

# Yearly NPGO = mean of each year's rows; the averaged 'Month' column is meaningless and dropped.
NPGO_yearly = (
    NPGO_monthly
    .groupby('Year')
    .mean()
    .drop(columns=['Month'])
    .reset_index()
)
# Keep the study window (inclusive) and store the year as an integer.
NPGO_yearly = NPGO_yearly[NPGO_yearly['Year'].between(start_date, end_date)]
NPGO_yearly = NPGO_yearly.assign(Year=NPGO_yearly['Year'].astype(int))
NPGO = NPGO_yearly.reset_index(drop=True)

### PDO

In [9]:
# Read the Pacific Decadal Oscillation (PDO) data from the 'pdo' sheet.
Pdo_file = pd.read_excel('Data/pdo.xlsx', sheet_name='pdo')

# Study window (inclusive), then one averaged row per year.
pdo_in_window = Pdo_file['year'].between(start_date, end_date)
df_pdo = Pdo_file[pdo_in_window]
df_pdo_clean = df_pdo.rename(columns={'year': 'Year'})
# After grouping, 'Year' is the index of df_pdo_avg (not a column);
# the averaged 'date' and 'month' columns are dropped.
df_pdo_avg = df_pdo_clean.groupby(['Year']).mean().drop(columns=['date', 'month'])



### Flow Rate

In [10]:
# Read the Fraser river flow data from the 'Flow rate' sheet.
Flow_rate_file = pd.read_excel('Data/Fraser_flow_rate.xlsx', sheet_name='Flow rate')

# Keep rows inside the study window whose PARAM equals 1.
# NOTE(review): PARAM == 1 presumably selects the discharge parameter - confirm against the data source.
flow_rows = Flow_rate_file['Year'].between(start_date, end_date) & (Flow_rate_file['PARAM'] == 1)
df_flow_rate = (
    Flow_rate_file[flow_rows]
    .drop(columns=[' ID', 'PARAM', 'MM--DD', 'SYM', 'MM--DD.1', 'SYM.1'])
    # 'Avg flow rate' is the midpoint of the yearly maximum and minimum.
    .assign(**{'Avg flow rate': lambda frame: 0.5 * (frame['MAX'] + frame['MIN'])})
    .rename(columns={'MAX': 'Max flow rate', 'MIN': 'Min flow rate'})
    .reset_index(drop=True)
)

In [11]:
# Merge the environmental factors on 'Year', keeping every year seen in any table.
# Order: salinity, PDO, Fraser river flow rate, NPGO, sea levels, SSTs.
env_factors = df_sea_salinity
for factor_table in (df_pdo_avg, df_flow_rate, NPGO, seaLevels, SSTs):
    env_factors = pd.merge(env_factors, factor_table, on='Year', how='outer')

## Biotic Factors

### Predator Populations 


In [12]:
# (4) Killer whale data
Whale_file = pd.read_excel('Data/killerwhales_1970to2020.xlsx',
                           sheet_name='killerwhales_1970to2020')

# (5) Harbour seal population data
Seal_file = pd.read_excel('Data/harbourseals_1970to2020.xlsx',
                          sheet_name='harbourseals_1970to2020')

# Restrict both tables to the study window (inclusive) and drop the unused columns.
df_whale = (
    Whale_file[Whale_file['Year'].between(start_date, end_date)]
    .drop(columns=['SRKW', 'srkwSource', 'nrkwSource'])
    .reset_index(drop=True)
)
df_seal = (
    Seal_file[Seal_file['year'].between(start_date, end_date)]
    .drop(columns=['nonSogNt', 'bcNt'])
    .rename(columns={'year': 'Year'})
    .reset_index(drop=True)
)

# Combine the two predator tables on 'Year' and give the count columns readable names.
df_pred_factors = (
    df_seal
    .merge(df_whale, on='Year', how='outer')
    .rename(columns={'sogNt': 'Harbour seal', 'NRKW': 'Killer whale'})
)


### Zooplankton

In [13]:
# Read every sheet of the ZooPlanktonPerryData workbook into a dict keyed by sheet name.
xl_file = pd.ExcelFile('Data/ZooPlanktonPerryData.xlsx')

Original_Salmon_Sheets = {}
for sheet_name in xl_file.sheet_names:
    Original_Salmon_Sheets[sheet_name] = xl_file.parse(sheet_name)

# Biomass definitions: no data here, kept in case info on column names is needed.
selected_sheet = "2. Biomass data definitions"
BiomassDefs = Original_Salmon_Sheets[selected_sheet]

# Zooplankton anomalies: keep sheet rows 6..27 by position. Per the original note this
# removes the years up to 1995 (all NaN) and the years from 2018 on; being positional,
# it relies on the sheet's row order.
anom = Original_Salmon_Sheets['3. Zooplankton anomalies'].iloc[6:28]
# Drop the columns at positions 1 and 2 and renumber the rows.
zoos = anom.drop(columns=anom.columns[1:3]).reset_index(drop=True)

In [14]:
# Obtain zooplankton biomasses:
##  take the years less than or equal to end_date
##  drop the columns that do not give biomass of zooplankton
selected_sheet = "1. Zooplankton Biomass data"
df = Original_Salmon_Sheets[selected_sheet]
df = df[df["yr"] <= end_date]
plankBiomass_data = df.drop(columns=['key', 'survey', 'region', 'event', 'net', 'station', 'lon', 'lat',
                                     'mon', 'day', 'time', 'twilight', 'net.type', 'diam.m', 'mesh.um',
                                     'startz.m', 'endz.m', 'botz.m', 'volfilt.m3'])

# Annual mean of every remaining column, one row per year.
# A single grouped mean replaces the previous per-column transform('mean') loop
# followed by drop_duplicates: same values, but one pass over the data and no
# column-by-column growth of the frame. sort=False keeps the years in order of
# first appearance, which is the order drop_duplicates used to keep.
yearly_means = plankBiomass_data.groupby('yr', sort=False).mean()
yearly_means.columns = ['Av_' + column for column in yearly_means.columns]

# Put 'yr' back where it sat among the columns, then rename for convenience.
averaged_column_order = [column if column == 'yr' else 'Av_' + column
                         for column in plankBiomass_data.columns]
plankBiomassMean = (
    yearly_means
    .reset_index()[averaged_column_order]
    .rename(columns={'yr': 'Year', 'Av_Total.Biomass': 'Av_totalBiomass'})
)




In [15]:
# Zooplankton anomalies joined with the yearly mean biomasses; years present in either table are kept.
plankton_anm_mass = zoos.merge(plankBiomassMean, on='Year', how='outer')

In [16]:
# All biotic factors: predator populations joined with the zooplankton table on 'Year'.
biotic_factors = df_pred_factors.merge(plankton_anm_mass, on='Year', how='outer')


## Anthropogenic Factors

### Catches

In [17]:
# Import the recreational reported fishery catch records.
catches_raw = pd.read_excel("Data/Copy of recreational reported fishery catch, 1953-2012.xlsx")

# Records from start_date onwards, split by species.
ModernCatches = catches_raw[catches_raw["YEAR"] >= start_date]
CohoCatches = ModernCatches[ModernCatches["SPECIES_DESC"] == "COHO SALMON"]
ChinookCatches = ModernCatches[ModernCatches["SPECIES_DESC"] == "CHINOOK SALMON"]


def _pieces_per_year(species_catches, years):
    """Total 'PIECES' of one species for each year in `years` (0 for a year with no rows)."""
    return [species_catches.loc[species_catches["YEAR"] == year, "PIECES"].sum() for year in years]


# The list of years is taken from the coho records; chinook totals are looked up for those same years.
yearList = list(CohoCatches["YEAR"].unique())
Catches = pd.DataFrame({
    "Coho Catch": _pieces_per_year(CohoCatches, yearList),
    "Chinook Catch": _pieces_per_year(ChinookCatches, yearList),
    "Year": yearList,
})



### BC and WA regional populations and population trends

In [18]:
# Import the population data for BC Regional Districts.
# The unnamed first column holds the year; the file contains a duplicate row at 2011.
BC_pop = (
    pd.read_csv('Data/BCPop1990.csv')
    .rename(columns={'Unnamed: 0': 'Year'})
    .drop_duplicates()
)

# Pre-2001 BC population numbers are only given every five years.
# Build NaN placeholder rows for the other years of 1990-2000 so they can be interpolated:
# one row per missing year, the year followed by 29 NaN population values.
missing_yrs = np.setdiff1d(np.arange(1990, 2001), [1991, 1996])
missing_yrs_vals = np.full((29, 9), np.nan)
BC_missing = pd.DataFrame(np.vstack((missing_yrs, missing_yrs_vals)).T,
                          columns=BC_pop.columns)

# Sort by year, interpolate linearly where possible, repeat the closest later value
# where interpolation leaves a gap, and renumber the rows left over from concatenating.
BC_pop = (
    pd.concat([BC_missing, BC_pop])
    .sort_values(by='Year')
    .interpolate()
    .bfill()
    .reset_index(drop=True)
)


# Import the population data for WA Counties.
WA_pop = pd.read_csv('Data/WAPop1990.csv').rename(columns={'Unnamed: 0': 'Year'})

In [19]:
def _group_average(population, column_positions, label):
    """Row-wise mean of the columns at `column_positions`, returned as a one-column frame named `label`."""
    return population.iloc[:, column_positions].mean(axis=1).to_frame(label)


# BC groups 1-5: each average is taken over a fixed set of column positions of BC_pop.
BC_1_avg = _group_average(BC_pop, [3, 7, 9, 10, 12, 14, 18, 20, 25, 28, 29], 'BC Gp1 Avg')
BC_2_avg = _group_average(BC_pop, [2, 5, 15, 17, 19], 'BC Gp2 Avg')
BC_3_avg = _group_average(BC_pop, [6, 8, 11, 13, 16, 22], 'BC Gp3 Avg')
BC_4_avg = _group_average(BC_pop, [1, 4, 24, 27], 'BC Gp4 Avg')
BC_5_avg = _group_average(BC_pop, [21, 23, 26], 'BC Gp5 Avg')

# WA groups 1-4: same construction on the column positions of WA_pop.
WA_1_avg = _group_average(
    WA_pop,
    [1, 3, 5, 6, 8, 9, 13, 16, 19, 20, 21, 23, 27, 29, 31, 32, 34, 36, 37, 39],
    'WA Gp1 Avg',
)
WA_2_avg = _group_average(WA_pop, [2, 4, 18, 22, 24, 25, 26, 28, 30, 33, 35], 'WA Gp2 Avg')
WA_3_avg = _group_average(WA_pop, [11, 14, 15, 17, 38], 'WA Gp3 Avg')
WA_4_avg = _group_average(WA_pop, [7, 10, 12], 'WA Gp4 Avg')

##### The grouped population variables

It is better to group the regional populations into larger trend groups; these will be the actual population variables to use.


In [20]:
# Side-by-side tables of the group averages.
bc_group_averages = [BC_1_avg, BC_2_avg, BC_3_avg, BC_4_avg, BC_5_avg]
wa_group_averages = [WA_1_avg, WA_2_avg, WA_3_avg, WA_4_avg]
BC_df = pd.concat(bc_group_averages, axis=1)
WA_df = pd.concat(wa_group_averages, axis=1)

# The year column comes from WA_pop; the three pieces are aligned on their row index.
# NOTE(review): this assumes BC_pop and WA_pop rows line up year for year - confirm.
# Only the first 28 rows are kept (by position).
populations = pd.concat([WA_pop['Year'], BC_df, WA_df], axis=1).iloc[:28]

### Port of Vancouver tonnage cargo

In [21]:
# Total tonnage of cargo through the Port of Vancouver.
# Recent years come from the Port Metro Vancouver Statistics Overviews at
# https://www.portvancouver.com/about-us/statistics/
# Years 1994-1998 come from "The Institutional Position of Seaports: An International
# Comparison" by Henrik Stevens, Chapter 7.
# NOTE(review): the original comment said 2008-2023, but the 15 values below are
# paired with the years 2008..2022 - confirm which years they belong to.
tonnage_1994_1998 = [67600000, 71500000, 72000000, 73500000, 71900000]
tonnage_from_2008 = [114561990, 101887824, 118378885,
                     122499631, 123876885, 135009878,
                     139638157, 138082585, 135538055,
                     142067550, 147090934, 144225630,
                     145450722, 146473626, 141416326]
PortofVan = pd.DataFrame({
    'Year': list(range(1994, 1999)) + list(range(2008, 2023)),
    'Tonnage': tonnage_1994_1998 + tonnage_from_2008,
})

# Placeholder rows (NaN tonnage) for the 13 years without a reported value.
unreported_years = [1990, 1991, 1992, 1993] + list(range(1999, 2008))
missing_yrs_port = pd.DataFrame({
    'Year': unreported_years,
    'Tonnage': [np.nan] * len(unreported_years),
})

# Sort by year and fill the gaps by linear interpolation in both directions,
# then keep the first 28 rows (1990..2017).
Port_of_Van = (
    pd.concat([PortofVan, missing_yrs_port])
    .sort_values(by=['Year'])
    .interpolate(method='linear', fill_value='extrapolate', limit_direction='both')
    .reset_index(drop=True)
    .iloc[:28]
)

In [22]:
# Anthropogenic factors: catches, grouped populations and port tonnage joined on 'Year'.
human_effects = (
    Catches
    .merge(populations, on='Year', how='outer')
    .merge(Port_of_Van, on='Year', how='outer')
)

# Combine Obtained Data

In [23]:
# Combine the environmental, biotic and human factors into a single data frame,
# then attach them to the salmon targets. Every join is an outer join on 'Year'.
df_merge = env_factors.merge(biotic_factors, on="Year", how="outer")
all_factors = df_merge.merge(human_effects, on="Year", how="outer")

all_data = Salmon.merge(all_factors, on="Year", how="outer")


In [24]:
# Modelling table: the salmon targets and all features, without the year column.
salmons_features = all_data.drop('Year', axis=1)

In [25]:
# Show the salmon type names; their positions in this list are the type numbers used below.
salmonTypes_List

['Cowichan Chinook',
 'Harrison Chinook',
 'Puntledge Chinook',
 'Big Qualicum Coho']

In [26]:
def _single_type_frame(type_number):
    """Copy of all_data keeping only the salmon type at position `type_number` as target column."""
    other_types = [salmonTypes_List[i] for i in range(4) if i != type_number]
    return all_data.drop(columns=other_types)


# One frame per salmon type: the 'Year' column, that type's column and every feature.
alldata_CC = _single_type_frame(0)   # Cowichan Chinook
alldata_HC = _single_type_frame(1)   # Harrison Chinook
alldata_PC = _single_type_frame(2)   # Puntledge Chinook
alldata_BQC = _single_type_frame(3)  # Big Qualicum Coho

In [27]:
# Print the type number that belongs to each of the four salmon types.
for type_number, salmon_name in enumerate(salmonTypes_List[:4]):
    print("For", salmon_name, 'type_number is ', type_number, '\n')

For Cowichan Chinook type_number is  0 

For Harrison Chinook type_number is  1 

For Puntledge Chinook type_number is  2 

For Big Qualicum Coho type_number is  3 



In [28]:
# Only the rows in which no column is NaN.
salmons_features_noNaN = salmons_features.dropna(axis=0, how='any')

# Lasso functions

In [29]:
#NAME: scale_and_get_LassoCoeffs

# Default grid of Lasso penalties: 100001 log-spaced values from 1e-5 to 1e-1.
prechosen_alphas = np.logspace(-5, -1, num=100001)
# Earlier hand-picked grid:
#[0.000001,0.00001,0.0001,0.0005,0.00075,0.001,0.0015,0.005,0.01,0.1,1]

def scale_and_get_LassoCoeffs (df, target_column, alphaList=prechosen_alphas):
    """Fit one Lasso regression per alpha and collect the coefficients.

    Rows containing any NaN are dropped, the feature columns are standardised
    with StandardScaler, and a Lasso model (max_iter=1000000) is fitted to the
    unscaled target for every value in `alphaList`.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column.
    target_column : str
        Name of the target column in `df`.
    alphaList : sequence of float, optional
        Lasso penalties to try (default: `prechosen_alphas`).

    Returns
    -------
    pandas.DataFrame
        One row per alpha, labelled "alpha = <value>", and one column per
        feature, holding the fitted Lasso coefficients.
    """
    complete_rows = df.dropna().reset_index(drop=True)
    feature_frame = complete_rows.drop(columns=[target_column])
    response = complete_rows[target_column]
    standardized = StandardScaler().fit_transform(feature_frame)

    # Pre-allocated coefficient table; each row is filled in place by one fit.
    coef_table = np.zeros((len(alphaList), feature_frame.shape[1]))
    for coef_row, alpha in zip(coef_table, alphaList):
        coef_row[:] = Lasso(alpha=alpha, max_iter=1000000).fit(standardized, response).coef_

    row_labels = ["alpha = " + str(alpha) for alpha in alphaList]
    return pd.DataFrame(coef_table, index=row_labels, columns=list(feature_frame.columns))

In [30]:
#NAME: get_FewFeautures

def get_FewFeautures (lassoCoefs, few=5):
    """Return the features kept by the first sufficiently sparse Lasso fit.

    The rows of `lassoCoefs` are scanned in order; the first row with at most
    `few` non-zero coefficients determines the result.

    Parameters
    ----------
    lassoCoefs : pandas.DataFrame
        One row per alpha and one column per feature, as returned by
        scale_and_get_LassoCoeffs.
    few : int, optional
        Maximum number of non-zero coefficients allowed (default 5).

    Returns
    -------
    list
        Names of the features with a non-zero coefficient in the selected row.
        An empty list is returned when no row is sparse enough (or the frame
        has no rows); previously this case raised UnboundLocalError.
    """
    for _, coef_row in lassoCoefs.iterrows():
        nonzero_coefs = coef_row[coef_row != 0]
        if len(nonzero_coefs) <= few:
            return list(nonzero_coefs.index)

    # No row met the sparsity limit.
    return []

In [31]:
#NAME: getScore_df

def getScore_df(lassoCoefs, sortby=False):
    """Summarise how strongly each feature is used across all alphas.

    Parameters
    ----------
    lassoCoefs : pandas.DataFrame
        One row per alpha and one column per feature.
    sortby : bool, optional
        When True, the rows are sorted by 'Sum' in descending order;
        by default no sorting is done.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with column 'Count' (number of alphas for
        which the coefficient is non-zero) and column 'Sum' (sum of the
        absolute values of the coefficients).
    """
    nonzero_per_feature = (lassoCoefs != 0).sum(axis=0)
    abs_total_per_feature = lassoCoefs.abs().sum(axis=0)

    score_df = pd.DataFrame({'Count': nonzero_per_feature, 'Sum': abs_total_per_feature})
    return score_df.sort_values(by='Sum', ascending=False) if sortby == True else score_df


In [32]:
#NAME: applyLasso_getScore_forAll

def applyLasso_getScore_forAll(df_all, alphaList=prechosen_alphas):
    """Compute the Lasso scores of every feature for each salmon type.

    For each name in `salmonTypes_List` the columns of the other salmon types
    are dropped, the Lasso coefficients are computed with
    scale_and_get_LassoCoeffs, and they are summarised with getScore_df.

    Parameters
    ----------
    df_all : pandas.DataFrame
        The target columns of all salmon types together with the features.
    alphaList : sequence of float, optional
        Lasso penalties to try (default: `prechosen_alphas`).

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with two columns per salmon type:
        '<type>_Count' (number of non-zero coefficients) and '<type>_Sum'
        (sum of absolute coefficient values). Empty when `salmonTypes_List`
        is empty.
    """
    scoreAll_df = None
    # Iterate over the type names themselves, so the "other types" always match
    # salmonTypes_List (the previous inner loop was hard-coded to four types).
    for type_name in salmonTypes_List:
        other_types = [name for name in salmonTypes_List if name != type_name]
        df_oneType = df_all.drop(columns=other_types)
        df_lassoCoeffs = scale_and_get_LassoCoeffs(df_oneType,
                                                   target_column=type_name,
                                                   alphaList=alphaList)
        typed_scores = getScore_df(df_lassoCoeffs, sortby=False).add_prefix(type_name + '_')
        if scoreAll_df is None:
            scoreAll_df = typed_scores
        else:
            scoreAll_df = scoreAll_df.join(typed_scores, how='outer', sort=False)

    return scoreAll_df if scoreAll_df is not None else pd.DataFrame()
        

In [33]:
def getFewfeat_forAll (df_all, few=5, alphaList=prechosen_alphas):
    """Select a small feature set and compute the Lasso scores for each salmon type.

    For each name in `salmonTypes_List` the columns of the other salmon types
    are dropped and the Lasso coefficients are computed once; they are then
    used both to pick the features (get_FewFeautures) and to build the score
    table (getScore_df). The selected features are printed per type.

    Parameters
    ----------
    df_all : pandas.DataFrame
        The target columns of all salmon types together with the features.
    few : int, optional
        Maximum number of features to select per type (default 5).
    alphaList : sequence of float, optional
        Lasso penalties to try (default: `prechosen_alphas`).

    Returns
    -------
    tuple
        (scoreAll_df, featureAll): the score table with '<type>_Count' and
        '<type>_Sum' columns indexed by feature name, and a dict mapping each
        salmon type to its list of selected features.
    """
    featureAll = {}
    scoreAll_df = None
    # Iterate over the type names instead of a hard-coded range(0, 4).
    for type_name in salmonTypes_List:
        other_types = [name for name in salmonTypes_List if name != type_name]
        df_oneType = df_all.drop(columns=other_types)
        df_lassoCoeffs = scale_and_get_LassoCoeffs(df_oneType,
                                                   target_column=type_name,
                                                   alphaList=alphaList)
        featureList = get_FewFeautures(df_lassoCoeffs, few=few)
        print('For', type_name, featureList)
        featureAll[type_name] = featureList

        typed_scores = getScore_df(df_lassoCoeffs, sortby=False).add_prefix(type_name + '_')
        if scoreAll_df is None:
            scoreAll_df = typed_scores
        else:
            scoreAll_df = scoreAll_df.join(typed_scores, how='outer', sort=False)

    if scoreAll_df is None:
        scoreAll_df = pd.DataFrame()
    return scoreAll_df , featureAll

# Apply

In [34]:
# Run the Lasso sweep for every salmon type with the defaults (few=5 and the
# 100001-value prechosen_alphas grid); the selected features are printed per type.
scores, features = getFewfeat_forAll(salmons_features)

For Cowichan Chinook ['Avg Sea Salinity', 'pdo', 'Killer whale', 'TotBiom', 'Euphs']
For Harrison Chinook ['SST Feb.', 'SST Mar.', 'Fish', 'NonCalCops', 'Tonnage']
For Puntledge Chinook ['Min flow rate', 'AmphiGam', 'CalCops.med', 'Av_Mysids', 'Av_Repantia']
For Big Qualicum Coho ['TotBiom', 'CalCops.larg', 'CalCops.smal', 'Av_Repantia', 'Coho Catch']


In [35]:
# Selected features per salmon type (dict: type name -> list of feature names).
display(features)

{'Cowichan Chinook': ['Avg Sea Salinity',
  'pdo',
  'Killer whale',
  'TotBiom',
  'Euphs'],
 'Harrison Chinook': ['SST Feb.', 'SST Mar.', 'Fish', 'NonCalCops', 'Tonnage'],
 'Puntledge Chinook': ['Min flow rate',
  'AmphiGam',
  'CalCops.med',
  'Av_Mysids',
  'Av_Repantia'],
 'Big Qualicum Coho': ['TotBiom',
  'CalCops.larg',
  'CalCops.smal',
  'Av_Repantia',
  'Coho Catch']}

In [36]:
# For each salmon type, the five features with the largest sum of absolute Lasso coefficients.
for type_name in salmonTypes_List[:4]:
    sum_column = type_name + '_Sum'
    display(scores.sort_values(sum_column, ascending=False).head(5))

Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
Avg Sea Salinity,64317,125.849654,0,0.0,46233,48.854879,0,0.0
pdo,56830,51.549794,0,0.0,0,0.0,0,0.0
Euphs,54535,50.907301,19774,13.039684,0,0.0,46671,46.13999
TotBiom,59730,49.457145,0,0.0,0,0.0,64411,73.694122
Min flow rate,46159,40.159776,0,0.0,47097,8.571779,9065,0.044662


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
SST Feb.,30790,6.871111,72329,669.809911,36107,9.337855,44362,38.772599
NonCalCops,0,0.0,69709,517.835832,0,0.0,0,0.0
Av_Cladocera,0,0.0,66043,325.130754,0,0.0,0,0.0
SST Nov.,27006,3.415301,64800,313.305086,40799,45.430369,0,0.0
BC Gp5 Avg,0,0.0,58194,248.876153,0,0.0,22276,7.781876


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
AmphiGam,0,0.0,0,0.0,54036,92.08775,45156,43.513224
Av_Mysids,0,0.0,53079,91.263639,58017,91.325133,0,0.0
Avg Sea Salinity,64317,125.849654,0,0.0,46233,48.854879,0,0.0
SST Nov.,27006,3.415301,64800,313.305086,40799,45.430369,0,0.0
Av_Fish,0,0.0,0,0.0,42266,44.357841,0,0.0


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
Av_Repantia,0,0.0,33438,22.27286,18844,4.682594,52885,94.39572
CalCops.smal,0,0.0,0,0.0,0,0.0,56682,87.634975
TotBiom,59730,49.457145,0,0.0,0,0.0,64411,73.694122
Coho Catch,0,0.0,0,0.0,9732,0.343246,52677,70.370273
CalCops.larg,0,0.0,0,0.0,0,0.0,60407,48.879279


In [42]:
# For each salmon type, the twenty features with the largest sum of absolute Lasso coefficients.
for type_name in salmonTypes_List[:4]:
    print(type_name, "sorted by _Sum")
    ranked_by_sum = scores.sort_values(type_name + '_Sum', ascending=False)
    display(ranked_by_sum.head(20))

Cowichan Chinook sorted by _Sum


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
Avg Sea Salinity,64317,125.849654,0,0.0,46233,48.854879,0,0.0
pdo,56830,51.549794,0,0.0,0,0.0,0,0.0
Euphs,54535,50.907301,19774,13.039684,0,0.0,46671,46.13999
TotBiom,59730,49.457145,0,0.0,0,0.0,64411,73.694122
Min flow rate,46159,40.159776,0,0.0,47097,8.571779,9065,0.044662
Medusae,42333,32.734952,61220,148.979682,0,0.0,0,0.0
Sea Level Point Atkinson,51328,29.087995,0,0.0,0,0.0,1360,0.013813
Max flow rate,41810,19.483312,60861,162.124407,0,0.0,0,0.0
SST Aug,44325,18.042231,0,0.0,0,0.0,0,0.0
Av_Siphonophorae,45747,17.209181,55200,66.577337,0,0.0,42115,30.497347


Harrison Chinook sorted by _Sum


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
SST Feb.,30790,6.871111,72329,669.809911,36107,9.337855,44362,38.772599
NonCalCops,0,0.0,69709,517.835832,0,0.0,0,0.0
Av_Cladocera,0,0.0,66043,325.130754,0,0.0,0,0.0
SST Nov.,27006,3.415301,64800,313.305086,40799,45.430369,0,0.0
BC Gp5 Avg,0,0.0,58194,248.876153,0,0.0,22276,7.781876
Repantia,0,0.0,52779,216.312013,21486,2.06438,0,0.0
Max flow rate,41810,19.483312,60861,162.124407,0,0.0,0,0.0
Medusae,42333,32.734952,61220,148.979682,0,0.0,0,0.0
Cladocera,35492,11.130659,53974,112.794484,24388,2.101206,7769,0.46218
Av_Mysids,0,0.0,53079,91.263639,58017,91.325133,0,0.0


Puntledge Chinook sorted by _Sum


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
AmphiGam,0,0.0,0,0.0,54036,92.08775,45156,43.513224
Av_Mysids,0,0.0,53079,91.263639,58017,91.325133,0,0.0
Avg Sea Salinity,64317,125.849654,0,0.0,46233,48.854879,0,0.0
SST Nov.,27006,3.415301,64800,313.305086,40799,45.430369,0,0.0
Av_Fish,0,0.0,0,0.0,42266,44.357841,0,0.0
BenthicLarv,0,0.0,0,0.0,41728,39.279776,0,0.0
CalCops.med,0,0.0,0,0.0,55637,26.277404,0,0.0
Av_Chaetognatha,0,0.0,0,0.0,44760,23.86039,0,0.0
SST Feb.,30790,6.871111,72329,669.809911,36107,9.337855,44362,38.772599
Min flow rate,46159,40.159776,0,0.0,47097,8.571779,9065,0.044662


Big Qualicum Coho sorted by _Sum


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
Av_Repantia,0,0.0,33438,22.27286,18844,4.682594,52885,94.39572
CalCops.smal,0,0.0,0,0.0,0,0.0,56682,87.634975
TotBiom,59730,49.457145,0,0.0,0,0.0,64411,73.694122
Coho Catch,0,0.0,0,0.0,9732,0.343246,52677,70.370273
CalCops.larg,0,0.0,0,0.0,0,0.0,60407,48.879279
Euphs,54535,50.907301,19774,13.039684,0,0.0,46671,46.13999
NPGO index,16598,1.716454,0,0.0,0,0.0,52012,44.054072
AmphiGam,0,0.0,0,0.0,54036,92.08775,45156,43.513224
SST Feb.,30790,6.871111,72329,669.809911,36107,9.337855,44362,38.772599
AmphiHyp,0,0.0,0,0.0,0,0.0,43043,31.683957


In [43]:
# For each salmon type, the twenty features that are non-zero for the largest number of alphas.
for type_name in salmonTypes_List[:4]:
    print(type_name, "sorted by _Count")
    ranked_by_count = scores.sort_values(type_name + '_Count', ascending=False)
    display(ranked_by_count.head(20))

Cowichan Chinook sorted by _Count


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
Avg Sea Salinity,64317,125.849654,0,0.0,46233,48.854879,0,0.0
TotBiom,59730,49.457145,0,0.0,0,0.0,64411,73.694122
pdo,56830,51.549794,0,0.0,0,0.0,0,0.0
Euphs,54535,50.907301,19774,13.039684,0,0.0,46671,46.13999
Sea Level Point Atkinson,51328,29.087995,0,0.0,0,0.0,1360,0.013813
Min flow rate,46159,40.159776,0,0.0,47097,8.571779,9065,0.044662
Av_Siphonophorae,45747,17.209181,55200,66.577337,0,0.0,42115,30.497347
SST Aug,44325,18.042231,0,0.0,0,0.0,0,0.0
Medusae,42333,32.734952,61220,148.979682,0,0.0,0,0.0
Max flow rate,41810,19.483312,60861,162.124407,0,0.0,0,0.0


Harrison Chinook sorted by _Count


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
SST Feb.,30790,6.871111,72329,669.809911,36107,9.337855,44362,38.772599
NonCalCops,0,0.0,69709,517.835832,0,0.0,0,0.0
Av_Cladocera,0,0.0,66043,325.130754,0,0.0,0,0.0
SST Nov.,27006,3.415301,64800,313.305086,40799,45.430369,0,0.0
Medusae,42333,32.734952,61220,148.979682,0,0.0,0,0.0
Max flow rate,41810,19.483312,60861,162.124407,0,0.0,0,0.0
BC Gp5 Avg,0,0.0,58194,248.876153,0,0.0,22276,7.781876
Av_Siphonophorae,45747,17.209181,55200,66.577337,0,0.0,42115,30.497347
Cladocera,35492,11.130659,53974,112.794484,24388,2.101206,7769,0.46218
Av_Mysids,0,0.0,53079,91.263639,58017,91.325133,0,0.0


Puntledge Chinook sorted by _Count


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
Av_Mysids,0,0.0,53079,91.263639,58017,91.325133,0,0.0
CalCops.med,0,0.0,0,0.0,55637,26.277404,0,0.0
AmphiGam,0,0.0,0,0.0,54036,92.08775,45156,43.513224
Min flow rate,46159,40.159776,0,0.0,47097,8.571779,9065,0.044662
Avg Sea Salinity,64317,125.849654,0,0.0,46233,48.854879,0,0.0
Av_Chaetognatha,0,0.0,0,0.0,44760,23.86039,0,0.0
SST Oct.,0,0.0,0,0.0,44161,5.479853,0,0.0
Av_Fish,0,0.0,0,0.0,42266,44.357841,0,0.0
BenthicLarv,0,0.0,0,0.0,41728,39.279776,0,0.0
SST Nov.,27006,3.415301,64800,313.305086,40799,45.430369,0,0.0


Big Qualicum Coho sorted by _Count


Unnamed: 0,Cowichan Chinook_Count,Cowichan Chinook_Sum,Harrison Chinook_Count,Harrison Chinook_Sum,Puntledge Chinook_Count,Puntledge Chinook_Sum,Big Qualicum Coho_Count,Big Qualicum Coho_Sum
TotBiom,59730,49.457145,0,0.0,0,0.0,64411,73.694122
CalCops.larg,0,0.0,0,0.0,0,0.0,60407,48.879279
CalCops.smal,0,0.0,0,0.0,0,0.0,56682,87.634975
Av_Repantia,0,0.0,33438,22.27286,18844,4.682594,52885,94.39572
Coho Catch,0,0.0,0,0.0,9732,0.343246,52677,70.370273
NPGO index,16598,1.716454,0,0.0,0,0.0,52012,44.054072
Av_BenthicLarv,0,0.0,0,0.0,0,0.0,46821,13.423681
Euphs,54535,50.907301,19774,13.039684,0,0.0,46671,46.13999
AmphiGam,0,0.0,0,0.0,54036,92.08775,45156,43.513224
SST Feb.,30790,6.871111,72329,669.809911,36107,9.337855,44362,38.772599


In [37]:
##### Same as getFewfeat_forAll, but also returns the Lasso coefficients themselves.

def getAll_forAll (df_all, few=5, alphaList=prechosen_alphas):
    """Select features, compute scores and collect the Lasso coefficients per salmon type.

    For each name in `salmonTypes_List` the columns of the other salmon types
    are dropped and the Lasso coefficients are computed with
    scale_and_get_LassoCoeffs. The selected features are printed per type.

    Parameters
    ----------
    df_all : pandas.DataFrame
        The target columns of all salmon types together with the features.
    few : int, optional
        Maximum number of features to select per type (default 5).
    alphaList : sequence of float, optional
        Lasso penalties to try (default: `prechosen_alphas`).

    Returns
    -------
    tuple
        (scoreAll_df, featureAll, allCoef):
        - scoreAll_df: indexed by feature name, columns '<type>_Count' and '<type>_Sum';
        - featureAll: dict mapping each salmon type to its selected features;
        - allCoef: indexed by the "alpha = <value>" labels, one column
          '<type>_<feature>' per salmon type and feature.
    """
    featureAll = {}
    scoreAll_df = None
    allCoef = pd.DataFrame()
    for type_name in salmonTypes_List:
        other_types = [name for name in salmonTypes_List if name != type_name]
        df_oneType = df_all.drop(columns=other_types)
        df_lassoCoeffs = scale_and_get_LassoCoeffs(df_oneType,
                                                   target_column=type_name,
                                                   alphaList=alphaList)
        featureList = get_FewFeautures(df_lassoCoeffs, few=few)
        print('For', type_name, featureList)
        featureAll[type_name] = featureList

        typed_scores = getScore_df(df_lassoCoeffs, sortby=False).add_prefix(type_name + '_')
        # Prefix the coefficient columns themselves. The previous version built this
        # mapping from the score columns and joined the score table into allCoef,
        # so allCoef never held the coefficients of types after the first.
        typed_coefs = df_lassoCoeffs.add_prefix(type_name + '_')
        if scoreAll_df is None:
            scoreAll_df = typed_scores
            allCoef = typed_coefs
        else:
            scoreAll_df = scoreAll_df.join(typed_scores, how='outer', sort=False)
            allCoef = allCoef.join(typed_coefs, how='outer', sort=False)

    if scoreAll_df is None:
        scoreAll_df = pd.DataFrame()
    return scoreAll_df , featureAll, allCoef

# Repeats the full Lasso sweep with the default arguments and overwrites `scores` and `features`.
scores, features, LasCoefs = getAll_forAll(salmons_features)

## K Fold

# 5-fold cross-validation of Lasso over a list of alpha values for one salmon type.
#
# NOTE(review): `salmon_train` and `beta` are not defined in any visible cell.
# Presumably `salmon_train` is a NaN-free training frame holding the features and the
# selected target column, and `beta` is the list of Lasso alphas to compare - define
# both before running this cell.

# Previously `selected_type = type_name`, which relied on the loop variable left over
# from the display cells above; its final value is the last salmon type.
selected_type = salmonTypes_List[-1]

X = salmon_train.drop([selected_type], axis=1)
target = salmon_train[selected_type]

n_folds = 5
# mses_lasso[i, j] = test MSE of the i-th alpha on the j-th fold.
mses_lasso = np.zeros((len(beta), n_folds))

kf = KFold(n_folds,
           random_state=614,
           shuffle=True)

for j, (train_index, test_index) in enumerate(kf.split(salmon_train)):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = target.iloc[train_index]
    y_test = target.iloc[test_index]

    # Standardise inside the fold: the scaler is fitted on the training part only and
    # then applied to both parts. The previous version scaled the full data set once
    # and then fitted the models on the unscaled features, so the scaling was never used.
    fold_scaler = StandardScaler().fit(X_train)
    X_train_scaled = fold_scaler.transform(X_train)
    X_test_scaled = fold_scaler.transform(X_test)

    for i, alpha_val in enumerate(beta):
        lasso = Lasso(alpha=alpha_val)
        lasso.fit(X_train_scaled, y_train)
        y_pred = lasso.predict(X_test_scaled)

        mses_lasso[i, j] = mean_squared_error(y_test, y_pred)


# Smallest (over alphas) of the fold-averaged RMSE.
np.min(np.mean(np.sqrt(mses_lasso), axis=1))