### Import libraries

In [None]:
"""
Written by Kristoffer Rakstad Solberg
Summer Intern, AKBM Transformation
2019
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from utils.load_catch_from_bigquery import load_BQ_and_clean

### Functions for filtering dataset

In [None]:
def filter_catch(df):
    """Drop incomplete rows and unused columns from the catch dataframe.

    Rows that are missing any of the required measurements (baric pressure,
    water temperature, krill size, wind speed, krill weight) are removed, and
    the columns that are not used further on are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Catch data containing every column named in ``required_columns`` and
        ``unused_columns`` below.

    Returns
    -------
    pandas.DataFrame
        A new dataframe; the input is not modified. Surviving rows keep their
        original index labels.

    Raises
    ------
    KeyError
        If any of the required or unused columns is absent from ``df``.
    """
    # 'Total catch Krill - Mt' is not required: rows with a missing total
    # catch are kept.
    required_columns = [
        'Baric pressure (hPa)',
        'Water temp (Celsius)',
        'Krill Size (mm)',
        'Wind speed (kn)',
        'Krill weight (gram)',
    ]
    unused_columns = [
        'Total Krill Meal Kg',
        'Trawl depth (m)',
        'TripNumber',
        'Wind direction',
        'FishingCondition',
        'ReasonForNoOrBadFishing',
    ]

    # A single dropna pass replaces one boolean-mask filter per column.
    df = df.dropna(subset=required_columns)

    return df.drop(columns=unused_columns)

### Extract the dataframe

In [None]:
%%time
if __name__ == '__main__':
    # Get data
    df = load_BQ_and_clean()
    
    # remove bad rows
    df = filter_catch(df)

    # create month
    df['month'] = df['Date'].dt.month
    
    # re-index
    df.index = pd.RangeIndex(len(df.index))

In [None]:
# Summary statistics of the filtered, re-indexed frame.
df.describe()

## Exploratory Data Analysis (EDA)

In [None]:
# Correlation heatmap of the numeric columns.
# The numeric/boolean columns are selected explicitly: from pandas 2.0 onwards
# DataFrame.corr() no longer drops non-numeric columns silently and can raise
# on them, whereas select_dtypes behaves the same on old and new versions.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), cmap='coolwarm', annot=True);

In [None]:
# Joint plot of krill size against latitude, with the marginal distribution of each.
sns.jointplot(x='Latitude',y='Krill Size (mm)',data=df)

In [None]:
# Side-by-side distributions: krill size on the left, krill weight on the right.
a4_dims = (15.7, 8.27)  # figure size (width, height) passed to matplotlib
f, axes = plt.subplots(nrows=1, ncols=2, figsize=a4_dims)
size_ax, weight_ax = axes
sns.distplot(df['Krill Size (mm)'], ax=size_ax)
sns.distplot(df['Krill weight (gram)'], ax=weight_ax)

## Training a Multiple Linear Regression Model

Let's now begin to train our regression model! We will first need to split our data into an X array that contains the features to train on, and a y array with the target variable, in this case the `Krill Size (mm)` column. We only use numeric columns as features, because the linear regression model can't use text information.

### X and y arrays

In [None]:
# List the available columns before choosing the features and the target.
df.columns

In [None]:
# Features
X = df[['Baric pressure (hPa)', 'Water temp (Celsius)','Wind speed (kn)', 'month', 'Longitude', 'Latitude']]


# Target variable
y = df['Krill Size (mm)']

## Train Test Split
We have to split our dataset into a training part and a test part

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

## Creating and Training the Model
Here we do the actual fitting of the linear regression model to the training data.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lm = LinearRegression()

In [None]:
# Fit on the training split only; the test split is kept back for evaluation.
lm.fit(X_train,y_train)

## Model Evaluation

We evaluate the model by checking out its coefficients and how we can interpret them.

In [None]:
# print the intercept
print(lm.intercept_)
print("The training model explains {:.2f}% of the krill size.".format(lm.score(X_train,y_train)*100))
print("The test model explains {:.2f}% of the krill size.".format(lm.score(X_test,y_test)*100))

In [None]:
# One row per feature, holding the fitted regression coefficient for that feature.
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=X.columns)
coeff_df

## Predictions from our Model 
We're now able to evaluate how well our Model is trained

In [None]:
# Predicted krill size for the held-out test features.
predictions= lm.predict(X_test)

In [None]:
# Predicted vs. actual krill size on the test set (scatter).
g = sns.jointplot(x=y_test, y=predictions, kind='scatter')

# `predictions` is a plain array and carries no name, so label both axes
# explicitly through the JointGrid (this takes the place of the previously
# commented-out pyplot label calls).
g.set_axis_labels('Actual Krill Size (mm)', 'Predicted Krill Size (mm)')

# Dotted identity line (prediction == actual), drawn over the range that the
# x and y axes have in common.
x0, x1 = g.ax_joint.get_xlim()
y0, y1 = g.ax_joint.get_ylim()
lims = [max(x0, y0), min(x1, y1)]
g.ax_joint.plot(lims, lims, ':k');

In [None]:
# Predicted vs. actual krill size on the test set (kernel density estimate).
g = sns.jointplot(x=y_test, y=predictions, kind='kde')

# `predictions` is a plain array and carries no name, so label both axes
# explicitly through the JointGrid.
g.set_axis_labels('Actual Krill Size (mm)', 'Predicted Krill Size (mm)')

# Dotted identity line (prediction == actual), drawn over the range that the
# x and y axes have in common.
x0, x1 = g.ax_joint.get_xlim()
y0, y1 = g.ax_joint.get_ylim()
lims = [max(x0, y0), min(x1, y1)]
g.ax_joint.plot(lims, lims, ':k');

### Residual Histogram
If the residuals are approximately normally distributed and centred on zero, it indicates that the model is a
reasonable choice for the data. In this case our model is a linear regression model.

In [None]:
# Distribution of the test-set residuals (actual minus predicted krill size).
residuals = y_test - predictions
sns.distplot(residuals, bins=50);

# Regression Evaluation Metrics


Here are three common evaluation metrics for regression problems:

**Mean Absolute Error** (MAE) is the mean of the absolute value of the errors:

$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$

**Mean Squared Error** (MSE) is the mean of the squared errors:

$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$

**Root Mean Squared Error** (RMSE) is the square root of the mean of the squared errors:

$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

Comparing these metrics:

- **MAE** is the easiest to understand, because it's the average error.
- **MSE** is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
- **RMSE** is even more popular than MSE, because RMSE is interpretable in the "y" units.

All of these are **loss functions**, because we want to minimize them.

In [None]:
from sklearn import metrics

In [None]:
# Evaluate each error metric on the test set, then report it.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # same units as the target

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

## Conclusion

In [None]:
# Coefficient table, shown again for the interpretation that follows.
# The column label is spelled 'Coefficient', consistent with the coefficient
# table built in the model-evaluation section; the variable keeps its original
# name so that any existing reference to it still resolves.
coeffecients = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
coeffecients

Interpreting the coefficients:

- Holding all other features fixed, a 1 unit increase in **Baric pressure (hPa)** is associated with an **increase of 0.084 Krill size (mm)**.
- Holding all other features fixed, a 1 unit increase in **Water temp (Celsius)** is associated with an **increase of 0.449 Krill size (mm)**.
- Holding all other features fixed, a 1 unit increase in **Wind speed (kn)** is associated with an **increase of 0.074 Krill size (mm)**.
- Holding all other features fixed, a 1 unit increase in **month** is associated with a **decrease of 0.490 Krill size (mm)**.
- Holding all other features fixed, a 1 unit increase in **Longitude** is associated with a **decrease of 0.150 Krill size (mm)**.
- Holding all other features fixed, a 1 unit increase in **Latitude** is associated with an **increase of 0.997 Krill size (mm)**.