# Analysis of fish dataset, Performing statistical analysis

In [1]:
# importing the required libraries for the project
import numpy as np   # handling with arrays
import pandas as pd  # handling with dataframes
import matplotlib.pyplot as plt # ploting the data 
from scipy.optimize import curve_fit # optimizing nonlinear curve
from sklearn.model_selection import train_test_split # spliting data for training a model
from sklearn.linear_model import LinearRegression # Linear regression model 
from sklearn.metrics import mean_squared_error # Squared error estimation 

### Read csv file of fish data 

In [2]:
# Read csv data from given source
Data = pd.read_csv('../input/dataset-fishes/fish_data (1).csv')
Data.head()

Unnamed: 0,number,age,water_temperature,fish_length
0,1,14,25,620
1,2,28,25,1315
2,3,41,25,2120
3,4,55,25,2600
4,5,69,25,3110


In [3]:
# Drop the index number for fishes 
DF =Data.drop(["number"], axis = 1)

### Generate D matrix from data
- Generate the data matrix D where the rows associate with each fish, and columns with age of the fish, water temperature in degrees Celsius, the length of the fish; respectively.

In [4]:
D = DF.to_numpy()
D

array([[  14,   25,  620],
       [  28,   25, 1315],
       [  41,   25, 2120],
       [  55,   25, 2600],
       [  69,   25, 3110],
       [  83,   25, 3535],
       [  97,   25, 3935],
       [ 111,   25, 4465],
       [ 125,   25, 4530],
       [ 139,   25, 4570],
       [ 153,   25, 4600],
       [  14,   27,  625],
       [  28,   27, 1215],
       [  41,   27, 2110],
       [  55,   27, 2805],
       [  69,   27, 3255],
       [  83,   27, 4015],
       [  97,   27, 4315],
       [ 111,   27, 4495],
       [ 125,   27, 4535],
       [ 139,   27, 4600],
       [ 153,   27, 4600],
       [  14,   29,  590],
       [  28,   29, 1305],
       [  41,   29, 2140],
       [  55,   29, 2890],
       [  69,   29, 3920],
       [  83,   29, 3920],
       [  97,   29, 4515],
       [ 111,   29, 4520],
       [ 125,   29, 4525],
       [ 139,   29, 4565],
       [ 153,   29, 4566],
       [  14,   31,  590],
       [  28,   31, 1205],
       [  41,   31, 1915],
       [  55,   31, 2140],
 

### Calculate emperical mean and covariance
- (a) Generate the empirical mean x and empirical covariance of the observations of the data matrix D and provided that the observation vectors are row.

In [5]:
# calculating mean of array 
emperical_mean = D.mean(axis = 0)
print(f"This is emperical mean :{emperical_mean}")

This is emperical mean :[  83.18181818   28.         3107.43181818]


In [6]:
# calculating covariance of data
emperical_covariance = np.cov(D)

### check if the matrix is postive definite 
- i. Print both empirical mean vector and covariance matrix. 
- ii. Verify whether E is a positive definite matrix.

In [7]:
print(f"This is emperical covariance matrix :{emperical_covariance}")

This is emperical covariance matrix :[[ 120230.33333333  257906.5         417703.83333333 ...  620658.5
   634631.66666667  624584.83333333]
 [ 257906.5         553413.          896378.5        ... 1332379.5
  1362455.         1340990.5       ]
 [ 417703.83333333  896378.5        1451920.33333333 ... 2158334.
  2207086.66666667 2172359.33333333]
 ...
 [ 620658.5        1332379.5        2158334.         ... 3209677.
  3282386.         3231015.        ]
 [ 634631.66666667 1362455.         2207086.66666667 ... 3282386.
  3356777.33333333 3304288.66666667]
 [ 624584.83333333 1340990.5        2172359.33333333 ... 3231015.
  3304288.66666667 3252682.33333333]]


In [8]:
# create a function to check if the matrix is positve difinite 
def is_pos_def(x):
    return np.all(np.linalg.eigvals(x) > 0) # checking if all the eigen values are positive 

In [9]:
# Calling function to see the output 
# False Matrix is not positive definite 
print(f"Is Matrix positive definite ?: {is_pos_def(emperical_covariance)}")

Is Matrix positive definite ?: False


### Calculating M matrix of Mahalanobis distance of data points.
- iii. Generate a matrix M where in each entry mij the Mahalanobis distance of data point xi from xj is stored . Provide M as a "csv file.
<br>
<img src = 'https://www.machinelearningplus.com/wp-content/uploads/2019/04/3_Mahalanobis_Distance_Formula-min.png' >

In [10]:
# creating a function to find mahalanobis distance matrix M 
def mahalanobis_distance(data, variables):
    mean_diff = data - np.mean(variables)  # first calculate difference of data point and mean
    cov = np.cov(variables.values.T) # find the covariance matrix 
    inverse_covariance = np.linalg.inv (cov) # find the inverse of covariance matrix 
    left = np.dot (mean_diff , inverse_covariance) #  Dot multiplication of both above calculation
    m_distance = np.dot (left, mean_diff.T) # calculating the distance for each point 
    return m_distance.diagonal() # returning Mahalanobis matrix 

In [11]:
# Saving mahalanobis matrix in M variable
M = mahalanobis_distance(DF, DF[['age', 'water_temperature', 'fish_length']])
# prinit the matrix 
print(f"Matrix for Mahlanobis distance is : {M}")

Matrix for Mahlanobis distance is : [7.03633203 4.57837331 2.77298237 2.18187909 1.86951454 1.79637231
 1.91728429 2.42648564 2.64138662 3.63174269 5.48238326 4.35278315
 2.61553035 1.09308172 0.91464745 0.79454453 2.0877084  1.88088348
 1.47091725 1.22358482 1.7851823  3.23381697 3.64349679 1.9107658
 1.26915788 1.74687039 5.15338276 2.68144919 4.15389658 2.39949237
 1.6079406  1.79591768 2.86235073 4.6004626  3.34844692 2.81578802
 2.18404041 2.1122793  1.92216562 1.89792019 2.81555901 4.09143845
 6.22029394 9.97946817]


In [12]:
# Generate csv file for matrix
M_df = pd.DataFrame(M)
# saving M matrix in myfile.csv 
M_df.to_csv('Mahalanobis_Distance.csv')

### Calculating points
- Pick two data points xi and x's with the maximum Mahalanobis distance that is stored in M.

In [13]:
# finding the maximum points of M matrix 
maximum_point = M.max()
DF["M_Distance"] = M
row = DF.loc[DF['M_Distance'] == maximum_point]

In [14]:
# corrsponding x1 and x2 values having maximum mahalonobis distance
row
row_arr = row.to_numpy()
point_x1 = row_arr[0,0]
point_x2 = row_arr[0,1]
# priniting these points
print(f"two data points x1' and x2' with the maximum Mahalanobis distance are: {point_x1}, {point_x2}")

two data points x1' and x2' with the maximum Mahalanobis distance are: 153.0, 31.0


- Classify the data points according the distance from x1', and x2', that means create two disjoint lists L1 and L2 where the distance elements in Li is closer to xi' for i = 1, 2. 
- Provide your list as a "csv' file

In [15]:
# creating disjoint lists L1 and L2
L1 = []
L2 = []
# loop over the data to find the distance elements closer to xi and append in L1
for i in range(len(DF)) :
    if((153-DF.loc[i, "age"])<20): # finding closer elements 
        L1.append(DF.loc[i, "age"])  # appending found values in L1
for i in range(len(DF)) :
    if((31-DF.loc[i, "water_temperature"])<10):
        L2.append(DF.loc[i, "water_temperature"])        
# printing these Lists         
print(f"two disjoint lists L1 and L2 where the distance elements in Li is closer to xi' are : {L1}, {L2}")

two disjoint lists L1 and L2 where the distance elements in Li is closer to xi' are : [139, 153, 139, 153, 139, 153, 139, 153], [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31]


In [16]:
L1_df = pd.DataFrame(L1)
L2_df = pd.DataFrame(L2)
# saving L1 and L2 lists L1.csv and L2.csv 
L1_df.to_csv('L1.csv')
L2_df.to_csv('L2.csv')

# Question B linear regression parameters 
- (b) Let the length variable (length of the fish) in the data set bedependent on the age and water temperature. 
   - i. Find parameters for the linear regression where we considerthe loss function to be the square of residuals.
   - ii. Compute the empirical risk Rempf for the linear regression.

In [17]:
# calling linear regression function
lg = LinearRegression()

In [18]:
# lets select dependent variables from data and output 
Y = DF['fish_length']
X = DF.drop(["fish_length","M_Distance"], axis = 1)

In [19]:
# split the data to train regression model and test 
X_train,X_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2, random_state = 0)

In [20]:
# fit the regression model on training part of data
lg.fit(X_train, y_train)
# Predict
y_predicted = lg.predict(X_test)
# model evaluation
# calculating loss where, function to be the square of residuals
rmse = mean_squared_error(y_test, y_predicted)
print(f"loss square of residuals is : {rmse}")

loss square of residuals is : 555203.2927726381


In [21]:
# printing parameters of the regression model 
print(f"Coefficents of the linear regression model are : {lg.coef_}")
print(f"Intercept of the linear regression model is : {lg.intercept_}")

Coefficents of the linear regression model are : [  26.05301191 -148.25939503]
Intercept of the linear regression model is : 5022.715950835644


In [22]:
# equation to predict the new value 
print(f"Equation for linear regression : Y ={lg.intercept_} + {lg.coef_}x")

Equation for linear regression : Y =5022.715950835644 + [  26.05301191 -148.25939503]x


## Emperical risk calculation 
<img src = 'https://i.stack.imgur.com/xuzw3.png'> 

In [23]:
# Calculating Emperical risk using given formula 
Square_loss = sum((y_test - y_predicted)**2)
Remp = Square_loss / 2*len(y_test)
print(f"Emperical risk 'Remp' for linear regression is : {Remp}")

Emperical risk 'Remp' for linear regression is : 22485733.35729185


### Parameters for non linear regression 
- ii. Find parameters for the following non-linear regression with the feature map
<br>
 `Y = 1 + x1 + x2 + x1*x2`

In [24]:
#Fitting function for non linear equation given 
def fitting_func(x, a, b,c,d):
    z = 1 + a*x[:,1] +b*x[:,0] + c *(x[:,1]*x[:,0]) 
    return z
    #return a*x+b

In [25]:
# Initial guess for the parameters
initialGuess = [1.0,1.0,1.0,1.0]    

#Perform the curve-fit
popt, pcov = curve_fit(fitting_func, D[:,0:2], D[:,2], initialGuess)
# here are the parameters for nonlinear regression 
print(f"Here are the parameters for nonlinear regression : {popt}")

Here are the parameters for nonlinear regression : [33.02838277 73.62600774 -1.69270558  1.        ]


