<a href="https://colab.research.google.com/github/Nithin-Siddhartha/Nithin-Siddhartha/blob/main/VAC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# standard library
import math

# library for scientific calculations on numbers + linear algebra
import numpy as np

# library for manipulating the csv data
import pandas as pd

# library for regular plot visualizations
import matplotlib.pyplot as plt

# libraries for responsive visualizations
import plotly.express as px
import plotly.graph_objects as go


In [13]:
# Read the insurance CSV into a DataFrame; the trailing bare expression
# renders the table as this cell's output.
data = pd.read_csv(filepath_or_buffer="swedish_insurance.csv")
data

Unnamed: 0,X,Y
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4
...,...,...
58,9,87.4
59,31,209.8
60,14,95.5
61,53,244.6


In [14]:
# Print the column labels, then show the first ten rows as the cell output.
print(data.columns)
data.head(n=10)

Index(['X', 'Y'], dtype='object')


Unnamed: 0,X,Y
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4
5,57,170.9
6,23,56.9
7,14,77.5
8,45,214.0
9,10,65.3


In [15]:
# One box plot per column, with every individual observation drawn beside
# the box. Each entry is (column, figure title, y-axis label).
box_specs = (
    ('X', 'Distribution of X', "Number of Insurance Claims"),
    ('Y', 'Distribution of Y', "Amount of Insurance Paid"),
)

for box_column, box_title, box_axis_label in box_specs:
    fig = px.box(data[box_column], points='all')
    fig.update_layout(title=box_title, title_x=0.5, yaxis_title=box_axis_label)
    fig.show()

In [16]:

# Scatter of Y against X with a mirrored black border on both axes.
axis_border = dict(showline=True, linewidth=2, linecolor='black', mirror=True)

fig = px.scatter(x=data['X'], y=data['Y'])
fig.update_layout(
    title='Swedish Automobiles Data',
    title_x=0.5,
    xaxis_title="Number of Claims",
    yaxis_title="Payment in Claims",
    height=500,
    width=700,
)
fig.update_xaxes(**axis_border)
fig.update_yaxes(**axis_border)
fig.show()


**Calculate Mean and Variance**

In [17]:

# Display the Y column on its own as the cell output.
data.loc[:, 'Y']

0     392.5
1      46.2
2      15.7
3     422.2
4     119.4
      ...  
58     87.4
59    209.8
60     95.5
61    244.6
62    187.5
Name: Y, Length: 63, dtype: float64

In [19]:
# Mean and variance of each column. np.var is left at its default ddof=0,
# i.e. the population variance (divide by n).
mean_x, mean_y = (np.mean(data[name]) for name in ('X', 'Y'))
var_x, var_y = (np.var(data[name]) for name in ('X', 'Y'))

print('x stats: mean= %.3f   variance= %.3f' % (mean_x, var_x))
print('y stats: mean= %.3f   variance= %.3f' % (mean_y, var_y))

x stats: mean= 22.905   variance= 536.658
y stats: mean= 98.187   variance= 7505.052


**Calculate Covariance**

In [20]:

# Calculate covariance between x and y
def covariance(x, y):
    """Return the population covariance of two equal-length numeric sequences.

    The mean of the products of deviations from each mean is returned, i.e.
    the sum is divided by ``n`` (not ``n - 1``).

    Parameters
    ----------
    x, y : array-like
        One-dimensional numeric data (list, NumPy array, pandas Series, ...).
        Elements are paired by position, so a pandas Series whose index is
        not 0..n-1 (for example after filtering rows) is handled correctly.

    Returns
    -------
    float
        ``mean((x - mean(x)) * (y - mean(y)))``.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same length.
    """
    # np.asarray drops any pandas index, so pairing is purely positional.
    # Indexing a Series with x[i] is a label lookup and fails (or pairs the
    # wrong rows) as soon as the index is not the default 0..n-1 range.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)

    if x_values.size != y_values.size:
        raise ValueError(
            f'x and y must have the same length, got {x_values.size} and {y_values.size}'
        )
    if x_values.size == 0:
        raise ValueError('covariance is undefined for empty input')

    deviation_products = (x_values - x_values.mean()) * (y_values - y_values.mean())
    return float(deviation_products.mean())



# Covariance between the two columns; printed here and kept in covar_xy.
covar_xy = covariance(x=data['X'], y=data['Y'])
print(f'Cov(X,Y): {covar_xy}')

Cov(X,Y): 1832.0543461829182


In [21]:
# Coefficients of the line y = b0 + b1 * x:
#   b1 (slope)     = Cov(X, Y) / Var(X)
#   b0 (intercept) = mean(Y) - b1 * mean(X)
b1 = covar_xy / var_x
b0 = mean_y - (mean_x * b1)

print(f'Coefficents:\n b0: {b0}  b1: {b1} ')

Coefficents:
 b0: 19.99448575911481  b1: 3.413823560066367 


**Make Predictions**

In [22]:
# Extract the X column as a standalone NumPy array (a copy, not a view of
# the DataFrame) and display it.
x = data['X'].to_numpy(copy=True)
x

array([108,  19,  13, 124,  40,  57,  23,  14,  45,  10,   5,  48,  11,
        23,   7,   2,  24,   6,   3,  23,   6,   9,   9,   3,  29,   7,
         4,  20,   7,   4,   0,  25,   6,   5,  22,  11,  61,  12,   4,
        16,  13,  60,  41,  37,  55,  41,  11,  27,   8,   3,  17,  13,
        13,  15,   8,  29,  30,  24,   9,  31,  14,  53,  26])

In [23]:
# X values as an independent NumPy array. They stay in dataset order (no
# sorting is applied), so each prediction lines up with its source row.
x = data['X'].to_numpy(copy=True)
print(f'x: {x}')

# Evaluate the fitted line at every x using the calculated coefficients.
y_hat = b1 * x + b0
print(f'\n\ny_hat: {y_hat}')

# Observed Y values, printed for comparison against y_hat.
y = data['Y'].values
print(f'\n\ny: {y}')

x: [108  19  13 124  40  57  23  14  45  10   5  48  11  23   7   2  24   6
   3  23   6   9   9   3  29   7   4  20   7   4   0  25   6   5  22  11
  61  12   4  16  13  60  41  37  55  41  11  27   8   3  17  13  13  15
   8  29  30  24   9  31  14  53  26]


y_hat: [388.68743025  84.8571334   64.37419204 443.30860721 156.54742816
 214.58242868  98.51242764  67.7880156  173.61654596  54.13272136
  37.06360356 183.85801664  57.54654492  98.51242764  43.89125068
  26.82213288 101.9262512   40.47742712  30.23595644  98.51242764
  40.47742712  50.7188978   50.7188978   30.23595644 118.995369
  43.89125068  33.64978     88.27095696  43.89125068  33.64978
  19.99448576 105.34007476  40.47742712  37.06360356  95.09860408
  57.54654492 228.23772292  60.96036848  33.64978     74.61566272
  64.37419204 224.82389936 159.96125172 146.30595748 207.75478156
 159.96125172  57.54654492 112.16772188  47.30507424  30.23595644
  78.02948628  64.37419204  64.37419204  71.20183916  47.30507424
 118.99536

**Visual Comparison for Correctness**

In [24]:
# Overlay the model's predictions on the observed data to check the fit by eye.
axis_border = dict(showline=True, linewidth=2, linecolor='black', mirror=True)

fig = go.Figure()

# Observed (X, Y) pairs.
fig.add_trace(go.Scatter(x=data['X'], y=data['Y'], name='train', mode='markers', marker_color='rgba(152, 0, 0, .8)'))
# Predicted values plotted against the same X column.
fig.add_trace(go.Scatter(x=data['X'], y=y_hat, name='prediction', mode='lines+markers', marker_color='rgba(0, 152, 0, .8)'))

fig.update_layout(
    # Plotly text uses a small HTML subset: a line break has to be <br>,
    # a "\n" inside the title is not rendered as a new line.
    title='Swedish Automobiles Data<br>(visual comparison for correctness)',
    title_x=0.5,
    xaxis_title="Number of Claims",
    yaxis_title="Payment in Claims",
)
fig.update_xaxes(**axis_border)
fig.update_yaxes(**axis_border)
fig.show()