## Example Linear Regression

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [5]:
p_df = sns.load_dataset('penguins')
p_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


We are going to predict the body mass based on the flipper length

In [10]:
p_df = p_df.dropna()


In [8]:
p_df = p_df.reset_index(drop=True)

In [9]:
p_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


## we want to predict the weight based on the flipper length


### X - Y split 

In [15]:
X = p_df['flipper_length_mm']
y = p_df['body_mass_g']

### Next we do the train set split 

In [13]:
from sklearn.model_selection import train_test_split


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state = 42)

In [24]:
X_train

22     187.0
284    221.0
294    212.0
56     185.0
175    205.0
       ...  
188    196.0
71     184.0
106    193.0
270    220.0
102    181.0
Name: flipper_length_mm, Length: 233, dtype: float64

### model generation

In [18]:
from sklearn.linear_model import LinearRegression


In [28]:
X_train = np.array(X_train).reshape(-1,1)
y_train = np.array(y_train).reshape(-1,1)

In [29]:
lm = LinearRegression()

In [30]:
model = lm.fit(X_train,y_train)

In [31]:
model.coef_

array([[51.32743793]])

In [32]:
model.intercept_

array([-6118.66754338])

### now we will test it 

In [33]:
y_test

25     3250.0
309    4875.0
73     4000.0
195    3675.0
57     4050.0
        ...  
15     3400.0
6      4675.0
209    4000.0
93     4100.0
30     4150.0
Name: body_mass_g, Length: 100, dtype: float64

In [34]:
X_test = np.array(X_test).reshape(-1,1)


In [37]:
y_test = np.array(y_test).reshape(-1,1)


In [38]:
y_pred = model.predict(X_test)

In [41]:
compare = pd.DataFrame({'y_test':list(y_test), 'y_pred':list(y_pred)})

In [42]:
compare

Unnamed: 0,y_test,y_pred
0,[3250.0],[3017.616407560008]
1,[4875.0],[5276.023676332345]
2,[4000.0],[3890.182852312955]
3,[3675.0],[4044.1651660928874]
4,[4050.0],[3736.200538533025]
...,...,...
95,[3400.0],[2812.306655853432]
96,[4675.0],[3890.182852312955]
97,[4000.0],[4506.112107432684]
98,[4100.0],[3736.200538533025]


### Model evaluation 

In [44]:
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse

In [45]:
mse(y_test, y_pred)

138931.09920252106

In [46]:
mae(y_test, y_pred)

299.39047755852704

In [47]:
import matplotlib.pyplot as plt