In [104]:
import pandas as pd                                   #importing pandas to do basic dataframe and data manipulation
from sklearn.linear_model import LinearRegression     #importing the Linear Regression model

In [105]:
# Load the fish measurements from 'Fish.csv' (path is relative to the current working directory).
data = pd.read_csv('Fish.csv')

In [106]:
# Preview the first rows to check the column names and value formats.
data.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [107]:
# Summary statistics for the numeric columns.
# NOTE(review): the output reports min Weight = 0.0, which is not a plausible fish weight;
# worth checking for a bad record before modelling.
data.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0
mean,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,273.0,25.2,27.3,29.4,7.786,4.2485
75%,650.0,32.7,35.5,39.65,12.3659,5.5845
max,1650.0,59.0,63.4,68.0,18.957,8.142


In [108]:
# Column dtypes and non-null counts, used to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [109]:
# Regression target (dependent variable): the Weight column.
y = data['Weight']

# First, we do not use any encoder to convert the Species category to numerical values (the column is simply dropped)

In [110]:
# Independent variables for the baseline model: every numeric measurement.
# The target ('Weight') and the un-encoded categorical column ('Species') are excluded.
X_without_encoding = data.drop(columns=['Weight', 'Species'])
X_without_encoding

Unnamed: 0,Length1,Length2,Length3,Height,Width
0,23.2,25.4,30.0,11.5200,4.0200
1,24.0,26.3,31.2,12.4800,4.3056
2,23.9,26.5,31.1,12.3778,4.6961
3,26.3,29.0,33.5,12.7300,4.4555
4,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...
154,11.5,12.2,13.4,2.0904,1.3936
155,11.7,12.4,13.5,2.4300,1.2690
156,12.1,13.0,13.8,2.2770,1.2558
157,13.2,14.3,15.2,2.8728,2.0672


In [111]:
# Baseline model: linear regression with scikit-learn's default settings.
model_without_encoding = LinearRegression()

In [112]:
# Train on every row of the numeric-only feature matrix (no train/test split is made here).
model_without_encoding.fit(X_without_encoding,y)

LinearRegression()

In [113]:
# score() returns the coefficient of determination (R^2), not a classification accuracy.
# It is computed on the same rows the model was fitted on, so this is an in-sample figure.
model_without_encoding.score(X_without_encoding,y)

0.8852867046546207

# Next, we will use a dummy encoder to convert the Species category to numerical values

In [114]:
# Numeric independent variables (target 'Weight' and raw 'Species' removed);
# the encoded Species columns are added to these in a later cell.
X = data.drop(columns=['Weight', 'Species'])

In [115]:
# Preview the dummy (one-hot) encoding of Species: one indicator column per species value.
pd.get_dummies(data['Species'])

Unnamed: 0,Bream,Parkki,Perch,Pike,Roach,Smelt,Whitefish
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
154,0,0,0,0,0,1,0
155,0,0,0,0,0,1,0
156,0,0,0,0,0,1,0
157,0,0,0,0,0,1,0


In [116]:
# Append the Species indicator columns to the numeric features, aligned on the row index.
X_with_dummies = X.join(pd.get_dummies(data['Species']))
X_with_dummies

Unnamed: 0,Length1,Length2,Length3,Height,Width,Bream,Parkki,Perch,Pike,Roach,Smelt,Whitefish
0,23.2,25.4,30.0,11.5200,4.0200,1,0,0,0,0,0,0
1,24.0,26.3,31.2,12.4800,4.3056,1,0,0,0,0,0,0
2,23.9,26.5,31.1,12.3778,4.6961,1,0,0,0,0,0,0
3,26.3,29.0,33.5,12.7300,4.4555,1,0,0,0,0,0,0
4,26.5,29.0,34.0,12.4440,5.1340,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
154,11.5,12.2,13.4,2.0904,1.3936,0,0,0,0,0,1,0
155,11.7,12.4,13.5,2.4300,1.2690,0,0,0,0,0,1,0
156,12.1,13.0,13.8,2.2770,1.2558,0,0,0,0,0,1,0
157,13.2,14.3,15.2,2.8728,2.0672,0,0,0,0,0,1,0


In [117]:
# Drop one indicator column ('Whitefish'): its value is fully determined by the other six,
# and keeping all seven alongside the model's intercept makes the features perfectly collinear.
# The frame is rebuilt from X and data instead of from X_with_dummies itself, so this cell
# can be re-run without raising a KeyError for the already-removed 'Whitefish' column.
X_with_dummies = pd.concat(
    [X, pd.get_dummies(data['Species']).drop(columns='Whitefish')],
    axis=1,
)
X_with_dummies

Unnamed: 0,Length1,Length2,Length3,Height,Width,Bream,Parkki,Perch,Pike,Roach,Smelt
0,23.2,25.4,30.0,11.5200,4.0200,1,0,0,0,0,0
1,24.0,26.3,31.2,12.4800,4.3056,1,0,0,0,0,0
2,23.9,26.5,31.1,12.3778,4.6961,1,0,0,0,0,0
3,26.3,29.0,33.5,12.7300,4.4555,1,0,0,0,0,0
4,26.5,29.0,34.0,12.4440,5.1340,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
154,11.5,12.2,13.4,2.0904,1.3936,0,0,0,0,0,1
155,11.7,12.4,13.5,2.4300,1.2690,0,0,0,0,0,1
156,12.1,13.0,13.8,2.2770,1.2558,0,0,0,0,0,1
157,13.2,14.3,15.2,2.8728,2.0672,0,0,0,0,0,1


In [118]:
# Second model: same estimator and default settings, to be fitted on the features that include Species.
model_with_dummies = LinearRegression()

In [119]:
# Train on every row of the numeric + encoded-Species feature matrix (no train/test split is made here).
model_with_dummies.fit(X_with_dummies,y)

LinearRegression()

In [120]:
# score() returns the coefficient of determination (R^2), not a classification accuracy.
# It is computed on the same rows the model was fitted on, so this is an in-sample figure.
model_with_dummies.score(X_with_dummies,y)

0.9360849020585845

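# We can see that the R² score on the training data increases (about 0.885 → 0.936) when the encoded Species column is added. In-sample R² can never decrease when features are added, so a held-out test set is needed to confirm that the predictions really improve #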