Train a model/algorithm to fit the provided dataset and state the insights

In [67]:
#import pandas, numpy and matplotlib libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Upload the CSV from the local machine through Colab's file picker.
# The returned mapping is indexed by file name later on to get the file's raw bytes.
from google.colab import files
upload = files.upload()

Saving dataset.csv to dataset (1).csv


In [58]:
#convert data into dataframe
import io

# Look the upload up by its expected name, but fall back to the only uploaded
# file when the key differs (for example if the file was renamed before upload).
# An empty or ambiguous upload still fails with KeyError, as a plain lookup would.
csv_name = 'dataset.csv'
if csv_name not in upload:
    if len(upload) != 1:
        raise KeyError(
            "'{}' was not uploaded and the file to read is ambiguous: {}".format(
                csv_name, list(upload)
            )
        )
    csv_name = next(iter(upload))

# The uploaded content is raw bytes, so wrap it in a file-like object for pandas.
data = pd.read_csv(io.BytesIO(upload[csv_name]))
data

Unnamed: 0,y,x
0,0,0.0
1,1,1.0
2,4,2.0
3,9,4.0
4,16,4.0
5,25,5.0
6,36,6.0
7,49,7.0
8,64,8.0
9,81,9.0


In [59]:
# Split the frame into the dependent variable (y) and the independent variable (x).
# Columns are selected by name rather than by position so that an extra column
# (for example a saved index) can never end up inside the target.

# dependent variable: kept two-dimensional, one column
y = data[['y']].values

# independent variable: one-dimensional, still containing the missing values
x = data['x'].values

# .values drops the column labels and returns a plain NumPy array
# show the dependent variable
y

array([[  0],
       [  1],
       [  4],
       [  9],
       [ 16],
       [ 25],
       [ 36],
       [ 49],
       [ 64],
       [ 81],
       [100],
       [121],
       [144],
       [169],
       [196],
       [225],
       [256],
       [289],
       [324],
       [361],
       [400],
       [441],
       [484],
       [529],
       [576],
       [625]])

In [60]:
#call independent variables
x

array([ 0.,  1.,  2.,  4.,  4.,  5.,  6.,  7.,  8.,  9., nan, 11., 12.,
       13., 14., 14., 16., 17., 18., nan, 20., 21., 22., nan, 24., 25.])

In [27]:
# Baseline: fill the missing values (NaNs) of x with the column mean.
# The mean does not follow the increasing trend of the column, so the filled
# values look out of place. y has no missing values and is therefore left out.
from sklearn.impute import SimpleImputer

# Double brackets keep the single feature two-dimensional (one column).
x_column = data[['x']]

# Learn the replacement value (the mean of the observed entries).
imputer = SimpleImputer(strategy = 'mean', missing_values = np.nan)
imputer_now = imputer.fit(x_column)

# Replace every NaN with the learned mean.
Independent_new = imputer_now.transform(x_column)
Independent_new

array([[ 0.        ],
       [ 1.        ],
       [ 2.        ],
       [ 4.        ],
       [ 4.        ],
       [ 5.        ],
       [ 6.        ],
       [ 7.        ],
       [ 8.        ],
       [ 9.        ],
       [11.86956522],
       [11.        ],
       [12.        ],
       [13.        ],
       [14.        ],
       [14.        ],
       [16.        ],
       [17.        ],
       [18.        ],
       [11.86956522],
       [20.        ],
       [21.        ],
       [22.        ],
       [11.86956522],
       [24.        ],
       [25.        ]])

In [None]:
#remove duplicate values, but this only works on a single column and not the whole set, hence not appropriate for this step
# remove_dup = data.iloc[:,-1].drop_duplicates(keep = 'first')
# remove_dup

In [76]:
# Alternative to mean imputation: rebuild the suspicious entries of x from
# their neighbours, so the filled values follow the trend of the column.

# Flag every value that occurs more than once (keep = False marks all
# occurrences, not just the later ones) and blank those positions out.
# Entries that were already missing stay NaN.
repeated = data['x'].duplicated(keep = False)
rem_dup = data['x'].mask(repeated)

# Fill each blanked position from the surrounding values.
Inde_new = rem_dup.interpolate()

# One-column, two-dimensional array for use as the model input.
new_x = pd.DataFrame({'new_x': Inde_new}).values
new_x

array([[ 0.],
       [ 1.],
       [ 2.],
       [ 3.],
       [ 4.],
       [ 5.],
       [ 6.],
       [ 7.],
       [ 8.],
       [ 9.],
       [10.],
       [11.],
       [12.],
       [13.],
       [14.],
       [15.],
       [16.],
       [17.],
       [18.],
       [19.],
       [20.],
       [21.],
       [22.],
       [23.],
       [24.],
       [25.]])

In [None]:
# data_new = (data['y'], new_x)
# pd.DataFrame(data_new)

In [85]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    new_x,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [86]:
# Fit a linear regression model to the training split.
# NOTE(review): the targets look quadratic in x (0, 1, 4, 9, ...), so a straight
# line is likely to underfit -- consider adding a squared feature.
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator, so construction and training are chained.
regressor = LinearRegression().fit(x_train, y_train)
regressor

LinearRegression()

In [87]:
# Predict the target for every held-out input using the fitted model,
# then show the predictions as a one-column table.
y_pred = regressor.predict(x_test)
pd.DataFrame(y_pred)

Unnamed: 0,0
0,-51.749258
1,408.666172
2,255.194362
3,331.930267
4,24.986647
5,178.458457


In [89]:
# Actual targets of the held-out rows, shown as a table for comparison with the predictions.
pd.DataFrame(y_test)

Unnamed: 0,0
0,4
1,400
2,196
3,289
4,25
5,121


In [93]:
# Predict for one hand-picked input value. The input must be two-dimensional
# (one row, one feature), hence the nested list.
# The result gets its own name: assigning it to y_pred would replace the
# test-set predictions with a single value and break any later cell that
# compares y_pred against y_test.
y_pred_single = regressor.predict([[9]])
pd.DataFrame(y_pred_single)

Unnamed: 0,0
0,127.301187


In [90]:
# Mean squared error on the held-out set (an error measure, not an accuracy):
# 0 means a perfect fit, and the larger the value, the worse the predictions.
from sklearn.metrics import mean_squared_error

# Predict for x_test right here so the score always uses test-set predictions
# of matching length, regardless of what y_pred currently holds.
mean_squared_error(y_test, regressor.predict(x_test))

1971.922850865991