## TASK 4: Train  a  simple  linear  regressing  model  on  dataset and  predict  the output.

In [47]:
## importing important libraries

import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [16]:
# URLs of the HTML pages for training and testing data
train_html_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRTK2NvcndgPX41Czu6Ft2Ho_nE-z50BgTqdzwFW0rsJ2nvyNLe2DoIg1COzUbgw80oaRBjfy5-WtFk/pubhtml"
test_html_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRyvZ7lknwiSghK9aen1SaTEYoN3JS40rrGLpcyrsVZy1tB2T4gn6Y3-cdzPUFCPMmmqREWefW3kl4_/pubhtml"


# Send a GET request to fetch HTML content
train_html_response = requests.get(train_html_url)
test_html_response = requests.get(test_html_url)

# Parse the HTML content using BeautifulSoup
train_html_soup = BeautifulSoup(train_html_response.text, 'html.parser')
test_html_soup = BeautifulSoup(test_html_response.text, 'html.parser')

# Find the HTML table in the parsed content
train_html_table = train_html_soup.find('table')
test_html_table = test_html_soup.find('table')

# Read the HTML tables into DataFrames
train_df = pd.read_html(str(train_html_table))[0]
test_df = pd.read_html(str(test_html_table))[0]


In [17]:
train_df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2
0,1,x,y
1,2,24,21.54945196
2,3,50,47.46446305
3,4,15,17.21865634
4,5,38,36.58639803
...,...,...,...
696,697,58,58.59500642
697,698,93,94.62509374
698,699,82,88.60376995
699,700,66,63.64868529


In [18]:
test_df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2
0,1,x,y
1,2,77,79.77515201
2,3,21,23.17727887
3,4,22,25.60926156
4,5,20,17.85738813
...,...,...,...
296,297,71,68.5458879
297,298,46,47.33487629
298,299,55,54.09063686
299,300,62,63.29717058


## Renaming columns and altering the datasets in proper structure

In [32]:
# Removing first column
remove_first_train = train_df.iloc[1:, 1:].reset_index(drop=True)
remove_first_train.head()

remove_first_test = test_df.iloc[1:, 1:].reset_index(drop=True)
remove_first_test.head()

# Rename the columns for better clarity
remove_first_train.columns = ['X', 'Y']

remove_first_test.columns = ['X', 'Y']


train_df_proper = remove_first_train
test_df_proper = remove_first_test

In [33]:

train_df_proper

Unnamed: 0,X,Y
0,24,21.54945196
1,50,47.46446305
2,15,17.21865634
3,38,36.58639803
4,87,87.28898389
...,...,...
695,58,58.59500642
696,93,94.62509374
697,82,88.60376995
698,66,63.64868529


In [34]:
test_df_proper

Unnamed: 0,X,Y
0,77,79.77515201
1,21,23.17727887
2,22,25.60926156
3,20,17.85738813
4,36,41.84986439
...,...,...
295,71,68.5458879
296,46,47.33487629
297,55,54.09063686
298,62,63.29717058


In [35]:
train_df_proper.shape

(700, 2)

In [36]:
test_df_proper.shape

(300, 2)

In [37]:
# Checking for null values

train_df_proper.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   X       700 non-null    object
 1   Y       699 non-null    object
dtypes: object(2)
memory usage: 11.1+ KB


In [40]:
# Removing null-value from train dataset
train_df_proper = train_df_proper.dropna()

# Checking for null values again
train_df_proper.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   X       699 non-null    object
 1   Y       699 non-null    object
dtypes: object(2)
memory usage: 16.4+ KB


In [41]:
test_df_proper.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   X       300 non-null    object
 1   Y       300 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB


## Seperating the independent and target variables

In [49]:
x_train = train_df_proper.drop('Y', axis=1)
y_train = train_df_proper['Y']

x_test = test_df_proper.drop('Y', axis=1)
y_test = test_df_proper['Y']


### Training the dataset with the train data

In [52]:
# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(x_train, y_train)


LinearRegression()

### Testing the model with the test data and evaluating the predictions with Mean Squared Error, Mean Absolute Error, R-squared metrics

In [54]:
# Make predictions on the test data
predictions = model.predict(x_test)


In [55]:
# Calculate Mean Squared Error
mse = mean_squared_error(y_test, predictions)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, predictions)

# Calculate R-squared
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


Mean Squared Error: 9.432922192039317
Mean Absolute Error: 2.4157718500412595
R-squared: 0.9888014444327563
