# VALLIAPPAN V

# A Project for an internship at "The Sparks Foundation"

# Task1 - Prediction Using Supervised Machine Learning

# Batch - June 2023

# Importing necessary modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Loading and displaying the dataset

In [None]:
# Read the CSV behind the short link into a DataFrame; later cells use its
# 'Hours' and 'Scores' columns.
url = "http://bit.ly/w-data"
data = pd.read_csv(filepath_or_buffer=url)

# Report the total number of cells and the (rows, columns) shape.
print("size:", data.size, "; shape", data.shape)

# Show the first five rows (the notebook displays the last expression).
data.head(n=5)

# Plotting study hours against scores

In [None]:
# Draw Scores against Hours with red '+' markers, then label the axes that
# DataFrame.plot created.
ax = data.plot(x='Hours', y='Scores', style='+', color='red')
ax.set_title('Hours vs Scores')
ax.set_xlabel('Hours taken')
ax.set_ylabel('Score')
plt.show()

# Implementing Linear Regression from Scratch

In [None]:
class LinearRegression:
    """Single-feature ordinary least squares regression.

    Models ``y = b0 + b1 * x`` and estimates the intercept ``b0`` and the
    slope ``b1`` with the closed-form least-squares formulas.
    """

    def fit(self, X, y):
        """Estimate ``b0`` and ``b1`` from training data.

        Args:
            X: Feature values; a list, 1-D array, or ``(n, 1)`` column vector.
            y: Target values with the same number of elements as ``X``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If the inputs are empty, differ in length, or ``X``
                has zero variance (the slope is undefined in that case).
        """
        # Flatten so lists and (n, 1) column vectors behave like 1-D arrays.
        X = np.asarray(X, dtype=float).ravel()
        y = np.asarray(y, dtype=float).ravel()
        if X.size == 0 or X.size != y.size:
            raise ValueError("X and y must be non-empty and of equal length")

        X_mean, y_mean = X.mean(), y.mean()
        X_mean_diff, y_mean_diff = X - X_mean, y - y_mean

        # Sum of squared deviations of X; zero means every X is identical.
        ss_xx = X_mean_diff @ X_mean_diff
        if ss_xx == 0:
            raise ValueError("X has zero variance; the slope is undefined")

        self.b1 = (X_mean_diff @ y_mean_diff) / ss_xx
        self.b0 = y_mean - (self.b1 * X_mean)
        print(f"(b0,b1):({self.b0:.3f},{self.b1:.3f})")
        return self

    def predict(self, X):
        """Return ``b0 + b1 * X`` for a scalar or array-like ``X``.

        Lists and tuples are converted to arrays first; any other input is
        used as-is, so the result keeps the input's shape and container type.
        """
        if isinstance(X, (list, tuple)):
            X = np.asarray(X)
        return self.b0 + X * self.b1

    def evaluate(self, X, y):
        """Print and return the RMSE and R^2 of the model on ``(X, y)``.

        Args:
            X: Feature values; a list, 1-D array, or ``(n, 1)`` column vector.
            y: True target values with the same number of elements as ``X``.

        Returns:
            A ``(rmse, r2)`` tuple.

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        X = np.asarray(X, dtype=float).ravel()
        y = np.asarray(y, dtype=float).ravel()
        if X.size == 0 or X.size != y.size:
            raise ValueError("X and y must be non-empty and of equal length")

        y_diff = y - self.predict(X)
        y_mean_diff = y - y.mean()
        ss_res = y_diff @ y_diff
        ss_tot = y_mean_diff @ y_mean_diff

        rmse = np.sqrt(ss_res / X.size)
        r2 = 1 - ss_res / ss_tot
        print("Root mean squared Error:", rmse)
        print("R^2 value:", r2)
        return rmse, r2
     

# Regression Plot Function

In [None]:
def regression_plot(X,y,model,title=""):
    """Scatter the points (X, y) and overlay the line predicted by ``model``.

    Args:
        X: Feature values for the scatter plot.
        y: Target values for the scatter plot.
        model: Object whose ``predict`` method maps x values to y values.
        title: Text shown above the plot.
    """
    plt.figure(figsize=(14, 7))

    # Two endpoints far outside the data range, so the line spans the whole
    # visible window once the axis limits below are applied.
    left_end = np.min(X) - 100
    right_end = np.max(X) + 100
    x_line = np.array([left_end, right_end]).reshape(-1, 1)
    y_line = model.predict(x_line)

    plt.scatter(X, y, color='red', label='Original Data Points')
    plt.plot(x_line, y_line, linewidth=2, label='Regression Line')

    # Fixed viewing window and axis labelling.
    plt.title(title)
    plt.xlim(1, 10)
    plt.ylim(10, 100)
    plt.xlabel('Hours taken')
    plt.ylabel('Score')
    plt.legend()

# Training the model and plotting the best-fit line

In [None]:
# Pull the feature and target columns out as NumPy arrays.
X = data['Hours'].values
y = data['Scores'].values

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and therefore the fitted coefficients and reported metrics,
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit on the training rows only, then draw the line over the full dataset.
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train)
regression_plot(X, y, lin_reg_model, title="Trained Regression model")

# Testing the model

In [None]:
# Predict scores for the held-out hours and tabulate them beside the true
# scores for a side-by-side comparison.
y_pred = lin_reg_model.predict(X_test)
testing = pd.DataFrame(
    data={"Actual": y_test, "Predicted": y_pred},
    columns=["Actual", "Predicted"],
)
testing

# Predicting the score for a student who studies 9.25 hours per day

In [None]:
hours = 9.25
test = lin_reg_model.predict(hours)
# Interpolate `hours` so the message always matches the value that was
# actually passed to the model.
print(f"Spending {hours} hours per day will make a student able to score {test:.3f}% marks.")

# Evaluating the model using Root Mean Squared Error and R^2

In [None]:
# Score the model on the held-out split only; including the training rows
# would make the error look better than it is on unseen data.
evaluate = lin_reg_model.evaluate(X_test, y_test)

# Thank you