# Linear Regression OOP
Python Automation; Scott Schmidt; Illinois State University with Dr. Tang

Using OOP and linear regression without the use of imports ensures an individual will gain a full understanding of both Python and statistics. 

## Math Functions
Three functions called sumList(), meanList(), and stdevList() will be used to calculate the mean and standard deviation of any given list. Each function has a list as an input, and returns the corresponding result.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error

#Return the total:
def sumList(numbers):
    count=0
    for num in numbers:
        count=count+float(num)
    total=round(count,2)
    return total

#Return the mean:
def meanList(numbers):
    numbers=list(numbers)
    total=sumList(numbers)
    length=len(numbers)
    mean=total/length
    return round(mean,2)

# Calculate the variance of a list of numbers:
def varianceList(aList):
    # calculate mean
    m = meanList(aList)

    # calculate variance using a list comprehension
    var = sum((xi - m) ** 2 for xi in aList) / len(aList)
    return var

#Calculate the covariance of two lists of numbers:
def covarianceList(x, y):
    # Finding the mean of the series x and y
    mean_x = meanList(x)
    mean_y = meanList(y)
    
    # Subtracting mean from the individual elements
    sub_x = [i - mean_x for i in x]
    sub_y = [i - mean_y for i in y]
    
    numerator = sum([sub_x[i]*sub_y[i] for i in range(len(sub_x))])
    denominator = len(x)-1
    cov = numerator/denominator
    return cov

# LinearRegression
The expected output should be: "The coefficients b0 is ###;  b1 is ###,  RMSE is ###

In [2]:
class MyLinearRegression:
    def __init__(self):
        self.coef= None # b1
        self.intercept = None #b0
        
    def loadData(self, file):
        df = pd.read_csv(file, header=None, names=['x', 'y'])
        self.df=df
        print("loading data")
        return df
        
    def split(self, data):
        x=list(data['x'])
        y=list(data['y'])
        self.x=x
        self.y=y
        print("Split data into x and y")
        return x,y
    
    def train_test_split(self, x, y, ratio):
        length=len(x)
        middle = int(length * ratio)
        X_test=[x[:middle]]
        y_test=[y[:middle]]
        X_train=[x[middle:]]
        y_train=[y[middle:]]
        self.X_train= X_train
        self.X_test= X_test
        self.y_train=y_train
        self.y_test=y_test
        print("Split into x_train x_test y_train y_test")
        return X_train, X_test, y_train, y_test
          
    def fit(self, x, y):
        x=x[0]
        y=y[0]
        w1 = covarianceList(x, y) / varianceList(x)
        w0 = meanList(y) - (w1 * meanList(x))
        self.intercept=w1
        self.coef=w0
        print("Fit data. coef is: ", w0, " and intercept is: ", w1)

    def predict(self, x_test):
        coef=self.coef
        x_new=np.array([x_test])
        predictions = np.round_(self.intercept + x_new*self.coef, decimals=2)
        self.predictions=predictions
        print("Predictions made: ", predictions)
        return predictions

    def evalRMSE(self, predictions, y_test):
        diff=np.subtract(predictions,y_test)
        square=np.square(diff)
        MSE=square.mean()
        RMSE=np.sqrt(MSE)
        RMSE=np.round_(RMSE, decimals=2)
        return RMSE

if __name__ == '__main__':
    print("Starting Linear Regression Program.")
    lr=MyLinearRegression()
    file=lr.loadData(r'C:\Users\sschm\Desktop\IT170\salary.csv')  #file=lr.loadData(r'/kaggle/input/salary/salary.csv') 
    data=lr.split(file)
    x=data[0]
    y=data[1]
    X_train, X_test, y_train, y_test=lr.train_test_split(x, y, .25)
    lr.fit(X_train, y_train)
    predictions=lr.predict(X_test)
    rmse=lr.evalRMSE(predictions, y_test)
    print("RMSE is ", rmse)

Starting Linear Regression Program.
loading data
Split data into x and y
Split into x_train x_test y_train y_test
Fit data. coef is:  20310.453335807608  and intercept is:  10246.716244334239
Predictions made:  [[[32588.21 36650.31 40712.4  50867.62 54929.71 69147.03 71178.08]]]
RMSE is  10027.67


## References
1. https://medium.com/analytics-vidhya/oop-machinelearning-powerful-a9b936a8db48