# Linear Regression OOP
Python Automation; Scott Schmidt; Illinois State University

The main purpose of this assignment is to do an OOP linear regression without the use of imports so that one gains a full understanding of statistics and Python.

## Math Functions
Four functions called sumList(), meanList(), varianceList(), and covarianceList() will be used to calculate the sum, mean, variance, and covariance of any given list(s). Each function takes a list as input (covarianceList() takes two lists) and returns the corresponding result.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Return the total of a collection of numbers:
def sumList(numbers):
    """Add up the items of ``numbers`` and return the total rounded to 2 decimals.

    Every item is passed through ``float()`` before it is added, so numeric
    strings are accepted alongside ints and floats.

    Arguments:
    numbers: any iterable of values that ``float()`` can convert

    Returns:
    The sum, rounded to two decimal places.
    """
    running = 0
    # An explicit left-to-right loop keeps the float additions in input order.
    for value in list(numbers):
        running += float(value)
    return round(running, 2)

# Return the mean of a collection of numbers:
def meanList(numbers):
    """Return the arithmetic mean of ``numbers`` rounded to 2 decimals.

    The total comes from ``sumList`` (which is itself rounded to 2 decimals)
    and is divided by the number of items.

    Arguments:
    numbers: any iterable of values accepted by ``sumList``

    Returns:
    The mean, rounded to two decimal places.

    Raises:
    ZeroDivisionError: if ``numbers`` is empty.
    """
    values = list(numbers)
    return round(sumList(values) / len(values), 2)

# Calculate the variance of a list of numbers:
def varianceList(aList, ddof=1):
    """Return the variance of ``aList``.

    The sum of squared deviations from the mean is divided by
    ``len(aList) - ddof``. The default ``ddof=1`` gives the sample variance,
    which is the same ``n - 1`` normalisation that ``covarianceList`` uses;
    the two must agree for covariance / variance to equal the least-squares
    slope. Pass ``ddof=0`` to get the population variance (divide by ``n``).

    Arguments:
    aList: iterable of numbers
    ddof: delta degrees of freedom subtracted from the item count

    Returns:
    The variance as a float.

    Raises:
    ValueError: if there are not more than ``ddof`` items.
    """
    values = list(aList)
    denominator = len(values) - ddof
    if denominator <= 0:
        raise ValueError(
            "varianceList needs more than %d value(s), got %d" % (ddof, len(values))
        )

    # calculate mean
    m = meanList(values)

    # calculate variance from the squared deviations
    return sum((xi - m) ** 2 for xi in values) / denominator

# Calculate the sample covariance of two lists of numbers:
def covarianceList(x, y):
    """Return the sample covariance of the paired series ``x`` and ``y``.

    The products of the paired deviations from each mean are summed and
    divided by ``n - 1``.

    Arguments:
    x: iterable of numbers
    y: iterable of numbers, same length as ``x``

    Returns:
    The sample covariance as a float.

    Raises:
    ValueError: if the series differ in length or hold fewer than two points.
    """
    xs = list(x)
    ys = list(y)
    # A length mismatch would otherwise silently drop items or fail mid-way.
    if len(xs) != len(ys):
        raise ValueError(
            "x and y must have the same length (got %d and %d)" % (len(xs), len(ys))
        )
    if len(xs) < 2:
        raise ValueError("covarianceList needs at least two data points")

    # Finding the mean of the series x and y
    mean_x = meanList(xs)
    mean_y = meanList(ys)

    # Sum of the products of the paired deviations from the means
    numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(xs, ys))
    return numerator / (len(xs) - 1)

def loadData(file):
    """Read a header-less CSV into a DataFrame with columns 'years' and 'salary'.

    Arguments:
    file: path or file-like object handed straight to ``pd.read_csv``

    Returns:
    The resulting pandas DataFrame.
    """
    columns = ['years', 'salary']
    # header=None: the first line of the file is read as data, not as names.
    frame = pd.read_csv(file, names=columns, header=None)
    return frame

# LinearRegression
The expected output should be: "The coefficients b0 is ###;  b1 is ###,  RMSE is ###"

In [None]:
class MyLinearRegression:
    '''
    Simple one-feature least-squares linear regression: y = b0 + b1 * x.

    Attributes:
    coef: slope b1 (None until fit() is called)
    intercept: intercept b0 (None until fit() is called)
    dataList: the data set handed to the constructor (empty list if none)
    train, test: the two parts produced by dataSplit() (empty lists before)
    '''

    def __init__(self, dataList=None):
        '''
        Create an unfitted model.

        Arguments:
        dataList: optional data set to keep on the instance
        '''
        self.coef = None  # b1
        self.intercept = None  # b0
        self.dataList = [] if dataList is None else dataList
        self.train = []
        self.test = []

    def dataSplit(self, dataList, ratio):
        '''
        Split dataList into a leading training part and a trailing test part.

        The first int(len(dataList) * ratio) items are stored in self.train
        and the remaining items in self.test. Order is kept; nothing is
        shuffled.

        Arguments:
        dataList: sliceable collection of rows
        ratio: fraction of the rows that goes to the training part
        '''
        middle = int(len(dataList) * ratio)
        self.train = dataList[:middle]
        self.test = dataList[middle:]

    def fit(self, trainingSet):
        '''
        Compute the slope and intercept and store them in self.coef (b1)
        and self.intercept (b0).

        Arguments:
        trainingSet: rows of [x, y] (list of pairs, 2-column array or
                     2-column DataFrame). A two-item [xs, ys] layout is
                     also accepted and transposed; a 2x2 input is read as
                     two [x, y] rows.

        Raises:
        ValueError: if trainingSet cannot be read as two numeric columns.
        '''
        points = np.asarray(trainingSet, dtype=float)
        if points.ndim != 2:
            raise ValueError("trainingSet must be a 2D collection of [x, y] rows")
        if points.shape[1] != 2 and points.shape[0] == 2:
            # [xs, ys] layout: turn the two series into [x, y] rows.
            points = points.T
        if points.shape[1] != 2:
            raise ValueError("each row of trainingSet must hold exactly [x, y]")

        x = points[:, 0].tolist()
        y = points[:, 1].tolist()

        w1 = covarianceList(x, y) / float(varianceList(x))
        w0 = meanList(y) - (w1 * meanList(x))
        self.coef = w1  # slope
        self.intercept = w0

    def predict(self, x_test):
        '''
        Output model prediction intercept + x * coef, rounded to 2 decimals.

        Arguments:
        x_test: a number or a 1D/2D collection of x values

        Returns:
        numpy array of predictions. x_test is wrapped in a list before the
        conversion, so the result has one extra leading axis of length 1.

        Raises:
        RuntimeError: if the model has no coefficients yet.
        '''
        if self.coef is None or self.intercept is None:
            raise RuntimeError("fit() must be called before predict()")
        x_new = np.array([x_test])
        # np.round replaces np.round_, which NumPy 2.0 removed.
        return np.round(self.intercept + x_new * self.coef, decimals=2)

    def evalRMSE(self, predictions, targets):
        '''
        Return the root mean squared error between predictions and targets,
        rounded to 2 decimals.

        Arguments:
        predictions: predicted values
        targets: true values, broadcastable against predictions
        '''
        diff = np.subtract(predictions, targets)
        MSE = np.square(diff).mean()
        RMSE = np.sqrt(MSE)
        return np.round(RMSE, decimals=2)

#file=r'C:\Users\sschm\Desktop\IT170\salary.csv'
file=r'/kaggle/input/salary/salary.csv'
data=loadData(file)
print(data)
# The constructor takes the data set as its argument; calling it with no
# argument raises a TypeError.
lr=MyLinearRegression(data)

## References
1. https://medium.com/analytics-vidhya/oop-machinelearning-powerful-a9b936a8db48