In [32]:
import numpy as np 

# Fit Linear Model for Given Data: Ax = b, where b is profits of the superstore from that customer/purchase
# Least Squares Solution
# May 15, 2022

# x vector: x1 = number of years after 2014, x2 = state (stored as an integer from 1 to 50, in alphabetical order, i.e. Alabama is 1),
# x3 = category (1 if furniture, 2 if office supplies, 3 if technology), x4 = sales, x5 = product quantity, x6 = discount.
# There are 7 feature functions: the first feature function is a constant term,
# and the remaining 6 correspond to x1, x2, x3, x4, x5, and x6.

# Function that finds the sum of squares of the difference between y and y_hat
def findError(coordinates, yVal):
    """Return the sum of squared differences between predictions and targets.

    Parameters
    ----------
    coordinates : array_like
        Predicted values (y_hat).
    yVal : array_like
        Observed values (y). Must have the same shape as ``coordinates``.

    Returns
    -------
    numpy scalar or ndarray
        ``(coordinates - yVal) ** 2`` summed along the first axis: a scalar
        for 1-D input, one sum per column for 2-D input.

    Raises
    ------
    ValueError
        If the two inputs differ in shape. An element-by-element loop would
        silently ignore trailing entries of a longer ``yVal``, hiding a
        mismatch between predictions and targets.
    """
    predicted = np.asarray(coordinates)
    observed = np.asarray(yVal)
    if predicted.shape != observed.shape:
        raise ValueError(
            "coordinates and yVal must have the same shape, got "
            f"{predicted.shape} and {observed.shape}"
        )

    residuals = predicted - observed
    # Summing along axis 0 matches accumulating one entry (or row) at a time.
    return np.sum(residuals * residuals, axis=0)

# Load the processed data -- data in which the states and categories were
# converted to their respective integers. Passing the path (instead of an
# open file object) lets NumPy open and close the file itself, and ndmin=2
# keeps the column slicing below valid even if the file holds a single row.
data = np.loadtxt("Processed_Data.csv", delimiter=",", ndmin=2)

# Number of leading feature columns (x1..x6) in each CSV row; the profit
# value is read from the column right after them.
NUM_FEATURES = 6

# profits are read from column index 6 (the last column, per the original notes)
profits = data[:, NUM_FEATURES]

# column of ones for the first (constant) feature function
constColumn = np.ones((data.shape[0], 1))

# matrix of feature functions (overconstrained system)
A = np.column_stack((constColumn, data[:, :NUM_FEATURES]))

# normal equations: (A^T A) x = A^T b
A_T_A = A.T @ A
A_T_P = A.T @ profits

# solve for the unknowns in the now fully constrained system
# NOTE: np.linalg.solve raises LinAlgError if A^T A is singular.
solution = np.linalg.solve(A_T_A, A_T_P)

# took last 102 data points as testing data and used it to find the mean-squared error
testing_data = np.loadtxt("Testing_Data.csv", delimiter=",", ndmin=2)

testingConstCol = np.ones((testing_data.shape[0], 1))
B = np.column_stack((testingConstCol, testing_data[:, :NUM_FEATURES]))

# predicted profits for the testing rows under the fitted linear model
predictive_profits = B @ solution

# mean-squared error = sum of squared residuals / number of testing rows
square_sum = findError(predictive_profits, testing_data[:, NUM_FEATURES])
print("mean-squared error is: ", square_sum / testing_data.shape[0])

## Given the large mean-squared error, a linear predictive model is most likely not the best fit for the given data.
## Future steps: try different polynomial fits and modify the feature functions to account for only category, quantity, and discount.

mean-squared error is:  6332.481506823116
