In [None]:
import numpy as np
import pandas as pd
import csv
import math

#response y is crime rate
nFeatures = 95
nTrain = 1595 #samples
nTest = 399 #samples

def RSME(pred, actual):
    total = 0.0
    n = len(actual)
    for i in range(n): #samples
        total += ((pred[i] - actual[i][0])**2)
    return math.sqrt(total/n)

#Takes NxP matrix and returns predicted Nx1 array
#get the prediction model first
def problem1(samples):
    samples1 = np.array(samples)
    n = len(samples)
    Y = []
    X = []
    for i in range(n):
        Y.append([samples1[i][0]])
        X.append(np.transpose(samples1[i]))

    Y = np.array(Y)
    X = np.array(X)
    xT = np.transpose(X)
    theta = np.linalg.inv(xT @ X) @ (xT @ Y)
    thetaT = np.transpose(theta)

    results = []
    for i in range(n):
        results.append(thetaT @ X[i])
    results = np.array(results)
    return results

def problem2(samples):
    n = 96
    pred = problem1(samples)
    initial = RSME(pred, samples)
    removed = []
    rsmeValues = []
    count = 1
    while count < n:
        test = samples
        removed.append(count)
        test = np.delete(test, removed, 1)

        pred = problem1(test)
        if (RSME(pred, test) < initial):
            initial = RSME(pred, test)
            rsmeValues.append(initial)
        else:
            removed.pop()
        count += 1

    return removed, rsmeValues #returns indicies of features that were removed


trainData = []
testData = []
with open ('crime-test.txt', 'r') as file:
    reader = csv.reader(file, delimiter = '\t')
    testData = list(reader)[1:nTest]
with open ('crime-train.txt', 'r') as file:
    reader = csv.reader(file, delimiter = '\t')
    trainData = list(reader)[1:nTrain]
    
with open ('crime-test.txt', 'r') as file:
    reader = csv.reader(file, delimiter = '\t')
    headers = list(reader)[0]

testDataf = np.array(testData)
trainDataf = np.array(trainData)
testDataf = testDataf.astype(np.float64)
trainDataf = trainDataf.astype(np.float64)

#RSME values
print("RSME for test: ")
print(RSME(problem1(testDataf), testDataf))
print("RSME for train: ")
print(RSME(problem1(trainDataf), testDataf))

#Removed features and RSME values for test
headersTest = []
print("Removed features for test: ")
removedTest = problem2(testDataf)
for i in removedTest[0]:
    headersTest.append(headers[i])
print(headersTest)
print("RSME values at each removal: ")
print(removedTest[1])

#Removed features and RSME values for train
headersTrain = []
print("Removed features for train: ")
removedTrain = problem2(trainDataf)
for i in removedTrain[0]:
    headersTrain.append(headers[i])
print(headersTrain)
print("RSME values at each removal: ")
print(removedTrain[1])

RSME for test data: 
1.1060914201937156e-13

RSME for train data: 
0.34843259948218247

Removed features for test data: 
['medIncome', 'medFamInc', 'TotalPctDiv', 'PctIlleg', 'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous', 'PctUsePubTrans']

RSME values at each removal in test data: 
[1.0115333972708671e-13, 6.304646033689166e-14, 4.9413451899105155e-14, 4.126339697272972e-14, 3.9342845366727095e-14, 3.337244033960891e-14, 2.4001702986696288e-14, 2.147363385721029e-14]

Removed features for train data: 
['agePct65up', 'whitePerCap', 'AsianPerCap', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctSameCity85']

RSME values at each removal in train data: 
[5.066619487712679e-14, 4.782368814383866e-14, 4.39608856891003e-14, 4.162805422321394e-14, 2.895067011324492e-14, 2.7385421243243624e-14]