In [1]:
### Import File ###
import csv
import numpy as np

""" 
Column Names: 
['Date', 'Name', 'Team', 'Season', 'Round',
    'Home Team', 'Away Team', 'Home Score', 'Away Score', 'Margin',
    'Disposals', 'Kicks', 'Marks', 'Handballs', 'Goals',
    'Behinds', 'Hitouts', 'Tackles', 'Rebounds', 'Inside 50s',
    'Clearances', 'Clangers', 'Frees For', 'Frees Against', 'Contested Pos',
    'Uncontested Pos', 'Contested Marks', 'Marks Inside 50', 'One Percenters', 'Goal Assists',
    'Brownlow Votes', 'TOG']
"""
with open("RawData.csv") as csvFile:
    csvData = csv.reader(csvFile)
    
    headRow = next(csvData)
    indVar=[i for i in range(7,30)] # Columns of independent variables
    trnList  = [] # Train data
    tstList = [] # Test data
    
    for row in csvData:
        year = int(float(row[3]))
        
        # Training set
        if year < 2016:
            trnList.append([])
            for i in range(0,len(row)):
                if i==3: # Year
                    trnList[-1].append( round( float( (row[i]) ) ) )
                elif 7 <= i <= 30: # Integer Data
                    trnList[-1].append( float( (row[i]) ) ) 
                else: # String data
                    trnList[-1].append(str(row[i]))
        
        # Testing set
        else:   
            tstList.append([])
            for i in range(0,len(row)):
                if i==3: # Year
                    tstList[-1].append( round( float( (row[i]) ) ) )
                elif 7 <= i <= 30: # Integer Data
                    tstList[-1].append( float( (row[i]) ) ) 
                else: # String data
                    tstList[-1].append( str( row[i] ) )
                    
trnAr = np.zeros( (len(trnList), len(indVar)) )
trnDep = np.array( [trnList[i][30] for i in range(0,len(trnList))] ) # Dependent variable (votes)

tstAr = np.zeros( (len(tstList), len(indVar)) )
tstDep = np.array( [tstList[i][30] for i in range(0,len(tstList))] ) 

k = 0 # Column counter in arrays
for j in indVar:
    for i in range(0,len(trnList)):
        trnAr[i,k] = float(trnList[i][j])
    for i in range(0,len(tstList)):
        tstAr[i,k] = float(tstList[i][j])
    k += 1

In [2]:
### Standardise data ###
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(trnAr)
trnAr = scaler.transform(trnAr)

scaler.fit(tstAr)
tstAr = scaler.transform(tstAr)

In [3]:
### Ridge Regression ###
from sklearn.linear_model import Ridge

ridgeModel = Ridge().fit(trnAr,trnDep)
ridgePred = ridgeModel.predict(tstAr)

error = [abs(tstDep[i] - ridgePred[i]) for i in range(0,tstDep.shape[0])]
print(sum(error)/len(error)) # Mean of absolute errors

0.2517347750420597


In [4]:
### K-Nearest Neighbours ###
from sklearn.neighbors import KNeighborsRegressor

knnModel = KNeighborsRegressor(n_neighbors=5,algorithm='kd_tree',weights='distance').fit(trnAr,trnDep)
knnPred = knnModel.predict(tstAr)

error = [abs(tstDep[i] - knnPred[i]) for i in range(0,tstDep.shape[0])]
print(sum(error)/len(error)) # Mean of absolute errors

0.1422216672654402


In [5]:
### Cross Decomposition ####
from sklearn.cross_decomposition import PLSRegression

plsModel = PLSRegression().fit(trnAr,trnDep)
plsPred = plsModel.predict(tstAr)

error = [abs(tstDep[i] - plsPred[i][0]) for i in range(0,tstDep.shape[0])]
print(sum(error)/len(error)) # Mean of absolute errors

0.2515727170152827
