# Script 5. Training and collecting statistics from Linear Regression Models using PCA #
## For Brownlow Predictor Project ##

Trains and collects statistics from 4000 different LR models that use Principal Component Analysis (PCA) for Brownlow prediction.

Different models arise from the permutations of choices one can make when training models. For this case they are:
- [x] 5 Data Manipulation Types
- [x] 4 Macro Rules of Feature Selection 
- [x] 2 Feature Selection Coefficient Cutoff Values
- [x] 10 Values for Number of Principal Components to keep in PCA
- [x] 4 Micro Rules of Feature Selection
- [x] 2 Choices of whether to include Winloss in columns
- [x] (5 Folds of Train-Test Split)


**Author: `Lang (Ron) Chen` 2021.12-2022.1**

___

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.decomposition import PCA

from BrownlowPredictorPCA.predict import predict_mass
from BrownlowPredictorTools.test import test_mass
from BrownlowPredictorTools.return_tp import return_tp
from BrownlowPredictorPCA.wholeseason import wholeseason

In [2]:
# Load the accumulated results table. The training loop further down appends
# one row per trained model to this frame and writes it back to the same CSV.
Central_Statistics = pd.read_csv('Central_StatisticsPCA.csv')

**1. Using Loops to simulate permutations**

*Makes clever use of f-strings to input and output the desired data*

*As the code is much the same as in the other scripts, many comments have been left out. Comments are given only for the significantly different parts.*

In [None]:
%%time

# Maps each data-manipulation code to the folder under ./Data that holds it.
choice = {'N': 'NormalisedData', 'S': 'StandardisedData', 'NS': 'NormalisedStandardisedData', 'RS': 'RankStandardisedData',
          'P': 'PercentageData', 'PN': 'PercentageNormalisedData'}

# Number of leaderboard places stored per model (columns P1..P20 / V1..V20).
N_LEADERBOARD = 20


def _stats_row(dt, use, FS_val, n_PCA, winloss, fold, tp=None, coef1=None, leaderboard=None):
    """Build a one-row DataFrame in the Central_Statistics column layout.

    Parameters
    ----------
    dt, use, FS_val, n_PCA, winloss, fold
        The permutation settings identifying the model; stored verbatim.
    tp
        Result of ``test_mass``; elements 0..3 are stored as TP0, TP1, TP2
        and TP3. ``None`` leaves those cells empty.
    coef1
        Value stored in the ``Coef1`` column (``None`` leaves it empty).
    leaderboard
        Result of ``wholeseason``; entry ``i`` is indexed as ``[0]`` for
        ``P{i+1}`` and ``[1]`` for ``V{i+1}``. ``None`` leaves all P/V cells
        empty. Fewer than 20 entries raises IndexError, as before.

    Returns
    -------
    pandas.DataFrame
        A single row; 'TP0.5' and 'Coef2' are always left empty.
    """
    row = {'Method': 'LR', 'Datatype': dt, 'Use': use, 'Feature Selection Value': FS_val,
           'n_PCA': n_PCA, 'Winloss': winloss, 'Fold': fold,
           'TP0': None if tp is None else tp[0],
           'TP0.5': None,
           'TP1': None if tp is None else tp[1],
           'TP2': None if tp is None else tp[2],
           'TP3': None if tp is None else tp[3],
           'Coef1': coef1, 'Coef2': None}

    for place in range(1, N_LEADERBOARD + 1):
        if leaderboard is None:
            row[f'P{place}'] = None
            row[f'V{place}'] = None
        else:
            row[f'P{place}'] = leaderboard[place - 1][0]
            row[f'V{place}'] = leaderboard[place - 1][1]

    # One-element lists per column give a single-row frame.
    return pd.DataFrame({key: [value] for key, value in row.items()})


for dt in ['N', 'S', 'RS', 'P', 'PN']:

    # NOTE(review): os.listdir returns entries in arbitrary order, so `[1:]`
    # does not reliably skip any particular entry (e.g. a hidden file) and
    # could drop a real 2021 game file. Kept as-is; confirm the intent.
    filelist = os.listdir(f'./Data/{choice[dt]}')[1:]
    final_test_games = [file for file in filelist if '2021' in file]

    # The prepared fold files depend only on (dt, fold), so read them once
    # per data type instead of once per model.
    fold_data = dict()
    for fold in [1, 2, 3, 4, 5]:
        fold_train = pd.read_csv(f'./PreparedData/Train_Data_{fold} ({dt}).csv')
        fold_test_games = list(pd.read_csv(f'./PreparedData/Test_Games_List_{fold} ({dt}).csv')['Test Games'])
        fold_data[fold] = (fold_train, fold_test_games)

    for use in ['BT', 'OT', 'BT_OT', 'BT+OT']:
        # NOTE(review): BT_OT is never read in this cell, so 'BT_OT' and
        # 'BT+OT' currently select identical columns and train identical models.
        BT_OT = use == 'BT_OT'

        # Column-name fragments that mark a column as a candidate feature.
        if use in ['BT', 'OT']:
            prefixes = [f'{use}{dt}']
        else:
            prefixes = [f'BT{dt}', f'OT{dt}']

        for FS_val in [0.2, 0.3]: # Use of only two values instead of 3 to save computational time.

            for n_PCA in range(1, 11): # 10 values for the number of principal components to keep

                for winloss in ['In', 'Out']:

                    # Both 'In' and 'Out' are truthy strings, so compare
                    # explicitly; 'Out' must exclude the Winloss columns.
                    include_winloss = winloss == 'In'

                    for fold in [1, 2, 3, 4, 5]:

                        train_data, cached_test_games = fold_data[fold]
                        # Hand out a copy so the cached list cannot be
                        # modified by the project helpers.
                        test_games = list(cached_test_games)

                        cols = [col for col in train_data.columns
                                if any(prefix in col for prefix in prefixes)
                                or (include_winloss and 'Winloss' in col)]

                        corr = dict()
                        for col in cols:
                            corr[col] = train_data[[col, 'Brownlow Votes']].corr(method = 'pearson').loc[col]['Brownlow Votes']

                        # Keep only features whose correlation with the votes
                        # exceeds the cutoff (negative correlations are never selected).
                        selected_features = [name for name, value in corr.items() if value > FS_val]

                        # PCA cannot return more components than it has input
                        # features: record an empty result row and move on.
                        if len(selected_features) < n_PCA:
                            Central_Statistics = pd.concat(
                                [Central_Statistics, _stats_row(dt, use, FS_val, n_PCA, winloss, fold)])
                            continue

                        # initiates pca object and uses it.
                        pca = PCA(n_components = n_PCA)

                        traindata_x = train_data[selected_features]
                        traindata_x = traindata_x.replace([np.inf, -np.inf, np.nan], 0).reset_index(drop=True)
                        principalComponents = pca.fit_transform(traindata_x)
                        traindata_y = train_data['Brownlow Votes'].reset_index(drop=True)

                        lm = linear_model.LinearRegression()
                        lm.fit(principalComponents, traindata_y)

                        out = predict_mass(test_games, lm, selected_features, choice[dt], pca)

                        tp = test_mass(out)

                        leaderboard = wholeseason(final_test_games, lm, selected_features, choice[dt], pca)

                        # LinearRegression.score is the R^2 of the fit on the
                        # training data; it is stored in the 'Coef1' column.
                        r_squared = lm.score(principalComponents, traindata_y)

                        Central_Statistics = pd.concat(
                            [Central_Statistics,
                             _stats_row(dt, use, FS_val, n_PCA, winloss, fold,
                                        tp=tp, coef1=r_squared, leaderboard=leaderboard)])

                    # Checkpoint to disk after every completed set of 5 folds.
                    Central_Statistics.to_csv('Central_StatisticsPCA.csv', index = False)