In [1]:
# 24/02/2023

In [2]:
import pandas as pd
import numpy as np
import pickle
import copy

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [3]:
class NingXiang:
    """ Builds nested feature combinations and attaches a score to each one.

    The output object (`ningxiang_output`) is a dict that maps a tuple of
    feature names to a score. Combinations grow by one feature at a time,
    either through greedy forward selection with Linear Regression or by
    walking the features in descending Random Forest importance.
    """



    def __init__(self):
        """ Initialise class """
    
        self._initialise_objects()

        print('NingXiang Initialised')



    def _initialise_objects(self):
        """ Helper to set every attribute to its initial (empty) state """
        
        self._seed = 18981124 # random_state handed to the Random Forest
        self.train_x = None
        self.train_y = None
        self.clf_type = None # 'Classification' or 'Regression' once set
        self.ningxiang_output = None # dict of {tuple of features: score} once built
        self.object_saving_address = None
        
    
    
    def read_in_train_data(self, train_x, train_y):
        """ Reads in train data for building NingXiang output object

        train_x is used through its `.columns` and column-list indexing, so it
        is expected to be a pandas DataFrame; train_y holds the labels.
        """

        self.train_x = train_x
        print("Read in Train X data")

        self.train_y = train_y
        print("Read in Train y data")



    def set_model_type(self, type):
        """ Records whether the problem is 'Classification' or 'Regression'

        Raises AssertionError for any other value.
        """

        assert type in ('Classification', 'Regression'), \
            "type must be 'Classification' or 'Regression'"

        self.clf_type = type 

        print(f'Successfully recorded model type: {self.clf_type}')



    def get_lr_based_feature_combinations(self, min_features = 0):
        """ Get NingXiang scores based on greedy forward selection with Linear Regression

        Each round fits one Linear Regression per remaining feature (added to
        the features already selected) and keeps the feature with the highest
        training score from `LinearRegression.score`. The value stored for a
        combination is the square root of that score.

        min_features: combinations with fewer features than this are left out
        of the output.

        Returns the output dict, or None (after printing a message) if the
        model type or the train data are missing, or the model type is
        'Classification'.
        """

        if self.clf_type is None:
            print('clf_type not found, please run set_model_type() first')
            return
        
        if self.clf_type == 'Classification':
            print(".get_lr_based_combinations() only works for continuous labels")
            return

        if self.train_x is None or self.train_y is None:
            print('train_x and train_y not found, please run read_in_train_data')
            return

        self.ningxiang_output = dict()

        curr_combo = list()
        remaining_features = list(self.train_x.columns)
        for i in range(len(remaining_features)):
            print(f'Up to {i+1}th variable')

            # Start below any attainable score so that a feature is always
            # selected, even when no candidate reaches a positive score
            best_score = -np.inf
            added_feature = None
            
            # try adding each new feature and getting lr
            for feature in remaining_features:
                tmp_combo = curr_combo + [feature]

                lr = LinearRegression()
                lr.fit(self.train_x[tmp_combo], self.train_y)

                score = lr.score(self.train_x[tmp_combo], self.train_y)
                
                # if is new max of this round, then update (ties keep the earlier feature)
                if added_feature is None or score > best_score:
                    added_feature = feature
                    best_score = score
            
            curr_combo.append(added_feature)
            remaining_features.remove(added_feature) # remove added feature

            # clamp at 0 so a score marginally below zero cannot turn into NaN
            root_score = np.sqrt(max(best_score, 0.0))
            print(f'Current combination: {curr_combo}')
            print(f'Best score: {root_score}\n')
            
            if i+1 >= min_features:
                # store in ningxiang output
                self.ningxiang_output[tuple(curr_combo)] = root_score
        
        return self.ningxiang_output

    

    def get_rf_based_feature_combinations(self, min_features = 0):
        """ Gets NingXiang scores based on RF feature importance

        Fits one Random Forest (regressor or classifier, depending on the
        recorded model type) on all features, then builds the combinations from
        its feature importances.

        min_features: combinations with fewer features than this are left out
        of the output.

        Returns the output dict, or None (after printing a message) if the
        model type or the train data are missing or the model type is not
        recognised.
        """
        
        if self.clf_type is None:
            print('clf_type not found, please run set_model_type() first')
            return

        if self.train_x is None or self.train_y is None:
            print('train_x and train_y not found, please run read_in_train_data')
            return
        

        # Initialise the Random Forest objects (same hyperparameters for both types)
        rf_params = dict(n_estimators = 100, max_depth = 12, max_features = 0.75,
                         random_state = self._seed, ccp_alpha = 0, max_samples = 0.75)
        if self.clf_type == 'Regression':
            rf = RandomForestRegressor(**rf_params)
        elif self.clf_type == 'Classification':
            rf = RandomForestClassifier(**rf_params)
        else:
            # only reachable if clf_type was assigned without set_model_type()
            print(f'clf_type {self.clf_type} not recognised, please run set_model_type() again')
            return
        
        print('Begin fitting Random Forest')
        # fit the model and get the feature importances
        rf.fit(self.train_x, self.train_y)
        print('Finished fitting Random Forest')
        feature_importance = dict(zip(self.train_x.columns, rf.feature_importances_))

        # use handle (which can be used on its own) to generate the ningxiang output
        self.ningxiang_output = self.get_rf_based_feature_combinations_from_feature_importance(feature_importance, min_features)

        return self.ningxiang_output
    


    def get_rf_based_feature_combinations_from_feature_importance(self, feature_importance, min_features = 0):
        """ Takes in a dictionary of feature:importance and returns the NingXiang output

        The output maps each combination (the top-k features by importance, as
        a tuple) to the sum of the importances of its members. The result is
        also stored in `ningxiang_output`.
        """
        
        # sort features by feature importance (reversed order)
        feature_importance_sorted = self._sort_dict_reverse(feature_importance)
        # get the features in the output format that can be linked up with other JiaXing packages
        self.ningxiang_output = self._get_ningxiang_rf_output(feature_importance_sorted, min_features)

        return self.ningxiang_output
        
    
    
    def _sort_dict_reverse(self, features):
        """ Helper to sort a dictionary by value, largest first (ties keep input order) """

        return dict(sorted(features.items(), key=lambda item: item[1], reverse=True))
    


    def _get_ningxiang_rf_output(self, features_reverse_sorted, min_features):
        """ Helper to get rf feature importance into ningxiang output format

        Walks the features in the order given, accumulating the importance, and
        records every combination that has at least min_features features.
        """

        out = dict() # NingXiang output object must be a dict

        feature_combo = list()
        score = 0

        # Continuously add feature and its score
        for n_features, (feature, importance) in enumerate(features_reverse_sorted.items(), start=1):
            feature_combo.append(feature)
            score += importance

            if n_features >= min_features:
                out[tuple(feature_combo)] = score

        return out



    def _set_object_saving_address(self, address):
        """ Read in where to save the NingXiang output object """

        self.object_saving_address = address
        print('Successfully set object output address')

    

    def export_ningxiang_output(self, address):
        """ Export NingXiang's output object as a pickle file

        address: path (string) of the file to write. The '.pickle' extension is
        appended unless the address already ends with it.
        """

        self._set_object_saving_address(address)

        # Export
        # Only a trailing '.pickle' is stripped, so a '.pickle' elsewhere in
        # the path (for example in a directory name) is left untouched
        suffix = '.pickle'
        object_saving_address_split = self.object_saving_address
        if object_saving_address_split.endswith(suffix):
            object_saving_address_split = object_saving_address_split[:-len(suffix)]

        with open(f'{object_saving_address_split}{suffix}', 'wb') as f:
            pickle.dump(self.ningxiang_output, f)

In [4]:
# Load the curated training table that feeds the NingXiang feature search
train_csv_path = '../data/curated/modelling/N_Train_3.csv'
train_data = pd.read_csv(train_csv_path)

In [5]:
# Separate the label (column '3') from the predictor columns
label_column = '3'
train_y = train_data[label_column]
train_x = train_data.drop(columns=[label_column])

In [6]:
# Create the NingXiang helper; the constructor prints a confirmation message
ningxiang = NingXiang()

NingXiang Initialised


In [7]:
# Hand the predictors and the label to the helper
ningxiang.read_in_train_data(train_x=train_x, train_y=train_y)

Read in Train X data
Read in Train y data


In [8]:
# The label is treated as continuous, so record a regression problem
ningxiang.set_model_type(type='Regression')

Successfully recorded model type: Regression


In [10]:
# Random Forest importance ranking; keep combinations of at least 2 features
ningxiang.get_rf_based_feature_combinations(min_features=2)

Begin fitting Random Forest
Finished fitting Random Forest


{('Disposals BTN', 'Goals BTN'): 0.21288734108805263,
 ('Disposals BTN', 'Goals BTN', 'Disposals OTN'): 0.2506092197240697,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN'): 0.2877890639702431,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN'): 0.3226766604345473,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN',
  'Score Involvements BTN'): 0.35715385939813754,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN',
  'Score Involvements BTN',
  'Inside 50s BTN'): 0.38914056715539597,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN',
  'Score Involvements BTN',
  'Inside 50s BTN',
  'Contested Possessions BTN'): 0.42057906566349346,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN',
  'Score Involvements BTN',
  'Inside 50s BTN',
  'Contested

In [11]:
# Greedy Linear Regression forward selection; keep combinations of at least 3 features
ningxiang.get_lr_based_feature_combinations(min_features=3)

Up to 1th variable
Current combination: ['Disposals BTN']
Best score: 0.218210154503265

Up to 2th variable
Current combination: ['Disposals BTN', 'Goals BTN']
Best score: 0.25852187622399064

Up to 3th variable
Current combination: ['Disposals BTN', 'Goals BTN', 'Goals OTN']
Best score: 0.267958934526437

Up to 4th variable
Current combination: ['Disposals BTN', 'Goals BTN', 'Goals OTN', 'Clearances BTN']
Best score: 0.2771774187226342

Up to 5th variable
Current combination: ['Disposals BTN', 'Goals BTN', 'Goals OTN', 'Clearances BTN', 'Uncontested Possessions OTN']
Best score: 0.27795250154096285

Up to 6th variable
Current combination: ['Disposals BTN', 'Goals BTN', 'Goals OTN', 'Clearances BTN', 'Uncontested Possessions OTN', 'Handballs OTN']
Best score: 0.27840596652960686

Up to 7th variable
Current combination: ['Disposals BTN', 'Goals BTN', 'Goals OTN', 'Clearances BTN', 'Uncontested Possessions OTN', 'Handballs OTN', 'Ineffective Disposals OTN']
Best score: 0.2788152394007767

{('Disposals BTN', 'Goals BTN', 'Goals OTN'): 0.267958934526437,
 ('Disposals BTN',
  'Goals BTN',
  'Goals OTN',
  'Clearances BTN'): 0.2771774187226342,
 ('Disposals BTN',
  'Goals BTN',
  'Goals OTN',
  'Clearances BTN',
  'Uncontested Possessions OTN'): 0.27795250154096285,
 ('Disposals BTN',
  'Goals BTN',
  'Goals OTN',
  'Clearances BTN',
  'Uncontested Possessions OTN',
  'Handballs OTN'): 0.27840596652960686,
 ('Disposals BTN',
  'Goals BTN',
  'Goals OTN',
  'Clearances BTN',
  'Uncontested Possessions OTN',
  'Handballs OTN',
  'Ineffective Disposals OTN'): 0.27881523940077674,
 ('Disposals BTN',
  'Goals BTN',
  'Goals OTN',
  'Clearances BTN',
  'Uncontested Possessions OTN',
  'Handballs OTN',
  'Ineffective Disposals OTN',
  'Contested Possessions BTN'): 0.27925722157284716,
 ('Disposals BTN',
  'Goals BTN',
  'Goals OTN',
  'Clearances BTN',
  'Uncontested Possessions OTN',
  'Handballs OTN',
  'Ineffective Disposals OTN',
  'Contested Possessions BTN',
  'Contested Pos