In [10]:
# 17/02/2023

In [11]:
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


In [12]:
class NingXiang:
    """Rank features by random-forest importance and build nested feature combinations.

    Typical usage:
        1. ``read_in_train_data(train_x, train_y)``
        2. ``set_model_type('Regression')`` or ``set_model_type('Classification')``
        3. ``get_rf_based_feature_combinations()``
        4. ``export_ningxiang_output(address)`` (optional)

    The output object is a dict whose keys are tuples of feature names (the
    top-1 feature, the top-2 features, ... in descending importance) and whose
    values are the cumulative sum of the importances of the features in the key.
    """

    _PICKLE_SUFFIX = '.pickle'

    def __init__(self):
        """ Initialise class """

        self._initialise_objects()

        print('NingXiang Initialised')

    def _initialise_objects(self):
        """ Helper to reset every attribute to its unset default """

        # fixed seed so the fitted forest (and therefore the ranking) is reproducible
        self._seed = 18981124
        self.train_x = None
        self.train_y = None
        self.clf_type = None
        self.ningxiang_output = None
        self.object_saving_address = None

    def read_in_train_data(self, train_x, train_y):
        """ Reads in train data for building NingXiang output object

        The objects are stored as given (no copy, no validation).
        ``get_rf_based_feature_combinations`` later reads ``train_x.columns``,
        so ``train_x`` is presumably a pandas DataFrame -- TODO confirm for other callers.
        """

        self.train_x = train_x
        print("Read in Train X data")

        self.train_y = train_y
        print("Read in Train y data")

    def set_model_type(self, type):
        """ Records which kind of random forest to fit

        Parameters
        ----------
        type : str
            Either 'Classification' or 'Regression'. The parameter name shadows
            the ``type`` builtin; it is kept so keyword callers keep working.

        Raises
        ------
        AssertionError
            If ``type`` is neither of the two accepted values (note that
            assertions are skipped when Python runs with ``-O``).
        """

        assert type in ('Classification', 'Regression'), \
            "type must be 'Classification' or 'Regression'"

        self.clf_type = type

        print(f'Successfully recorded model type, which is a {self.clf_type} model')

    def get_rf_based_feature_combinations(self):
        """ Gets NingXiang scores based on RF feature importance

        Fits a random forest on the stored train data, ranks the columns of
        ``train_x`` by ``feature_importances_`` and converts the ranking into
        the NingXiang output dict (also stored on ``self.ningxiang_output``).

        Returns
        -------
        dict or None
            The NingXiang output dict, or None (after printing a hint) when the
            model type or the train data has not been provided yet.

        Raises
        ------
        ValueError
            If ``clf_type`` holds a value other than 'Regression' or
            'Classification' (only possible when it was set without going
            through ``set_model_type`` or assertions are disabled).
        """

        if self.clf_type is None:
            print('clf_type not found, please run set_model_type() first')
            return

        if self.train_x is None or self.train_y is None:
            print('train_x and train_y not found, please run read_in_train_data')
            return

        # Both forest flavours share exactly the same hyper-parameters
        rf_params = dict(
            n_estimators=100,
            max_depth=12,
            max_features=0.75,
            random_state=self._seed,
            ccp_alpha=0,
            max_samples=0.75,
        )

        # Initialise the Random Forest object
        if self.clf_type == 'Regression':
            rf = RandomForestRegressor(**rf_params)
        elif self.clf_type == 'Classification':
            rf = RandomForestClassifier(**rf_params)
        else:
            # without this branch an unexpected value would surface as an
            # UnboundLocalError on `rf` a few lines further down
            raise ValueError(f'Unsupported clf_type: {self.clf_type!r}')

        # fit the model and pair every column name with its importance
        rf.fit(self.train_x, self.train_y)
        feature_importance = dict(zip(self.train_x.columns, rf.feature_importances_))

        # sort features by feature importance (reversed order)
        feature_importance_sorted = self._sort_dict_reverse(feature_importance)
        # get the features in the output format used downstream
        # (the original note mentions linking up with other JiaXing packages)
        self.ningxiang_output = self._get_ningxiang_rf_output(feature_importance_sorted)

        return self.ningxiang_output

    def _sort_dict_reverse(self, features):
        """ Helper to sort a dict by value, largest first

        Ties keep their original relative order because the sort is stable.
        """

        ranked = sorted(features.items(), key=lambda item: item[1], reverse=True)

        return dict(ranked)

    def _get_ningxiang_rf_output(self, features_reverse_sorted):
        """ Helper to get rf feature importance into ningxiang output format

        Walks the features in the order given and, after adding each one,
        records the tuple of all features seen so far together with the running
        total of their importances.
        """

        out = dict() # NingXiang output object must be a dict

        feature_combo = list()
        score = 0

        # Continuously add feature and its score
        for feature, importance in features_reverse_sorted.items():
            feature_combo.append(feature)
            score += importance

            out[tuple(feature_combo)] = score

        return out

    def _set_object_saving_address(self, address):
        """ Read in where to save the NingXiang output object """

        self.object_saving_address = address
        print('Successfully set object output address')

    def export_ningxiang_output(self, address):
        """ Export NingXiang's output object as a pickle file

        Parameters
        ----------
        address : str
            Target path, with or without a trailing '.pickle'. The file is
            always written with exactly one '.pickle' suffix.

        Nothing is written (a hint is printed instead) when no output has been
        computed yet.
        """

        self._set_object_saving_address(address)

        if self.ningxiang_output is None:
            print('ningxiang_output not found, please run get_rf_based_feature_combinations() first')
            return

        # Only strip a *trailing* '.pickle'; splitting on the first occurrence
        # would truncate paths such as 'runs.pickle_v2/out'
        base_address = self.object_saving_address
        if base_address.endswith(self._PICKLE_SUFFIX):
            base_address = base_address[:-len(self._PICKLE_SUFFIX)]

        # Export
        with open(f'{base_address}{self._PICKLE_SUFFIX}', 'wb') as f:
            pickle.dump(self.ningxiang_output, f)

In [13]:
# Load the curated modelling table that the feature ranking is fitted on
train_csv_path = '../data/curated/modelling/N_Train_3.csv'
train_data = pd.read_csv(train_csv_path)

In [14]:
# Column '3' of the training table is the response; every other column is a feature.
# Naming it once keeps the feature/target split from drifting apart.
TARGET_COLUMN = '3'

train_x = train_data.drop(columns=[TARGET_COLUMN])
train_y = train_data[TARGET_COLUMN]

In [15]:
# Create the helper object; the constructor resets its state and prints a confirmation
ningxiang = NingXiang()

NingXiang Initialised


In [16]:
# Hand the features and the target to the instance (stored as given, no copy or validation)
ningxiang.read_in_train_data(train_x, train_y)

Read in Train X data
Read in Train y data


In [17]:
# 'Regression' selects the RandomForestRegressor branch when the forest is fitted
ningxiang.set_model_type('Regression')

Successfully recorded model type, which is a Regression model


In [18]:
# Fit the forest and show the nested feature combinations with their cumulative importance
ningxiang.get_rf_based_feature_combinations()

{('Disposals BTN',): 0.12391092852696492,
 ('Disposals BTN', 'Goals BTN'): 0.21288734108805263,
 ('Disposals BTN', 'Goals BTN', 'Disposals OTN'): 0.2506092197240697,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN'): 0.2877890639702431,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN'): 0.3226766604345473,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN',
  'Score Involvements BTN'): 0.35715385939813754,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN',
  'Score Involvements BTN',
  'Inside 50s BTN'): 0.38914056715539597,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN',
  'Score Involvements BTN',
  'Inside 50s BTN',
  'Contested Possessions BTN'): 0.42057906566349346,
 ('Disposals BTN',
  'Goals BTN',
  'Disposals OTN',
  'Goals OTN',
  'Effective Disposals BTN',
  'Score Involveme