In [3]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta, datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [4]:

random.seed(42)
np.random.seed(42)


num_rows = 10000
person_id_range = (1, 3000)  
disease_list = ['Disease_A', 'Disease_B', 'Disease_C', 'Disease_D', 'Disease_E']  
date_range = (datetime(2019, 1, 1), datetime(2023, 12, 31)) 
age_range = (20,80) 
gender_binary = ['FEMALE', 'MALE']

data = {
    'person_id': np.random.randint(*person_id_range, size=num_rows),
    'event_concept_name': np.random.choice(disease_list, size=num_rows),
    'svcdate': [date_range[0] + timedelta(days=random.randint(0, (date_range[1] - date_range[0]).days)) for _ in range(num_rows)],
    'age': np.random.randint(*age_range, size=num_rows),
    'gender_concept_name': np.random.choice(gender_binary, size=num_rows),
}       


df = pd.DataFrame(data)

print(df)  


      person_id event_concept_name    svcdate  age gender_concept_name
0           861          Disease_D 2022-08-02   36                MALE
1          1295          Disease_E 2019-08-17   39              FEMALE
2          1131          Disease_E 2019-02-21   76                MALE
3          1096          Disease_D 2023-02-27   32              FEMALE
4          1639          Disease_B 2020-07-17   63                MALE
...         ...                ...        ...  ...                 ...
9995       1178          Disease_E 2023-07-06   51              FEMALE
9996       1950          Disease_B 2020-02-20   72                MALE
9997        878          Disease_D 2021-05-22   50              FEMALE
9998       1139          Disease_A 2020-11-21   36                MALE
9999       1578          Disease_E 2023-07-20   42              FEMALE

[10000 rows x 5 columns]


In [5]:
class DiseaseTable:
    
    def __init__(self, name):
        self.originalDf = None # initiate the original dataframe as df
        self.df = None  # initiate the transformed dataframe as df 
        self.count = None # initiate disease type count for the df
        self.countEach = None # initiate each disease count for the df
        self.duration = None # initiate duration for the dfff
        self.firstCondition = None # initiate the person's first disease for the df
        self.firstIniQuar = None # initiate the person's firstdisease initial quarter for the df (dummy)
        self.OtherIniQuar = None # initiate the person's firstdisease initial quarter for the df (dummy)
        self.conditionPair = None # initiate the person's condition pairs
        self.lastConditionPair = None # initiate the person's last condition pairs
        self.lastCondition = None # initiate the person's last condition
        self.mergedDf = None # initiate the person's all merged dataframe
        self.name = name # Give the name for this object
        self.ageGenderDf = None # initiate the person's age and gender as df
        
    def loadData(self, data):
        """
        load the data to the object as the original table aka df
        
        args:
        - data(dataframe or other format table): the data which will be added into the object as the
                original table aka df, could be a dataframe, also could be other format and transfer
                to dataframe
        """
        
        if isinstance(data, pd.DataFrame):
            self.originalDf = data
        else:
            self.originalDf = pd.DataFrame(data)
    
    def transformDf(self):
        """
        transformed the original dataframe and store it in the attribute df
        """
        
        # Sort the df by person_id and svcdate to prepare for find the last condition
        dfSorted = self.originalDf.sort_values(by=['person_id', 'svcdate'])
        
        # Keep the first appearance condition rows for each condition for each person
        firstAppearance = dfSorted.groupby(['person_id', 'event_concept_name']).first().reset_index()
        
        # Sort the firstAppearance dataframe by person_id and svcdate
        firstAppearanceSorted = firstAppearance.sort_values(by=['person_id','svcdate'])
        
        # Get a dataframe to show each person's last appearance condition
        lastAppearance = firstAppearanceSorted.groupby('person_id').last().reset_index()
        
        # Use merge function to merge originalDf and lastAppearance and only keep the left_only,
        # which means that the dataframe will only keep the rows except lastAppearance
        mergedDf = pd.merge(self.originalDf, lastAppearance[['person_id', 'event_concept_name']],on=['person_id', 'event_concept_name'], how='left', indicator=True).sort_values(by=['person_id'])
        
        filteredDf = mergedDf[mergedDf['_merge'] == 'left_only'].sort_values(by=['person_id'])
        
        filteredDf = filteredDf.drop(columns=['_merge']).sort_values(by=['person_id'])
        
        self.df = filteredDf
        
    def getAgeGender(self):
        """
        Convert Gender as binary type and convert the age to categorical type
        """
        
        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")
        
        #sort the dataframe to by person_id and svcdate
        df_sorted = self.df.sort_values(by=['person_id', 'svcdate']).copy()
        
        first_occurrences = df_sorted.groupby('person_id').first().reset_index()
        
        result = first_occurrences.loc[:, ['person_id', 'age', 'gender_concept_name']]
    
        bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        labels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

        # convert age to categories 
        result['age_category'] = pd.cut(result['age'], bins=bins, labels=labels, right=False)

        # convert gender_concept_name as 0 and 1
        result['gender'] = result['gender_concept_name'].map({'MALE': 0, 'FEMALE': 1})

        result = result.drop(columns=['gender_concept_name'])
        
        result = result.rename(columns={'index': 'person_id'}).set_index('person_id')
        
        age_dummies = pd.get_dummies(result['age_category'], prefix='age')

        result = pd.concat([result, age_dummies], axis=1)
        
        result = result.drop(columns=['age_category','age'])
        
        self.ageGenderDf = result
        
    def countEachType(self):
        """
        Generate everyone's number of each type conditions appearance for the person has in this dataframe
        """
        
        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")
        
        stats = self.df.groupby(['person_id', 'event_concept_name']).size().unstack(fill_value=0)
        
        stats.columns = [f'{col}_count' for col in stats.columns]
        
        self.countEach = stats
        
        
    def countTypes(self):
        """
        Generate everyone's number of type conditions the person has in this dataframe
        """
        
        if self.countEach is None:
            raise ValueError("countEach is not loaded. Please load data first using `CountEachType` method.")
        
        diseaseTypeCount = self.countEach.apply(lambda x: (x > 0).sum(), axis=1)
        
        count_df = pd.DataFrame(diseaseTypeCount, columns=['ConditionTypeCount'])
        
        self.count = count_df
        
    def durationCalc(self):
        """
        Generate everyone's duration as the first recording to last recording gap in this dataframe
        """
        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        duration_df = self.df.groupby('person_id')['svcdate'].agg(['min', 'max'])

        diseaseDuration = (duration_df['max'] - duration_df['min']).dt.days
        count_duration = pd.DataFrame(diseaseDuration, columns=['duration'])
        
        self.duration = count_duration
        
    
    def firstConditionDummy(self):
        """
        Get everyone's first condition recording.
        """
        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        sorted_df = self.df.sort_values(by=['person_id', 'svcdate'])

        first_records = sorted_df.drop_duplicates(subset=['person_id'], keep='first')

        self.firstCondition = pd.get_dummies(first_records.set_index('person_id')['event_concept_name'], prefix='FirstConidtion')
        
        
    def firstConditionQuarterDummy(self):
        """
        set everyone's first condition recording quarter and set it as dummy variables.
        """
        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        # sort by person id and svcdate
        sorted_df = self.df.sort_values(by=['person_id', 'svcdate'])

        # get the first condition recording and generate a copy
        first_records = sorted_df.drop_duplicates(subset=['person_id'], keep='first').copy()

        # get every svcdate quarter and give values
        first_records['Quarter'] = first_records['svcdate'].dt.quarter

        # create quarters dummy variables
        quarter_dummies = pd.get_dummies(first_records['Quarter'], prefix='FirstDiseaseQuarter')

        # set person_id as the index of the dummy variabls
        self.firstIniQuar = quarter_dummies.set_index(first_records['person_id'])
        

    def conditionQuarterDummies(self):
        
        """
        Create a every condition quarter recording as a dummy variables
        """
        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        final_dummies = pd.DataFrame(index=self.df['person_id'].unique())

        diseases = self.df['event_concept_name'].unique()

        for disease in diseases:
            # give a copy for the disease df
            disease_df = self.df[self.df['event_concept_name'] == disease].copy()

            # get the quarter of the disease recording happened in 
            disease_df['Quarter'] = disease_df['svcdate'].dt.quarter

            # get the dummy for everyone's every conditions
            quarter_dummies = pd.get_dummies(disease_df.set_index('person_id')['Quarter'], prefix=f'{disease}_Quarter')

            # combination the dummy variables with the final DataFrame 
            final_dummies = final_dummies.join(quarter_dummies, how='left')

        # Convert the NaN value to False
        final_dummies.fillna(False, inplace=True)

        # Reset index by 'person_id'
        final_dummies.reset_index(inplace=True)
        final_dummies.rename(columns={'index': 'person_id'}, inplace=True)

        # Only keep the max as 1 in the df
        final_consolidation = final_dummies.groupby('person_id').max()
        self.OtherIniQuar = final_consolidation



    def conditionToCondition(self):
        """
        Generate a matrix of any condition that occurs after any condtions for each person, with 1 in the condtion->condition position.
        """
        
        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        # Get all unique diseases
        diseases = self.df['event_concept_name'].unique()
        
        # Sort the df by person_id and svcdate to prepare for find the last condition
        df_sorted = self.df.sort_values(by=['person_id', 'svcdate']).copy()
        
        # compare with the previous 
        df_sorted['new_event'] = df_sorted.groupby('person_id')['event_concept_name'].shift() != df_sorted['event_concept_name']
        
        print(df_sorted)
        # Keep the first 
        filtered_df = df_sorted[df_sorted['new_event'].fillna(True)]
        
        filtered_df = filtered_df.drop(columns=['new_event'])
        # Initialize matrix with all possible disease transitions set to 0
        transition_matrix = pd.DataFrame(0, index=self.df['person_id'].unique(), columns=[f'{d1}->{d2}' for d1 in diseases for d2 in diseases if d1 != d2])

        # Iterate through each person_id
        for person_id in transition_matrix.index:
            # Get all records of this person_id and sort by svcdate
            records = filtered_df[filtered_df['person_id'] == person_id].sort_values(by='svcdate')

            # Iterate through each disease and see what diseases appear after it
            for i, row in records.iterrows():
                current_disease = row['event_concept_name']
                # Get all conditions after the current condition
                subsequent_diseases = records[records['svcdate'] > row['svcdate']]['event_concept_name'].unique()

                # Mark all conditions that appear after the current disease converted to 1
                for subsequent_disease in subsequent_diseases:
                    if current_disease != subsequent_disease:  
                        transition_matrix.at[person_id, f'{current_disease}->{subsequent_disease}'] = 1
        
        # column_sums = df.sum()  
        # columns_to_drop = column_sums[column_sums == 0].index
        # transition_matrix.drop(columns=columns_to_drop, inplace=True)          

        # Reset index and then set 'person_id' as the new index
        self.conditionPair = transition_matrix.reset_index().rename(columns={'index': 'person_id'}).set_index('person_id')

    def lastCondtionToCondition(self):
        """
        Generate a matrix of only the last condition that occurs after any condtions for each person, with 1 in the condtion->condition position.
        """

        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        # Get all unique diseases
        diseases = self.df['event_concept_name'].unique()
        
        # Sort the df by person_id and svcdate to prepare for find the last condition
        dfSorted = self.originalDf.sort_values(by=['person_id', 'svcdate'])
        
        # Keep the first appearance condition rows for each condition for each person
        firstAppearance = dfSorted.groupby(['person_id', 'event_concept_name']).first().reset_index()
        
        # Initialize matrix with all possible disease transitions set to 0
        transition_matrix = pd.DataFrame(0, index=self.df['person_id'].unique(), columns=[f'{d1}->{d2}' for d1 in diseases for d2 in diseases if d1 != d2])

        # Iterate through each person_id
        for person_id in transition_matrix.index:
            # Get all records of this person_id and sort by svcdate
            records = firstAppearance[firstAppearance['person_id'] == person_id].sort_values(by='svcdate')

            # Iterate through each disease and see what diseases appear after it
            for i, row in records.iterrows():
                current_disease = row['event_concept_name']
                # Get all conditions after the current condition
                subsequent_diseases = records[records['svcdate'] > row['svcdate']]['event_concept_name'].unique()

                # Mark all conditions that appear after the current disease converted to 1
                for subsequent_disease in subsequent_diseases:
                    if current_disease != subsequent_disease:  
                        transition_matrix.at[person_id, f'{current_disease}->{subsequent_disease}'] = 1
                        
        
        # Reset index and then set 'person_id' as the new index
        newConditionPair = transition_matrix.reset_index().rename(columns={'index': 'person_id'}).set_index('person_id')
        
        # Use XOR to keep the last condition pairs
        lastPair = newConditionPair ^ self.conditionPair
        self.lastConditionPair = lastPair
    
    
    
    def mergeDataframes(self):
        """
        Merge all dataframes in the object except for 'conditionPair' and 'df',
        using 'person_id' as the key.
        """
        if self.df is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        # Create a list to hold all dataframes to be merged
        dfs_to_merge = []

        # Check each attribute to see if it's a DataFrame and add it to the list
        # Exclude 'conditionPair' and 'df' explicitly
        for attr in [self.count, self.ageGenderDf, self.countEach, self.duration, self.firstCondition, self.firstIniQuar, self.conditionPair]:
        # for attr in [self.count, self.countEach, self.duration, self.firstDisease, self.firstIniQuar, self.conditionPair]:
            if isinstance(attr, pd.DataFrame):
                dfs_to_merge.append(attr)

        # Check if there is at least one DataFrame to merge
        if not dfs_to_merge:
            raise ValueError("No additional dataframes are loaded to merge.")

        # Merge all dataframes in the list along axis=1 (columns)
        # This assumes that 'person_id' is the index for all dataframes
        merged_df = pd.concat(dfs_to_merge, axis=1)

        
        self.mergedDf = merged_df

    def GetLastCondition(self):
        """
        Generate a matrix of only the last condition that occurs after any condtions for each person
        """
        
        # sort by person_id and svcdate
        sort_ed = self.df.sort_values(by=['person_id','svcdate'])
        
        # Only keep the last condition
        last_condition = sort_ed.drop_duplicates('person_id', keep='last')
        
        # Reset the index
        Reset_last_condition = last_condition.reset_index(drop =True)
        #reset the index
        # last_condition = last_condition.reset_index().rename(columns={'index': 'person_id'}).set_index('person_id')
        # newlast_condition = last_condition.reset_index().rename(columns={'index': 'person_id'}).set_index('person_id')
        new_ConditionPair = Reset_last_condition.rename(columns={'index': 'person_id'}).set_index('person_id')
        
        self.lastCondition = new_ConditionPair.drop(columns=['svcdate','age','gender_concept_name'])
    
    def directDeal(self, data):
        self.loadData(data)
        self.transformDf()
        self.getAgeGender()
        self.countEachType()
        self.countTypes()
        self.durationCalc()
        self.firstConditionDummy()
        self.firstConditionQuarterDummy()
        # self.conditionQuarterDummies()
        self.conditionToCondition()
        # self.lastCondtionToCondition()
        self.GetLastCondition()
        self.mergeDataframes()
        
        
        

In [6]:
dt = DiseaseTable('A')
dt.directDeal(data)
dt.mergedDf

      person_id event_concept_name    svcdate  age gender_concept_name  \
7045          2          Disease_D 2019-12-06   23              FEMALE   
1860          2          Disease_C 2021-07-07   37              FEMALE   
1987          2          Disease_D 2023-10-02   67              FEMALE   
1599          3          Disease_D 2020-09-10   25              FEMALE   
425           5          Disease_B 2022-03-12   27              FEMALE   
...         ...                ...        ...  ...                 ...   
593        2996          Disease_C 2019-08-10   25                MALE   
1550       2997          Disease_A 2020-08-23   21              FEMALE   
3329       2999          Disease_D 2020-01-14   23                MALE   
6727       2999          Disease_D 2020-06-13   64              FEMALE   
4816       2999          Disease_D 2020-06-22   22              FEMALE   

      new_event  
7045       True  
1860       True  
1987       True  
1599       True  
425        True  
...

Unnamed: 0_level_0,ConditionTypeCount,gender,age_0-10,age_10-20,age_20-30,age_30-40,age_40-50,age_50-60,age_60-70,age_70-80,...,Disease_B->Disease_A,Disease_B->Disease_E,Disease_A->Disease_D,Disease_A->Disease_C,Disease_A->Disease_B,Disease_A->Disease_E,Disease_E->Disease_D,Disease_E->Disease_C,Disease_E->Disease_B,Disease_E->Disease_A
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2,1,False,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,1,1,False,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
5,1,1,False,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
6,1,0,False,False,False,False,False,True,False,False,...,0,0,0,0,0,0,0,0,0,0
8,1,1,False,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,1,0,False,False,False,False,False,True,False,False,...,0,0,0,0,0,0,0,0,0,0
2995,3,1,False,False,False,True,False,False,False,False,...,1,0,1,0,1,0,0,0,0,0
2996,1,0,False,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
2997,1,1,False,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0


In [7]:
dt.conditionPair

Unnamed: 0_level_0,Disease_D->Disease_C,Disease_D->Disease_B,Disease_D->Disease_A,Disease_D->Disease_E,Disease_C->Disease_D,Disease_C->Disease_B,Disease_C->Disease_A,Disease_C->Disease_E,Disease_B->Disease_D,Disease_B->Disease_C,Disease_B->Disease_A,Disease_B->Disease_E,Disease_A->Disease_D,Disease_A->Disease_C,Disease_A->Disease_B,Disease_A->Disease_E,Disease_E->Disease_D,Disease_E->Disease_C,Disease_E->Disease_B,Disease_E->Disease_A
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2995,0,1,1,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
dt.getAgeGender()
dt.ageGenderDf

Unnamed: 0_level_0,gender,age_0-10,age_10-20,age_20-30,age_30-40,age_40-50,age_50-60,age_60-70,age_70-80,age_80-90,age_90-100
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,1,False,False,True,False,False,False,False,False,False,False
3,1,False,False,True,False,False,False,False,False,False,False
5,1,False,False,True,False,False,False,False,False,False,False
6,0,False,False,False,False,False,True,False,False,False,False
8,1,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
2994,0,False,False,False,False,False,True,False,False,False,False
2995,1,False,False,False,True,False,False,False,False,False,False
2996,0,False,False,True,False,False,False,False,False,False,False
2997,1,False,False,True,False,False,False,False,False,False,False


In [9]:
class SplittedTable:
    def __init__(self,name):
        self.allX = None # iniitiate the total X dataframe as none
        self.allY = None # initiate the total Y dataframe as none
        self.trainX = None # initiate the train X dataframe as none
        self.trainY = None # initiate the train Y dataframe as none
        self.testX = None # initiate the test X dataframe as none
        self.testY = None # initiate the test Y dataframe as none
        self.randomSeed = None  # initiate the random seed as none
        self.name = name # set hte name of this train and test set
        
    def setRandomSeed(self, seed):
        """
        Set the random seed for data splitting.

        Args:
            seed (int): The random seed value.
        """
        self.randomSeed = seed
        
    def loadData(self, inputX, inputY):
        """
        Get input from other dataframe or other format data

        Args:
            inputX (dataframe or other format data): Data to be stored as allX.
            inputY (dataframe or other format data):  Data to be stored as allY.
        """
        # Check if inputX is a DataFrame or convertible to a DataFrame
        if not isinstance(inputX, pd.DataFrame):
            try:
                self.allX = pd.DataFrame(inputX)
            except Exception as e:
                raise ValueError(f"inputX cannot be converted to a DataFrame: {e}")
        else:
            self.allX = inputX

        # Check if inputY is a DataFrame or convertible to a DataFrame
        if not isinstance(inputY, pd.DataFrame):
            try:
                self.allY = pd.DataFrame(inputY)
            except Exception as e:
                raise ValueError(f"inputY cannot be converted to a DataFrame: {e}")
        else:
            self.allY = inputY
            
    def splitData(self, train_size=0.8):
        """
            Split allX and allY into train set and test set based on the train_size(default 0.8)
            and random seed default is 42

            Args:
                train_size (int or float): If it's float, represent the proportion of the train sample size
                                            If it's int, represent the absolute num of the train sample size
        """
            
        if self.allX is None or self.allY is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        if not isinstance(train_size, (float, int)):
            raise ValueError("train_size must be a float or an int.")

        if isinstance(train_size, float) and not (0 < train_size < 1):
            raise ValueError("If train_size is a float, it must be between 0 and 1.")

        # Use the set random seed if available, otherwise default to 42
        random_state = self.randomSeed if self.randomSeed is not None else 42

        # Split allX and allY into training and test sets while keeping person_id aligned
        self.trainX, self.testX, self.trainY, self.testY = train_test_split(
            self.allX, self.allY, train_size=train_size, random_state=random_state, stratify=self.allY.index if 'person_id' in self.allY else None)

        # Ensure that the indices (person_id) are carried over to the new DataFrames
        self.trainY.index = self.trainX.index
        self.testY.index = self.testX.index

In [10]:
st = SplittedTable('A')
st.loadData(inputX = dt.mergedDf, inputY = dt.lastCondition)
st.allY
st.splitData()


In [11]:
st.trainX

Unnamed: 0_level_0,ConditionTypeCount,gender,age_0-10,age_10-20,age_20-30,age_30-40,age_40-50,age_50-60,age_60-70,age_70-80,...,Disease_B->Disease_A,Disease_B->Disease_E,Disease_A->Disease_D,Disease_A->Disease_C,Disease_A->Disease_B,Disease_A->Disease_E,Disease_E->Disease_D,Disease_E->Disease_C,Disease_E->Disease_B,Disease_E->Disease_A
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2669,4,0,False,False,False,False,True,False,False,False,...,0,1,0,1,1,1,0,1,1,0
579,2,0,False,False,False,False,False,False,True,False,...,0,0,0,0,0,0,0,0,0,0
2181,3,0,False,False,False,True,False,False,False,False,...,1,1,0,0,1,1,0,0,1,1
1099,4,0,False,False,False,False,False,True,False,False,...,0,0,1,1,1,0,0,0,0,0
2874,1,1,False,False,False,False,False,True,False,False,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2048,1,0,False,False,False,False,False,True,False,False,...,0,0,0,0,0,0,0,0,0,0
1354,1,1,False,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
1396,1,1,False,False,False,False,True,False,False,False,...,0,0,0,0,0,0,0,0,0,0
1603,2,1,False,False,True,False,False,False,False,False,...,0,0,1,0,0,0,0,0,0,0


In [12]:
st.trainY

Unnamed: 0_level_0,event_concept_name
person_id,Unnamed: 1_level_1
2669,Disease_E
579,Disease_B
2181,Disease_E
1099,Disease_B
2874,Disease_E
...,...
2048,Disease_C
1354,Disease_B
1396,Disease_C
1603,Disease_D


In [26]:
class ModelTrain:
    def __init__(self,name):
        self.trainX = None # initiate the train X dataframe as none
        self.trainY = None # initiate the train Y dataframe as none
        self.testX = None # initiate the test X dataframe as none
        self.testY = None # initiate the test Y dataframe as none 
        self.randomSeed = None # initiate the randSeed as none  
        self.models = None # initiate the model series as none
        self.modelPred = None # initiate the model prediction output
        self.labelEncode = None # initiate the label Encode object
        self.dtrain = None # initiate the format train set
        self.dtest = None # initiate the format test set
        self.classLength = None # initiate the num of the Y labels
        self.name = name # give the name to this object
       
    def loadSet(self, set):
        """
        Load the train and test set from the set.trainX set.trainY set.testX set.testY

        args:
        -set(object): the set object should contain 4 attributes, trainX, trainY, testX, testY
        """
        # Check the train and test sets are not None
        if set.trainX is None or set.trainY is None:
            raise ValueError("Training data is not loaded. Please load data and split it first using `splitData` method.")
        
        if set.trainX is None or set.trainY is None:
            raise ValueError("Testing data is not loaded. Please load data and split it first using `splitData` method.")

        # Set the train and test sets as attributes in this object
        self.trainX = set.trainX
        self.trainY = set.trainY
        self.testX = set.testX
        self.testY = set.testY
        self.randomSeed = set.randomSeed

    def TrainPrepare(self):
        """
        Convert the string type labels to num type labels
        """
        # Create a encoder object 
        label_encode = LabelEncoder()
        self.labelEncode = label_encode

        # Encode the train and test Y set
        trainY_encoded = label_encode.fit_transform(self.trainY)
        testY_encoded = label_encode.fit_transform(self.testY)

        # Convert the dataframe to the training formate
        dtrain = xgb.DMatrix(self.trainX, label=trainY_encoded, enable_categorical=True)
        dtest = xgb.DMatrix(self.testX, label=testY_encoded, enable_categorical=True)  
        self.dtrain = dtrain
        self.dtest = dtest

        self.classLength = label_encode.classes_

    def trainXGBoostModel(self):
        """
        Train multiple XGBoost models for a multiprob dataset, one per label in trainY.
        """

        # set params
        params = {
            # 'objective': 'multi:softprob',
            'objective': 'multi:softmax',
            'num_class': len(self.classLength),
            'max_depth': 4,
            'eta': 0.3,
            'verbosity': 1,
            'seed': self.randomSeed
        }

        num_rounds = 50

        model = xgb.train(params, self.dtrain, num_rounds)

        self.models = model
        

    def predModel(self):
        """
        Use the model in the test set and generate the prediction for the test set
        """
        predictions = self.models.predict(self.dtest)
        # print(predictions)
        self.modelPred = pd.DataFrame(predictions, columns = self.trainY.columns)
        ID = pd.DataFrame(self.testX.index)
        self.modelPred['person_id'] = ID
        self.modelPred.set_index('person_id', inplace=True)

In [27]:
model = ModelTrain('A model')
model.loadSet(st)
model.TrainPrepare()
model.trainXGBoostModel()
model.predModel()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [53]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

class ModelTrain:
    def __init__(self, name):
        self.trainX = None
        self.trainY = None
        self.testX = None
        self.testY = None
        self.randomSeed = None
        self.models = None
        self.modelPred = None
        self.labelEncode = None
        self.dtrain = None
        self.dtest = None
        self.classLength = None
        self.name = name
        self.test_indices = None  # 保存测试集索引

    def loadSet(self, set):
        if set.trainX is None or set.trainY is None:
            raise ValueError("Training data is not loaded. Please load data and split it first using `splitData` method.")
        
        if set.testX is None or set.testY is None:
            raise ValueError("Testing data is not loaded. Please load data and split it first using `splitData` method.")

        self.trainX = set.trainX
        self.trainY = set.trainY
        self.testX = set.testX
        self.testY = set.testY
        self.randomSeed = set.randomSeed
        self.test_indices = set.testX.index  # 保存测试集索引

    def TrainPrepare(self):
        label_encode = LabelEncoder()
        self.labelEncode = label_encode

        trainY_encoded = label_encode.fit_transform(self.trainY)
        testY_encoded = label_encode.transform(self.testY)

        dtrain = xgb.DMatrix(self.trainX, label=trainY_encoded, enable_categorical=True)
        dtest = xgb.DMatrix(self.testX, label=testY_encoded, enable_categorical=True)
        self.dtrain = dtrain
        self.dtest = dtest

        self.classLength = label_encode.classes_

    def trainXGBoostModel(self):
        params = {
            'objective': 'multi:softmax',
            'num_class': len(self.classLength),
            'max_depth': 4,
            'eta': 0.3,
            'verbosity': 1,
            'seed': self.randomSeed
        }

        num_rounds = 50
        model = xgb.train(params, self.dtrain, num_rounds)
        self.models = model

    def predModel(self):
        predictions = self.models.predict(self.dtest)
        self.modelPred = pd.DataFrame(predictions, columns=['prediction'])
        self.modelPred['person_id'] = self.test_indices  # 使用保存的测试集索引
        self.modelPred.set_index('person_id', inplace=True)
        
    def calculate_accuracy(self):
        if self.modelPred is None:
            raise ValueError("No predictions found. Please run predModel() first.")
        y_true = self.labelEncode.transform(self.testY)
        y_pred = self.modelPred['prediction'].values
        accuracy = accuracy_score(y_true, y_pred)
        return accuracy

class FeatureSelector(ModelTrain):
    def __init__(self, name):
        super().__init__(name)
        self.best_n_features = None
        self.best_score = 0.0

    def fit(self, X, y, cv=5):
        # 对目标变量进行编码
        label_encode = LabelEncoder()
        y_encoded = label_encode.fit_transform(y)

        estimator = LinearRegression()
        max_features_to_select = X.shape[1]
        n_features_to_select_range = range(1, max_features_to_select + 1)

        for n_features_to_select in n_features_to_select_range:
            rfe = RFE(estimator, n_features_to_select=n_features_to_select)
            scores = cross_val_score(rfe, X, y_encoded, cv=cv)
            score = scores.mean()
            if score > self.best_score:
                self.best_score = score
                self.best_n_features = n_features_to_select

    def transform(self, X, y):
        # 对目标变量进行编码
        label_encode = LabelEncoder()
        y_encoded = label_encode.fit_transform(y)

        estimator = LinearRegression()
        rfe = RFE(estimator, n_features_to_select=self.best_n_features)
        X_transformed = rfe.fit_transform(X, y_encoded)
        return X_transformed

    def loadSet(self, set):
        super().loadSet(set)
        self.fit(self.trainX, self.trainY)
        self.trainX = self.transform(self.trainX, self.trainY)
        self.testX = self.transform(self.testX, self.testY)

In [42]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, train_test_split


In [54]:
model = FeatureSelector("XGBoost with Feature Selection")

    # 加载数据集
model.loadSet(st)

    # 准备训练数据
model.TrainPrepare()

    # 训练模型
model.trainXGBoostModel()

    # 进行预测
model.predModel()

accuracy = model.calculate_accuracy()
print(f"Model accuracy: {accuracy}")

    # 显示预测结果
print(model.modelPred)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Model accuracy: 0.9012605042016807
           prediction
person_id            
1762              3.0
2734              3.0
2513              1.0
2507              0.0
1363              3.0
...               ...
836               3.0
460               4.0
1779              0.0
780               1.0
1635              1.0

[476 rows x 1 columns]


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [49]:
model.modelPred

Unnamed: 0_level_0,prediction
person_id,Unnamed: 1_level_1
1762,3.0
2734,3.0
2513,1.0
2507,0.0
1363,3.0
...,...
836,3.0
460,4.0
1779,0.0
780,1.0


In [16]:
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 加载数据
data = load_iris()
X = data.data
y = data.target
print(y)
# 将标签转换为整数
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(y_encoded)
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# 设置 DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 配置参数
params = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'max_depth': 4,
    'eta': 0.3,
    'verbosity': 1
}

# 训练模型
num_round = 100
bst = xgb.train(params, dtrain, num_round)

# 模型评估
preds = bst.predict(dtest)
preds
# accuracy = float(np.sum(preds == y_test)) / len(y_test)
# accuracy * 100  # 返回准确率的百分比


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


array([[3.1512976e-03, 9.8484683e-01, 1.2001900e-02],
       [9.8934221e-01, 9.5407646e-03, 1.1170481e-03],
       [4.6061169e-04, 1.5947862e-03, 9.9794465e-01],
       [2.3259702e-03, 9.9090135e-01, 6.7726560e-03],
       [3.1193048e-03, 9.5425969e-01, 4.2620972e-02],
       [9.9501896e-01, 3.9775842e-03, 1.0034940e-03],
       [4.6410579e-03, 9.9352819e-01, 1.8307610e-03],
       [2.6571595e-03, 1.5134951e-02, 9.8220783e-01],
       [1.2791863e-03, 9.7751755e-01, 2.1203250e-02],
       [1.9170205e-03, 9.9682707e-01, 1.2559054e-03],
       [2.7165050e-03, 1.5802829e-02, 9.8148066e-01],
       [9.9563521e-01, 2.9989877e-03, 1.3658080e-03],
       [9.9354231e-01, 5.4557114e-03, 1.0020048e-03],
       [9.9599540e-01, 3.0000727e-03, 1.0044788e-03],
       [9.9599540e-01, 3.0000727e-03, 1.0044788e-03],
       [8.3223108e-04, 9.9787819e-01, 1.2895756e-03],
       [4.1353976e-04, 9.9661562e-04, 9.9858981e-01],
       [2.1218199e-03, 9.9482286e-01, 3.0552924e-03],
       [4.6261596e-03, 9.903

In [17]:


class modelingTable:
    def __init__(self,name):
        
        self.allX = None # iniitiate the total X dataframe as none
        self.allY = None # initiate the total Y dataframe as none
        self.trainX = None # initiate the train X dataframe as none 
        self.trainY = None # initiate the train Y dataframe as none
        self.testX = None # initiate the test X dataframe as none
        self.testY = None # initiate the test Y dataframe as none
        self.randomSeed = None  # initiate the random seed as none
        self.models = None # initiate the model series as none
        self.modelPred = None # initiate the model prediction output
        self.name = name # initiate the name
        self.newFeatureX = None # initiate the top features
        self.selectTrainX = None # initiate the selected train X
        self.selectTestX = None # initiate the selected test X
        self.selectedModel = None # initiate the selected model
        
    def setRandomSeed(self, seed):
        """
        Set the random seed for data splitting.

        Args:
            seed (int): The random seed value.
        """
        self.randomSeed = seed
        
    def loadData(self, inputX, inputY):
        """
        Get input from other dataframe or other format data

        Args:
            inputX (dataframe or other format data): Data to be stored as allX.
            inputY (dataframe or other format data):  Data to be stored as allY.
        """
        # Check if inputX is a DataFrame or convertible to a DataFrame
        if not isinstance(inputX, pd.DataFrame):
            try:
                self.allX = pd.DataFrame(inputX)
            except Exception as e:
                raise ValueError(f"inputX cannot be converted to a DataFrame: {e}")
        else:
            self.allX = inputX

        # Check if inputY is a DataFrame or convertible to a DataFrame
        if not isinstance(inputY, pd.DataFrame):
            try:
                self.allY = pd.DataFrame(inputY)
            except Exception as e:
                raise ValueError(f"inputY cannot be converted to a DataFrame: {e}")
        else:
            self.allY = inputY
            
    def splitData(self, train_size=0.8):
        """
        Split allX and allY into train set and test set based on the train_size(default 0.8)
        and random seed default is 42

        Args:
            train_size (int or float): If it's float, represent the proportion of the train sample size
                                       If it's int, represent the absolute num of the train sample size
        """
        
        if self.allX is None or self.allY is None:
            raise ValueError("Data is not loaded. Please load data first using `loadData` method.")

        if not isinstance(train_size, (float, int)):
            raise ValueError("train_size must be a float or an int.")

        if isinstance(train_size, float) and not (0 < train_size < 1):
            raise ValueError("If train_size is a float, it must be between 0 and 1.")

        # Use the set random seed if available, otherwise default to 42
        random_state = self.randomSeed if self.randomSeed is not None else 42

        # Split allX and allY into training and test sets while keeping person_id aligned
        self.trainX, self.testX, self.trainY, self.testY = train_test_split(
            self.allX, self.allY, train_size=train_size, random_state=random_state, stratify=self.allY.index if 'person_id' in self.allY else None)

        # Ensure that the indices (person_id) are carried over to the new DataFrames
        self.trainY.index = self.trainX.index
        self.testY.index = self.testX.index
        
    # still working
    

    def trainXGBoostModel(self):
        """
        Train multiple XGBoost models for a multi-label dataset, one per label in trainY.
        """
        if self.trainX is None or self.trainY is None:
            raise ValueError("Training data is not loaded. Please load data and split it first using `splitData` method.")
                
        # Convert multi classification model to multi labels model
        xgb_classifier = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=10, min_child_weight=1,
                        gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                        nthread=4, scale_pos_weight=1, seed=27, reg_lambda=1, reg_alpha=0)

        # Use MultiOutputClassifier package XGBoost classifier
        multioutput_classifier = MultiOutputClassifier(xgb_classifier)

        # train the model
        multioutput_classifier.fit(self.trainX, self.trainY)
            
        self.models = multioutput_classifier
        
    def predModel(self):
        """
        Use the model in the test set and generate the prediction for the test set
        """
        predictions = self.models.predict(self.testX)
        self.modelPred = predictions_df = pd.DataFrame(predictions, columns = self.trainY.columns)
        ID = pd.DataFrame(mdf.testX.index)
        self.modelPred['person_id'] = ID
        self.modelPred
    
    def featureSelection(self):
        """
        Calculate the feature importance of the model and get the top importance features and plot the barchart to visualize it 
        """
        feature_importances = np.zeros(self.trainX.shape[1])
        for estimator in self.models.estimators_:
            feature_importances += estimator.feature_importances_

        feature_importances /= len(self.models.estimators_)

        
        indices = np.argsort(feature_importances)[-20:]
        self.newFeatureX = indices 
        self.selectTrainX = self.trainX.iloc[:, indices] 
        self.selectTestX = self.testX.iloc[:, indices] 
        
        plt.figure(figsize=(20, 6))
        plt.barh(range(20), feature_importances[indices], color='skyblue')
        plt.yticks(range(20), [f'Feature {i}' for i in indices])
        plt.xlabel('Importance')
        plt.ylabel('Feature Index')
        plt.title('Top 20 Important Features')
        plt.tight_layout()
        plt.show()
        
    def accuracyModel(self):
        """
        Calculate the accuracy of the model by the testY set and the prediction value set
        """
        accuracy = accuracy_score(mdf.testY, mdf.modelPred)
        print("The accuracy of the model is: ", accuracy)
        
    def selectedXGboostModel(self):
        """
        Use the new selected feature to build a model and calculate its accuracy
        """
        if self.selectTrainX is None or self.trainY is None:
            raise ValueError("Training data is not loaded. Please load data and split it first using `splitData` method.")
                
        # Convert multi classification model to multi labels model
        xgb_classifier = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=10, min_child_weight=1,
                        gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                        nthread=4, scale_pos_weight=1, seed=27, reg_lambda=1, reg_alpha=0)

        # Use MultiOutputClassifier package XGBoost classifier
        multioutput_classifier = MultiOutputClassifier(xgb_classifier)

        # train the model
        multioutput_classifier.fit(self.selectTrainX, self.trainY)
            
        self.selectedModel = multioutput_classifier
        predictions = self.selectedModel.predict(self.selectTestX)
        selected_pred = predictions_df = pd.DataFrame(predictions, columns = self.trainY.columns)
        
        accuracy = accuracy_score(mdf.testY, selected_pred)
        print("The accuracy of the selected model is: ", accuracy)

In [30]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

class ModelTrain:
    def __init__(self, name):
        self.trainX = None
        self.trainY = None
        self.testX = None
        self.testY = None
        self.randomSeed = None
        self.models = None
        self.modelPred = None
        self.labelEncode = None
        self.dtrain = None
        self.dtest = None
        self.classLength = None
        self.name = name

    def loadSet(self, set):
        if set.trainX is None or set.trainY is None:
            raise ValueError("Training data is not loaded. Please load data and split it first using `splitData` method.")
        
        if set.testX is None or set.testY is None:
            raise ValueError("Testing data is not loaded. Please load data and split it first using `splitData` method.")

        self.trainX = set.trainX
        self.trainY = set.trainY
        self.testX = set.testX
        self.testY = set.testY
        self.randomSeed = set.randomSeed

    def TrainPrepare(self):
        label_encode = LabelEncoder()
        self.labelEncode = label_encode

        trainY_encoded = label_encode.fit_transform(self.trainY)
        testY_encoded = label_encode.transform(self.testY)

        dtrain = xgb.DMatrix(self.trainX, label=trainY_encoded, enable_categorical=True)
        dtest = xgb.DMatrix(self.testX, label=testY_encoded, enable_categorical=True)
        self.dtrain = dtrain
        self.dtest = dtest

        self.classLength = label_encode.classes_

    def trainXGBoostModel(self):
        params = {
            'objective': 'multi:softmax',
            'num_class': len(self.classLength),
            'max_depth': 4,
            'eta': 0.3,
            'verbosity': 1,
            'seed': self.randomSeed
        }

        num_rounds = 50
        model = xgb.train(params, self.dtrain, num_rounds)
        self.models = model

    def predModel(self):
        predictions = self.models.predict(self.dtest)
        self.modelPred = pd.DataFrame(predictions, columns=['prediction'])
        self.modelPred['person_id'] = self.testX.index
        self.modelPred.set_index('person_id', inplace=True)

class FeatureSelector(ModelTrain):
    def __init__(self, name):
        super().__init__(name)
        self.best_n_features = None
        self.best_score = 0.0

    def fit(self, X, y, cv=5):
        estimator = LinearRegression()
        max_features_to_select = X.shape[1]
        n_features_to_select_range = range(1, max_features_to_select + 1)

        for n_features_to_select in n_features_to_select_range:
            rfe = RFE(estimator, n_features_to_select=n_features_to_select)
            scores = cross_val_score(rfe, X, y, cv=cv)
            score = scores.mean()
            if score > self.best_score:
                self.best_score = score
                self.best_n_features = n_features_to_select

    def transform(self, X):
        estimator = LinearRegression()
        rfe = RFE(estimator, n_features_to_select=self.best_n_features)
        X_transformed = rfe.fit_transform(X, self.trainY)
        return X_transformed

    def loadSet(self, set):
        super().loadSet(set)
        self.fit(self.trainX, self.trainY)
        self.trainX = self.transform(self.trainX)
        self.testX = self.transform(self.testX)

# 使用示例
if __name__ == "__main__":
    # 加载波士顿房价数据集
    boston = load_boston()
    X, y = boston.data, boston.target

    # 将数据分为训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 创建数据集对象
    class Dataset:
        def __init__(self, trainX, trainY, testX, testY, randomSeed=42):
            self.trainX = trainX
            self.trainY = trainY
            self.testX = testX
            self.testY = testY
            self.randomSeed = randomSeed

    dataset = Dataset(X_train, y_train, X_test, y_test)

    # 创建 FeatureSelector 实例
    model = FeatureSelector("XGBoost with Feature Selection")

    # 加载数据集
    model.loadSet(dataset)

    # 准备训练数据
    model.TrainPrepare()

    # 训练模型
    model.trainXGBoostModel()

    # 进行预测
    model.predModel()

    # 显示预测结果
    print(model.modelPred.head())


NameError: name 'load_boston' is not defined