In [None]:
def _report_share(mask, val, label):
    """Print the per-column percentage of rows flagged in a boolean frame.

    Args:
        mask: boolean DataFrame with the same columns as the data under
            inspection; True marks a cell that should be counted.
        val: name of the dataset, used only in the printed messages.
        label: description of what is being counted (e.g. ``'NAs'``).
    """
    share = (mask.sum() / len(mask)) * 100
    table = share.to_frame().reset_index()
    flagged = table[table[0] > 0]
    if len(flagged) > 0:
        print('\n' + label + ' present in the columns of ', val, ':')
        report = flagged.sort_values(0, ascending=False).rename(
            columns={'index': 'column', 0: '%'})
        report['%'] = report['%'].round(2)
        print(report)
    else:
        print('\nNo ' + label + ' present in the columns of ', val)


def check_NAs(df, val):
    """Strip string cells and report NA, empty-string and zero shares.

    For each of the three checks the percentage of affected rows is printed
    per column (only columns with a share above zero are listed, sorted in
    descending order and rounded to two decimals).

    Args:
        df: DataFrame to inspect; it is not modified.
        val: name of the dataset, used only in the printed messages.

    Returns:
        A new DataFrame in which every string cell has its leading and
        trailing whitespace removed.
    """
    # DataFrame.applymap is deprecated in newer pandas in favour of
    # DataFrame.map; look the method up on the class so that a column that
    # happens to be called "map" cannot be mistaken for it.
    elementwise = df.map if hasattr(type(df), 'map') else df.applymap
    df = elementwise(lambda x: x.strip() if isinstance(x, str) else x)

    _report_share(df.isnull(), val, 'NAs')
    _report_share(df.eq(''), val, 'empty values')
    _report_share(df.eq(0), val, '0 values')

    return df

def get_data(file):
    """Load a header-less CSV and give its columns their project names.

    Column positions are renamed using the first element of each
    ``col_dict`` entry, after which the ``instance_weight_ignore`` column
    is dropped.

    Args:
        file: path or buffer accepted by ``pd.read_csv``.

    Returns:
        The renamed DataFrame without ``instance_weight_ignore``.
    """
    raw = pd.read_csv(file, header=None)
    names = {key: col_dict[key][0] for key in col_dict}
    named = raw.rename(columns=names)
    return named.drop('instance_weight_ignore', axis=1)

def anomalies(df):
    """Collect implausible records for young children.

    Returns:
        A tuple ``(anomaly_workers, anomaly_married)`` of de-duplicated
        DataFrames:

        * rows with age below 13 that show any sign of work or tax filing
          (positive wage, a tax status other than ``'Nonfiler'``, a positive
          employer size or positive weeks worked);
        * rows with age below 15 whose marital status is not
          ``'Never married'``.
    """
    shows_activity = (
        (df['wage_per_hour'] > 0)
        | (df['tax_filer_status'] != 'Nonfiler')
        | (df['num_persons_worked_for_employer'] > 0)
        | (df['weeks_worked_in_year'] > 0)
    )
    under_13 = df['age'] < 13
    worker_cols = ['age', 'wage_per_hour', 'tax_filer_status',
                   'num_persons_worked_for_employer',
                   'weeks_worked_in_year', 'target']
    anomaly_workers = df[shows_activity & under_13][worker_cols].drop_duplicates()

    ever_married = df['marital_status'] != 'Never married'
    under_15 = df['age'] < 15
    married_cols = ['age', 'marital_status', 'target']
    anomaly_married = df[ever_married & under_15][married_cols].drop_duplicates()

    return anomaly_workers, anomaly_married

def remove_anomalies(orig_df, anom_df):
    """Drop from ``orig_df`` every row whose index label occurs in ``anom_df``.

    Returns:
        A DataFrame with the matching rows removed and a fresh 0..n-1 index.
    """
    is_anomalous = orig_df.index.isin(anom_df.index)
    return orig_df[~is_anomalous].reset_index(drop=True)

def ft_pt_func(df):
    """Collapse ``full_or_part_time_employment_stat`` into ``ft_pt_group``.

    The source column is matched after stripping surrounding whitespace,
    like the other categorical bucket functions in this file, so padded
    values are no longer left in the empty group. Values that match none
    of the lists keep the empty-string group.

    The frame is modified in place: ``ft_pt_group`` is added and
    ``full_or_part_time_employment_stat`` is removed.

    Returns:
        The same DataFrame object.
    """
    status = df['full_or_part_time_employment_stat'].str.strip()

    groups = {
        'full_part': ['Full-time schedules', 'PT for econ reasons usually PT',
                      'PT for non-econ reasons usually FT'],
        'arm_child_partt': ['Children or Armed Forces',
                            'PT for econ reasons usually FT'],
        'unemp': ['Unemployed full-time', 'Unemployed part- time',
                  'Not in labor force'],
    }

    df['ft_pt_group'] = ''
    for label, members in groups.items():
        df.loc[status.isin(members), 'ft_pt_group'] = label

    del df['full_or_part_time_employment_stat']

    return df
    
def mig_msa(df):
    """Collapse ``migration_code_change_in_msa`` into ``mig_msa_group``.

    The source column is matched after stripping surrounding whitespace,
    like the other categorical bucket functions in this file, so padded
    values are no longer left in the empty group. Values that match none
    of the lists keep the empty-string group.

    The frame is modified in place: ``mig_msa_group`` is added and
    ``migration_code_change_in_msa`` is removed.

    Returns:
        The same DataFrame object.
    """
    code = df['migration_code_change_in_msa'].str.strip()

    groups = {
        'non_mover': ['?', 'Nonmover'],
        'to_msa_nonmsa': ['MSA to MSA', 'NonMSA to MSA', 'Not identifiable',
                          'Abroad to MSA', 'MSA to nonMSA'],
        'to_nonmsa': ['NonMSA to nonMSA', 'Abroad to nonMSA',
                      'Not in universe'],
    }

    df['mig_msa_group'] = ''
    for label, members in groups.items():
        df.loc[code.isin(members), 'mig_msa_group'] = label

    del df['migration_code_change_in_msa']

    return df
    
def age_bucket(df):
    """Replace ``age`` with the categorical ``age_bucket`` column.

    Buckets (bounds inclusive): ``'0_24'`` for age <= 24, ``'25_37'``,
    ``'38_52'`` and ``'53+'`` for age >= 53. Rows matching no rule keep the
    empty string. The frame is modified in place and returned.
    """
    ages = df['age']
    rules = (
        (ages <= 24, '0_24'),
        (ages.between(25, 37), '25_37'),
        (ages.between(38, 52), '38_52'),
        (ages >= 53, '53+'),
    )

    df['age_bucket'] = ''
    for selected, label in rules:
        df.loc[selected, 'age_bucket'] = label

    del df['age']

    return df

def weeks_bucket(df):
    """Replace ``weeks_worked_in_year`` with the categorical ``weeks_bucket``.

    Buckets (bounds inclusive): ``'0'`` for values <= 0, ``'1_26'``,
    ``'27_39'``, ``'40_51'`` and ``'52+'`` for values >= 52. Rows matching
    no rule keep the empty string. The frame is modified in place and
    returned.
    """
    weeks = df['weeks_worked_in_year']
    rules = (
        (weeks <= 0, '0'),
        (weeks.between(1, 26), '1_26'),
        (weeks.between(27, 39), '27_39'),
        (weeks.between(40, 51), '40_51'),
        (weeks >= 52, '52+'),
    )

    df['weeks_bucket'] = ''
    for selected, label in rules:
        df.loc[selected, 'weeks_bucket'] = label

    del df['weeks_worked_in_year']

    return df

def wage_bucket(df):
    """Replace ``wage_per_hour`` with the string flag ``wage_bucket``.

    ``'0'`` marks a wage of zero or less, ``'1'`` a positive wage; rows
    matching neither comparison keep the empty string. The frame is
    modified in place and returned.
    """
    wage = df['wage_per_hour']

    df['wage_bucket'] = ''
    for selected, label in ((wage <= 0, '0'), (wage > 0, '1')):
        df.loc[selected, 'wage_bucket'] = label

    del df['wage_per_hour']

    return df


def education_bucket(df):
    """Replace ``education`` with the coarser ``education_group`` column.

    Values are matched after stripping surrounding whitespace; anything
    not listed below keeps the empty-string group. The frame is modified
    in place and returned.
    """
    level = df['education'].str.strip()

    groups = (
        ('profess', ['Prof school degree (MD DDS DVM LLB JD)',
                     'Doctorate degree(PhD EdD)']),
        ('masters', ['Masters degree(MA MS MEng MEd MSW MBA)']),
        ('bachelors', ['Bachelors degree(BA AB BS)']),
        ('assoc_grd', ['Associates degree-academic program',
                       'Associates degree-occup /vocational',
                       'Some college but no degree',
                       'High school graduate']),
        ('<_12th', ['12th grade no diploma', '7th and 8th grade',
                    '11th grade', '10th grade', '9th grade',
                    '1st 2nd 3rd or 4th grade', '5th or 6th grade',
                    'Less than 1st grade', 'Children']),
    )

    df['education_group'] = ''
    for label, members in groups:
        df.loc[level.isin(members), 'education_group'] = label

    del df['education']

    return df

def married_bucket(df):
    """Replace ``marital_status`` with the grouped ``married_or_not`` column.

    Values are matched after stripping surrounding whitespace; anything
    not listed below keeps the empty-string group. The frame is modified
    in place and returned.
    """
    status = df['marital_status'].str.strip()

    groups = (
        ('M_spouse_pr', ['Married-civilian spouse present']),
        ('divorced', ['Divorced']),
        ('M_spouse_ab', ['Married-spouse absent']),
        ('sep_wid', ['Separated', 'Widowed']),
        ('never_M_AF', ['Never married', 'Married-A F spouse present']),
    )

    df['married_or_not'] = ''
    for label, members in groups:
        df.loc[status.isin(members), 'married_or_not'] = label

    del df['marital_status']

    return df

def race_bucket(df):
    """Replace ``race`` with the binary ``white_or_nonwhite`` column.

    After stripping whitespace, ``'White'`` maps to ``'white'`` and every
    other value to ``'non_white'``. The frame is modified in place and
    returned.
    """
    is_white = df['race'].str.strip().isin(['White'])

    df['white_or_nonwhite'] = ''
    for selected, label in ((is_white, 'white'), (~is_white, 'non_white')):
        df.loc[selected, 'white_or_nonwhite'] = label

    del df['race']

    return df

def hispanic_bucket(df):
    """Replace ``hispanic_Origin`` with the grouped ``hispanic_or_other``.

    Values are matched after stripping surrounding whitespace; anything
    not listed below keeps the empty-string group. The frame is modified
    in place and returned.
    """
    origin = df['hispanic_Origin'].str.strip()

    groups = (
        ('other', ['Do not know', 'NA', 'All other']),
        ('hispanic', ['Central or South American', 'Mexican (Mexicano)',
                      'Mexican-American', 'Other Spanish', 'Puerto Rican',
                      'Cuban', 'Chicano']),
    )

    df['hispanic_or_other'] = ''
    for label, members in groups:
        df.loc[origin.isin(members), 'hispanic_or_other'] = label

    del df['hispanic_Origin']
    return df

def birth_country_bucket(df):
    """Derive ``immigrant_parents`` and ``immigrant`` from birth countries.

    * ``immigrant_parents`` is ``'no'`` when the father or the mother was
      born in ``'United-States'`` and ``'yes'`` when neither was.
    * ``immigrant`` is ``'no'`` when the person was born in
      ``'United-States'`` and ``'yes'`` otherwise.

    Comparisons are made on whitespace-stripped values. The three
    ``country_of_birth_*`` columns are removed; the frame is modified in
    place and returned.
    """
    home = 'United-States'
    father = df['country_of_birth_father'].str.strip()
    mother = df['country_of_birth_mother'].str.strip()
    own = df['country_of_birth_self'].str.strip()

    df['immigrant_parents'] = ''
    df.loc[father.eq(home) | mother.eq(home), 'immigrant_parents'] = 'no'
    df.loc[father.ne(home) & mother.ne(home), 'immigrant_parents'] = 'yes'

    df['immigrant'] = ''
    df.loc[own.eq(home), 'immigrant'] = 'no'
    df.loc[own.ne(home), 'immigrant'] = 'yes'

    for source in ('country_of_birth_father', 'country_of_birth_mother',
                   'country_of_birth_self'):
        del df[source]
    return df

def citizen_bucket(df):
    """Replace ``citizenship`` with the yes/no ``us_citizen`` column.

    After stripping whitespace, ``'Foreign born- Not a citizen of U S'``
    maps to ``'no'`` and every other value to ``'yes'``. The frame is
    modified in place and returned.
    """
    non_citizen = 'Foreign born- Not a citizen of U S'
    status = df['citizenship'].str.strip()

    df['us_citizen'] = ''
    for selected, label in ((status.eq(non_citizen), 'no'),
                            (status.ne(non_citizen), 'yes')):
        df.loc[selected, 'us_citizen'] = label

    del df['citizenship']
    return df

def capital_net(df):
    """Replace the three capital columns with their net ``capital_net``.

    ``capital_net`` is gains minus losses plus stock dividends. The three
    source columns are removed; the frame is modified in place and
    returned.
    """
    net = df['capital_gains'] - df['capital_losses'] + df['divdends_from_stocks']
    df['capital_net'] = net

    for source in ('capital_gains', 'capital_losses', 'divdends_from_stocks'):
        del df[source]
    return df

def capital(df):
    """Replace the three capital columns with 0/1 ``*_bucket`` flags.

    For each of ``capital_gains``, ``capital_losses`` and
    ``divdends_from_stocks`` a ``<name>_bucket`` column is added holding
    ``1`` where the amount is positive and ``0`` where it is zero or
    negative; rows matching neither comparison keep the empty string.
    The source columns are removed; the frame is modified in place and
    returned.
    """
    sources = ('capital_gains', 'capital_losses', 'divdends_from_stocks')

    for source in sources:
        bucket = source + '_bucket'
        df[bucket] = ''
        # The column mixes the '' placeholder with integer flags, so pin it
        # to object dtype explicitly rather than relying on how pandas
        # infers the dtype of a string-initialised column.
        df[bucket] = df[bucket].astype(object)
        amount = df[source]
        df.loc[amount > 0, bucket] = 1
        df.loc[amount <= 0, bucket] = 0

    for source in sources:
        del df[source]
    return df

def worker_bucket(df):
    """Replace ``class_of_worker`` with the grouped ``worker_class`` column.

    Values are matched after stripping surrounding whitespace; anything
    not listed below keeps the empty-string group. The frame is modified
    in place and returned.
    """
    worker = df['class_of_worker'].str.strip()

    groups = (
        ('self_incrp', ['Self-employed-incorporated']),
        ('federal', ['Federal government']),
        ('self_gov_priv', ['Self-employed-not incorporated',
                           'State government', 'Local government',
                           'Private']),
        ('na_wo_pay', ['Not in universe', 'Without pay', 'Never worked']),
    )

    df['worker_class'] = ''
    for label, members in groups:
        df.loc[worker.isin(members), 'worker_class'] = label

    del df['class_of_worker']

    return df

def householder(df):
    """Replace ``detailed_household_summary_in_household`` with ``household``.

    Values are matched after stripping surrounding whitespace; anything
    not listed below keeps the empty-string group. The frame is modified
    in place and returned.
    """
    role = df['detailed_household_summary_in_household'].str.strip()

    groups = (
        ('HH', ['Householder']),
        ('spouse_norel', ['Spouse of householder',
                          'Nonrelative of householder']),
        ('other', ['Other relative of householder', 'Child 18 or older',
                   'Group Quarters- Secondary individual',
                   'Child under 18 never married',
                   'Child under 18 ever married']),
    )

    df['household'] = ''
    for label, members in groups:
        df.loc[role.isin(members), 'household'] = label

    del df['detailed_household_summary_in_household']
    return df

def tax(df):
    """Replace ``tax_filer_status`` with the grouped ``tax`` column.

    Values are matched after stripping surrounding whitespace; anything
    not listed below keeps the empty-string group. The frame is modified
    in place and returned.
    """
    filer = df['tax_filer_status'].str.strip()

    groups = (
        ('joint_both_under', ['Joint both under 65']),
        ('rest', ['Joint one under 65 & one 65+', 'Joint both 65+',
                  'Single', 'Head of household']),
        ('non_filer', ['Nonfiler']),
    )

    df['tax'] = ''
    for label, members in groups:
        df.loc[filer.isin(members), 'tax'] = label

    del df['tax_filer_status']
    return df

def occupation(df):
    """Replace ``major_occupation_code`` with the grouped ``prof`` column.

    Values are matched after stripping surrounding whitespace; anything
    not listed below keeps the empty-string group. The frame is modified
    in place and returned.
    """
    job = df['major_occupation_code'].str.strip()

    groups = (
        ('exec_army', ['Executive admin and managerial',
                       'Professional specialty', 'Armed Forces']),
        ('sales_tech_protect', ['Protective services', 'Sales',
                                'Technicians and related support',
                                'Precision production craft & repair']),
        ('primary', ['Transportation and material moving',
                     'Farming forestry and fishing',
                     'Machine operators assmblrs & inspctrs',
                     'Adm support including clerical',
                     'Handlers equip cleaners etc', 'Not in universe',
                     'Other service', 'Private household services']),
    )

    df['prof'] = ''
    for label, members in groups:
        df.loc[job.isin(members), 'prof'] = label

    del df['major_occupation_code']
    return df

def pie_plot(col):
    """Draw three pie charts of ``col``: overall, target 0 and target 1.

    Reads the module-level ``df_train_a`` and ``target_col``. The counts
    are pivoted on ``target_col`` (the same column used for grouping)
    instead of a hard-coded name, and category/class combinations that do
    not occur are counted as 0 rather than left as NaN.

    Args:
        col: name of the categorical column to break down.
    """
    counts = df_train_a.groupby([col, target_col], as_index=False).size()

    # Total number of rows per category, regardless of target.
    total_size = counts.groupby(col)['size'].sum()

    # One column per target value; missing combinations become 0 so every
    # pie has one wedge per category, in the same order as the legend.
    pivot_table = counts.pivot(index=col, columns=target_col, values='size').fillna(0)

    palette = sns.color_palette('Set3')
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    panels = (
        (total_size, 'Total Distribution'),
        (pivot_table[0], 'Distribution of < $50k income'),
        (pivot_table[1], 'Distribution of > $50k income'),
    )
    for ax, (values, title) in zip(axes, panels):
        ax.pie(values, labels=None, autopct='%1.1f%%', startangle=90,
               colors=palette, pctdistance=0.85)
        ax.set_title(title)

    # plt.legend targets the current axes, i.e. the last pie created above.
    plt.legend(title=col, loc='upper right', labels=total_size.index)

    plt.tight_layout()
    plt.show()

def pivot_grp(df, col):
    """Compare how ``col`` is distributed within target 0 and target 1.

    Reads the module-level ``target_col``. The counts are pivoted on
    ``target_col`` (the same column used for grouping) instead of a
    hard-coded name, and a category that never occurs in one class
    contributes 0% to that class rather than NaN.

    Args:
        df: DataFrame containing ``col`` and the target column.
        col: name of the categorical column to break down.

    Returns:
        DataFrame with columns ``[col, '0%', '1%', '%difference']`` sorted
        by ``%difference`` in descending order, where each percentage is
        the category's share of the respective class (rounded to 2
        decimals) and ``%difference`` is ``1%`` minus ``0%``.
    """
    counts = (
        df.groupby([col, target_col])
        .size()
        .reset_index()
        .rename(columns={0: 'size'})
    )

    # One column per target value; absent combinations count as 0 rows.
    piv = (
        counts.pivot(index=col, columns=target_col, values='size')
        .fillna(0)
        .reset_index()
    )
    piv['0%'] = (piv[0] * 100 / piv[0].sum()).round(2)
    piv['1%'] = (piv[1] * 100 / piv[1].sum()).round(2)
    piv['%difference'] = piv['1%'] - piv['0%']
    return piv[[col, '0%', '1%', '%difference']].sort_values(by=['%difference'], ascending=False)

def hyper_lr(X,y):
    """Run a randomised hyper-parameter search for LogisticRegression.

    Every candidate is a single (solver, penalty, C) combination taken
    from the solver/penalty table below and 20 log-spaced C values. The
    search is scored with ``f1_macro`` under 5-fold stratified CV repeated
    3 times. The best score, the best parameters and the elapsed time are
    printed.

    Args:
        X: training features passed to ``fit``.
        y: training labels passed to ``fit``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    strengths = np.logspace(-4, 4, 20)

    # Penalties each solver is paired with.
    # NOTE(review): recent scikit-learn releases appear to expect
    # penalty=None rather than the string 'none', and 'elasticnet' normally
    # needs an l1_ratio; with error_score=0 such candidates would silently
    # score 0 - confirm against the installed version.
    penalties_by_solver = {
        'lbfgs': ['l2', 'none'],
        'newton-cg': ['l2', 'none'],
        'liblinear': ['l1', 'l2'],
        'sag': ['l2', 'none'],
        'saga': ['l1', 'l2', 'elasticnet', 'none'],
    }

    # One single-point distribution per compatible combination.
    candidates = [
        {'solver': [solver], 'penalty': [pen], 'C': [strength]}
        for solver, allowed in penalties_by_solver.items()
        for pen in allowed
        for strength in strengths
    ]

    started = time.time()

    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    search = RandomizedSearchCV(
        estimator=LogisticRegression(),
        param_distributions=candidates,
        n_jobs=-1,
        cv=folds,
        scoring='f1_macro',
        error_score=0,
    )
    grid_result = search.fit(X, y)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    finished = time.time()
    print(finished-started,'seconds')

    return grid_result
    
def lr_report(grid_result,X,y):
    """Refit LogisticRegression with the best parameters and report on test data.

    The model is trained on ``(X, y)`` and evaluated against the
    module-level ``X_test_scaled`` / ``y_test``; the classification report
    is printed.

    Args:
        grid_result: fitted search object exposing ``best_params_``.
        X: training features.
        y: training labels.

    Returns:
        The fitted LogisticRegression model.
    """
    model = LogisticRegression(**grid_result.best_params_)
    model = model.fit(X, y)
    predictions = model.predict(X_test_scaled)
    print(classification_report(y_test, predictions))
    return model

def save_model(model,path):
    """Persist a trained model to ``path`` using joblib."""
    joblib.dump(value=model, filename=path)

def read_model(path):
    """Load and return a model previously stored with joblib at ``path``.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    return joblib.load(path)

def hyper_rf(X,y,n_splits,n_repeats,class_weight):
    """Run a randomised hyper-parameter search for a random forest.

    The search is scored with ``f1`` under repeated stratified K-fold CV.
    The best score, the best parameters and the elapsed time are printed.

    Args:
        X: training features passed to ``fit``.
        y: training labels passed to ``fit``.
        n_splits: number of CV folds.
        n_repeats: number of CV repetitions.
        class_weight: forwarded to ``RandomForestClassifier``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    started = time.time()

    # NOTE(review): the n_estimators list ends with 20 - possibly meant to
    # be 200; confirm with the author.
    search_space = {
        'bootstrap': [True],
        'max_depth': [5,20,30],
        'max_features': ['log2', 'sqrt'],
        'min_samples_leaf': [3, 5],
        'min_samples_split': [8, 10,20],
        'n_estimators': [50,100,20]
    }

    forest = RandomForestClassifier(class_weight=class_weight)
    folds = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1)
    search = RandomizedSearchCV(
        estimator=forest,
        param_distributions=search_space,
        n_jobs=-1,
        cv=folds,
        scoring='f1',
        error_score=0,
    )

    random_result = search.fit(X, y)
    print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
    finished = time.time()
    print(finished-started,'seconds')

    return random_result

def rf_report(random_result,X,y):
    """Refit a random forest with the best parameters and report on test data.

    The final model starts from the settings of the estimator that was
    tuned (when the search object exposes one and it is a
    ``RandomForestClassifier``), so options fixed outside the search
    space, such as ``class_weight``, are kept; the tuned ``best_params_``
    are applied on top. The model is trained on ``(X, y)`` and evaluated
    against the module-level ``X_test`` / ``y_test``; the classification
    report is printed.

    Args:
        random_result: fitted search object exposing ``best_params_``.
        X: training features.
        y: training labels.

    Returns:
        The fitted RandomForestClassifier.
    """
    settings = {}

    # best_params_ only holds the searched parameters; carry over whatever
    # else the tuned estimator was configured with.
    base = getattr(random_result, 'estimator', None)
    if isinstance(base, RandomForestClassifier):
        settings.update(base.get_params())
    settings.update(random_result.best_params_)

    rf = RandomForestClassifier(**settings)
    rf.fit(X, y)

    y_pred4 = rf.predict(X_test)
    print(classification_report(y_test, y_pred4))

    return rf

def calculate_shap_values(model, X):
    """Return the SHAP values of ``model`` for the samples in ``X``.

    Builds a ``shap.Explainer`` for the model and calls its
    ``shap_values`` method on ``X``.
    """
    return shap.Explainer(model).shap_values(X)

def plot_shap_values(shap_values, feature_names):
    """Show a SHAP summary bar plot for the given values.

    Args:
        shap_values: SHAP values as returned by an explainer.
        feature_names: names used to label the features.
    """
    plt.figure()
    # summary_plot resizes the current figure itself, so the size has to be
    # passed through plot_size rather than plt.figure(figsize=...).
    # show=False defers rendering to the explicit plt.show() below.
    shap.summary_plot(
        shap_values,
        feature_names=feature_names,
        plot_type='bar',
        plot_size=(6, 3),
        show=False,
    )
    plt.show()

def hyper_lgb(X,y):
    """Run a randomised hyper-parameter search for a LightGBM classifier.

    The search is scored with ``f1`` under 5-fold stratified CV repeated
    3 times. The best parameters, the best score and the elapsed time are
    printed.

    Args:
        X: training features passed to ``fit``.
        y: training labels passed to ``fit``.

    Returns:
        The ``best_params_`` dict of the search.
    """
    before = time.time()

    lgbm_classifier = lgb.LGBMClassifier(objective='binary', random_state=42)

    # Candidate values sampled by the randomised search.
    param_grid = {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20],
        'min_child_samples' : [5, 10, 20]
    }

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    random_search_lgbm = RandomizedSearchCV(
        estimator=lgbm_classifier,
        param_distributions=param_grid,
        n_jobs=-1,
        cv=cv,
        scoring='f1',
        error_score=0,
    )

    random_search_lgbm.fit(X, y)

    best_params_lgbm = random_search_lgbm.best_params_
    best_f1_score_lgbm = random_search_lgbm.best_score_

    print("Best Parameters:", best_params_lgbm)
    print("Best F1 Score:", best_f1_score_lgbm)

    after = time.time()

    print('total time:',after-before)
    return best_params_lgbm

def lgb_report(X,y,best_params_lgbm):
    """Fit a LightGBM classifier with the given parameters and report on test data.

    The model is trained on ``(X, y)`` and evaluated against the
    module-level ``X_test`` / ``y_test``; the classification report is
    printed.

    NOTE(review): only ``best_params_lgbm`` is passed to the classifier, so
    the ``objective`` / ``random_state`` used by ``hyper_lgb`` are not
    applied here - confirm that is intended.

    Args:
        X: training features.
        y: training labels.
        best_params_lgbm: keyword arguments for ``lgb.LGBMClassifier``.

    Returns:
        The fitted classifier.
    """
    model = lgb.LGBMClassifier(**best_params_lgbm)
    model.fit(X, y)

    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))
    return model


def hyper_xgb(X,y):
    """Run an exhaustive grid search for an XGBoost classifier.

    The grid is scored with ``precision`` under 5-fold stratified CV
    repeated 3 times. The best score and parameters are printed.

    NOTE(review): the other search helpers in this file score with an F1
    variant; confirm that ``precision`` is intended here.

    Args:
        X: training features passed to ``fit``.
        y: training labels passed to ``fit``.

    Returns:
        The ``best_params_`` dict of the search.
    """
    search_space = {
        'n_estimators':[50,100,200],
        'min_child_weight':[1, 3, 7],
        'subsample':[0.6, 0.8, 1.0],
        'max_depth': [4, 8, 10, 20]
    }

    booster = xgb.XGBClassifier()
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    search = GridSearchCV(
        estimator=booster,
        param_grid=search_space,
        n_jobs=-1,
        cv=folds,
        scoring='precision',
        error_score=0,
    )

    outcome = search.fit(X, y)
    print("Best: %f using %s" % (outcome.best_score_, outcome.best_params_))

    return outcome.best_params_

def xgb_report(X,y,xgb_best_params):
    """Fit an XGBoost classifier with the given parameters and report on test data.

    The model is trained on ``(X, y)`` and evaluated against the
    module-level ``X_test`` / ``y_test``; the classification report is
    printed.

    Args:
        X: training features.
        y: training labels.
        xgb_best_params: keyword arguments for ``xgb.XGBClassifier``.

    Returns:
        The fitted classifier.
    """
    model = xgb.XGBClassifier(**xgb_best_params)
    model.fit(X, y)

    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))

    return model

def majority_vote(y_pred_lr, y_pred_rf, y_pred_lgbm,y_pred_xgm,y_auto_preds):
    """Combine five binary prediction sequences by majority vote.

    Args:
        y_pred_lr, y_pred_rf, y_pred_lgbm, y_pred_xgm, y_auto_preds:
            equally long iterables of 0/1 predictions, one per model.

    Returns:
        A list with ``1`` at every position where at least three of the
        five models predicted ``1`` and ``0`` elsewhere.

    Raises:
        ValueError: if the inputs do not all have the same length (zip
            would otherwise silently drop the surplus predictions).
    """
    ballots = [list(preds) for preds in
               (y_pred_lr, y_pred_rf, y_pred_lgbm, y_pred_xgm, y_auto_preds)]

    if len({len(preds) for preds in ballots}) != 1:
        raise ValueError("All input arrays must have the same length")

    return [1 if sum(votes) >= 3 else 0 for votes in zip(*ballots)]