In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"C:\Users\RameshMisale\Downloads\profile_data")

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65968 entries, 0 to 65967
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   flash_point_flag                65968 non-null  float64
 1   water_reactivity_flag           65968 non-null  int64  
 2   monolith_flag                   65968 non-null  float64
 3   sulfides_reactivity_flag        65968 non-null  int64  
 4   cyanides_reactivity_flag        65968 non-null  int64  
 5   intercompany_flag               65968 non-null  float64
 6   specialpricing_flag             65968 non-null  float64
 7   benzene_waste_flag              65968 non-null  float64
 8   sulfides_flag                   65968 non-null  float64
 9   solid_flag                      65968 non-null  float64
 10  hybrid_flag                     65968 non-null  float64
 11  shock_reactivity_flag           65968 non-null  int64  
 12  nrc_regulated_radioactive_flag  

In [5]:
df.shape

(65968, 34)

In [6]:
df.columns

Index(['flash_point_flag', 'water_reactivity_flag', 'monolith_flag',
       'sulfides_reactivity_flag', 'cyanides_reactivity_flag',
       'intercompany_flag', 'specialpricing_flag', 'benzene_waste_flag',
       'sulfides_flag', 'solid_flag', 'hybrid_flag', 'shock_reactivity_flag',
       'nrc_regulated_radioactive_flag', 'lab_pack_flag', 'gas_flag',
       'directship_flag', 'naics_flag', 'infectious_bio_waste_flag',
       'mgp_flag', 'aerosol_flag', 'pyrophoric_reactivity_flag',
       'sludges_flag', 'pcbs_flag', 'halogens_flag', 'cyanides_flag',
       'pesticides_flag', 'dot_explosive_flag', 'boiling_point_flag',
       'isRecertified', 'labpack_flag', 'national_flag', 'urgent_flag',
       'rush_flag', 'Resubmit_binary'],
      dtype='object')

In [7]:
# Summary Statistics
df.describe()

Unnamed: 0,flash_point_flag,water_reactivity_flag,monolith_flag,sulfides_reactivity_flag,cyanides_reactivity_flag,intercompany_flag,specialpricing_flag,benzene_waste_flag,sulfides_flag,solid_flag,hybrid_flag,shock_reactivity_flag,nrc_regulated_radioactive_flag,lab_pack_flag,gas_flag,directship_flag,naics_flag,infectious_bio_waste_flag,mgp_flag,aerosol_flag,pyrophoric_reactivity_flag,sludges_flag,pcbs_flag,halogens_flag,cyanides_flag,pesticides_flag,dot_explosive_flag,boiling_point_flag,isRecertified,labpack_flag,national_flag,urgent_flag,rush_flag,Resubmit_binary
count,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0,65968.0
mean,2.494376,0.018539,0.000485,0.00047,0.002941,0.006928,0.009247,0.00908,0.000197,0.04246,7.6e-05,9.1e-05,4.5e-05,0.004548,0.001955,0.017812,0.000773,3e-05,0.070504,0.003274,0.000258,0.007686,0.006685,0.539762,0.000333,0.000333,0.001531,0.08448,0.086527,0.006609,0.03009,0.184968,0.078159,0.022359
std,2.469564,0.134892,0.022019,0.021673,0.05415,0.082944,0.095716,0.094857,0.014037,0.201638,0.008706,0.009537,0.006744,0.067283,0.044178,0.132267,0.027794,0.005506,0.255996,0.057128,0.016051,0.08733,0.081489,0.49842,0.018259,0.018259,0.039099,0.398433,0.281143,0.081029,0.170837,0.388275,0.268424,0.14785
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0


---

In [8]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer(norm='l2')
print(normalizer.fit_transform(df))

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.81649658 0.         0.         ... 0.40824829 0.         0.        ]
 [0.         0.         0.         ... 0.57735027 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.70710678 0.         0.         ... 0.70710678 0.         0.        ]
 [0.9701425  0.         0.         ... 0.24253563 0.         0.        ]]


In [9]:
df['Resubmit_binary'].value_counts()

0    64493
1     1475
Name: Resubmit_binary, dtype: int64

In [10]:
# separating the data for analysis
Not_Resubmit = df[df['Resubmit_binary'] == 0]
Resubmit = df[df['Resubmit_binary'] == 1]

In [11]:
# statistical measures of the legit data
Resubmit.describe()

Unnamed: 0,flash_point_flag,water_reactivity_flag,monolith_flag,sulfides_reactivity_flag,cyanides_reactivity_flag,intercompany_flag,specialpricing_flag,benzene_waste_flag,sulfides_flag,solid_flag,hybrid_flag,shock_reactivity_flag,nrc_regulated_radioactive_flag,lab_pack_flag,gas_flag,directship_flag,naics_flag,infectious_bio_waste_flag,mgp_flag,aerosol_flag,pyrophoric_reactivity_flag,sludges_flag,pcbs_flag,halogens_flag,cyanides_flag,pesticides_flag,dot_explosive_flag,boiling_point_flag,isRecertified,labpack_flag,national_flag,urgent_flag,rush_flag,Resubmit_binary
count,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0,1475.0
mean,2.778305,0.008814,0.0,0.001356,0.004068,0.00678,0.015593,0.004068,0.0,0.002712,0.0,0.000678,0.0,0.000678,0.0,0.005424,0.002034,0.0,0.018305,0.0,0.0,0.000678,0.002712,0.399322,0.0,0.002034,0.004746,0.102373,0.388475,0.00339,0.0,0.215593,0.113898,1.0
std,2.432886,0.093498,0.0,0.03681,0.063671,0.082087,0.123937,0.063671,0.0,0.052023,0.0,0.026038,0.0,0.026038,0.0,0.073471,0.045068,0.0,0.134098,0.0,0.0,0.026038,0.052023,0.489925,0.0,0.045068,0.068749,0.42921,0.487569,0.058143,0.0,0.411373,0.317795,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
max,6.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0


In [12]:
# compare the values for both transactions
df.groupby('Resubmit_binary').mean()

Unnamed: 0_level_0,flash_point_flag,water_reactivity_flag,monolith_flag,sulfides_reactivity_flag,cyanides_reactivity_flag,intercompany_flag,specialpricing_flag,benzene_waste_flag,sulfides_flag,solid_flag,hybrid_flag,shock_reactivity_flag,nrc_regulated_radioactive_flag,lab_pack_flag,gas_flag,directship_flag,naics_flag,infectious_bio_waste_flag,mgp_flag,aerosol_flag,pyrophoric_reactivity_flag,sludges_flag,pcbs_flag,halogens_flag,cyanides_flag,pesticides_flag,dot_explosive_flag,boiling_point_flag,isRecertified,labpack_flag,national_flag,urgent_flag,rush_flag
Resubmit_binary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
0,2.487882,0.018762,0.000496,0.00045,0.002915,0.006931,0.009102,0.009195,0.000202,0.043369,7.8e-05,7.8e-05,4.7e-05,0.004636,0.002,0.018095,0.000744,3.1e-05,0.071698,0.003349,0.000264,0.007846,0.006776,0.542974,0.000341,0.000295,0.001458,0.084071,0.079621,0.006683,0.030779,0.184268,0.077342
1,2.778305,0.008814,0.0,0.001356,0.004068,0.00678,0.015593,0.004068,0.0,0.002712,0.0,0.000678,0.0,0.000678,0.0,0.005424,0.002034,0.0,0.018305,0.0,0.0,0.000678,0.002712,0.399322,0.0,0.002034,0.004746,0.102373,0.388475,0.00339,0.0,0.215593,0.113898


In [13]:
Resubmit_sample = Not_Resubmit.sample(n=1475)
# Concatenating two DataFrames
new_df = pd.concat([Resubmit_sample,Resubmit], axis=0)
new_df

Unnamed: 0,flash_point_flag,water_reactivity_flag,monolith_flag,sulfides_reactivity_flag,cyanides_reactivity_flag,intercompany_flag,specialpricing_flag,benzene_waste_flag,sulfides_flag,solid_flag,hybrid_flag,shock_reactivity_flag,nrc_regulated_radioactive_flag,lab_pack_flag,gas_flag,directship_flag,naics_flag,infectious_bio_waste_flag,mgp_flag,aerosol_flag,pyrophoric_reactivity_flag,sludges_flag,pcbs_flag,halogens_flag,cyanides_flag,pesticides_flag,dot_explosive_flag,boiling_point_flag,isRecertified,labpack_flag,national_flag,urgent_flag,rush_flag,Resubmit_binary
29738,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0
14002,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0
58688,2.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,2.0,0,0.0,0.0,1.0,0.0,0
4474,5.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,1.0,0
4230,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65701,6.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,1
65757,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,1
65784,1.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,1,0.0,0.0,1.0,0.0,1
65862,1.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,1


In [14]:
new_df['Resubmit_binary'].value_counts()

0    1475
1    1475
Name: Resubmit_binary, dtype: int64

In [15]:
df.isnull().sum().sort_values(ascending = False)

flash_point_flag                  0
pesticides_flag                   0
aerosol_flag                      0
pyrophoric_reactivity_flag        0
sludges_flag                      0
pcbs_flag                         0
halogens_flag                     0
cyanides_flag                     0
dot_explosive_flag                0
water_reactivity_flag             0
boiling_point_flag                0
isRecertified                     0
labpack_flag                      0
national_flag                     0
urgent_flag                       0
rush_flag                         0
mgp_flag                          0
infectious_bio_waste_flag         0
naics_flag                        0
directship_flag                   0
gas_flag                          0
lab_pack_flag                     0
nrc_regulated_radioactive_flag    0
shock_reactivity_flag             0
hybrid_flag                       0
solid_flag                        0
sulfides_flag                     0
benzene_waste_flag          

In [16]:
new_df.shape

(2950, 34)

In [17]:
new_df['Resubmit_binary'].value_counts()

0    1475
1    1475
Name: Resubmit_binary, dtype: int64

In [18]:
X = new_df.drop(columns='Resubmit_binary', axis=1)
y = new_df['Resubmit_binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
print(X.shape, X_train.shape, X_test.shape)

(2950, 33) (2360, 33) (590, 33)


In [20]:
y_train.value_counts()

1    1201
0    1159
Name: Resubmit_binary, dtype: int64

In [21]:
y_test.value_counts()  

0    316
1    274
Name: Resubmit_binary, dtype: int64

In [22]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
decision_tree.fit(X_train, y_train)
y_pred_dt_train = decision_tree.predict(X_train)
y_pred_dt_test = decision_tree.predict(X_test)
accuracy_dt_train = accuracy_score(y_train, y_pred_dt_train)
accuracy_dt_test = accuracy_score(y_test, y_pred_dt_test)
print(f'Decision Tree Training Accuracy: {accuracy_dt_train}')
print(f'Decision Tree Testing Accuracy: {accuracy_dt_test}')
# # classification report
# print(classification_report(y_test, y_pred_dt_test))
# print(confusion_matrix(y_test, y_pred_dt_test))

precision_dt_test = precision_score(y_test, y_pred_dt_test)
recall_dt_test = recall_score(y_test, y_pred_dt_test)
f1_dt_test = f1_score(y_test, y_pred_dt_test)

# Display precision, recall, and f1-score
print(f'\nPrecision: {precision_dt_test:.4f}')
print(f'Recall: {recall_dt_test:.4f}')
print(f'F1-Score: {f1_dt_test:.4f}')

Decision Tree Training Accuracy: 0.7199152542372881
Decision Tree Testing Accuracy: 0.7084745762711865

Precision: 0.6281
Recall: 0.9124
F1-Score: 0.7440


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
random_forest = RandomForestClassifier(n_estimators=300, random_state=42,class_weight='balanced')
random_forest.fit(X_train, y_train)

y_pred_rf_train = random_forest.predict(X_train)
y_pred_rf_test = random_forest.predict(X_test)

accuracy_rf_train = accuracy_score(y_train, y_pred_rf_train)
accuracy_rf_test = accuracy_score(y_test, y_pred_rf_test)

print(f'Random Forest Training Accuracy: {accuracy_rf_train}')
print(f'Random Forest Testing Accuracy: {accuracy_rf_test}')
precision_rf_test = precision_score(y_test, y_pred_rf_test)
recall_rf_test = recall_score(y_test, y_pred_rf_test)
f1_rf_test = f1_score(y_test, y_pred_rf_test)

print(f'\nPrecision: {precision_rf_test:.4f}')
print(f'Recall: {recall_rf_test:.4f}')
print(f'F1-Score: {f1_rf_test:.4f}')

Random Forest Training Accuracy: 0.760593220338983
Random Forest Testing Accuracy: 0.7152542372881356

Precision: 0.7103
Recall: 0.6533
F1-Score: 0.6806


In [24]:
import joblib
model = decision_tree
joblib.dump(model, 'decision_tree1.pkl', protocol=4)  # Use a specific protocol version

['decision_tree1.pkl']

In [25]:
import joblib
import pickle
model_case = joblib.load('decision_tree1.pkl')

In [26]:
--

SyntaxError: invalid syntax (3659366440.py, line 1)

In [None]:
model = RandomForestClassifier(random_state=42)

In [None]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('model', model)
    ])
    
# Perform cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    
# Calculate mean accuracy
mean_accuracy = scores.mean()
    
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)
    
# Make predictions on the test data
y_pred = pipeline.predict(X_test)
    
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
    
print("Model:", RandomForestClassifier())
print("Cross-validation Accuracy:", mean_accuracy)
print("Test Accuracy:", accuracy)
print('Recall Score: ', recall_score(y_test, y_pred))
print('Precision Score: ', precision_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred))

best_model = pipeline

In [None]:
LABELS = ['Resubmit', 'Not_Resubmit'] 
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred) 
plt.figure(figsize =(6,6)) 
sns.heatmap(conf_matrix, xticklabels = LABELS, yticklabels = LABELS, annot = True, fmt ="d"); 
plt.title("Confusion matrix") 
plt.ylabel('True class') 
plt.xlabel('Predicted class') 
plt.show()

In [None]:
--

# ***Feature Engineering:***

In [None]:
df_train = new_df.copy()
df_train.head()

In [None]:
#The function display_feature_importance takes a machine learning model (model) as input and performs feature importance analysis.
def display_feature_importance(model,percentage ,top_n=34, plot=False):
    # X and y 
    X = df_train.drop('Resubmit_binary',axis=1)
    y = df_train['Resubmit_binary']
    model.fit(X, y)
    
    # Get feature importance
    feature_importance = model.feature_importances_
    feature_names = X.columns
    
    # Create a DataFrame for better visualization
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    
    # Sort features by importance
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    
    # A threshold is calculated based on a specified percentage of the top feature importance.
    #  Features with importance above this threshold are selected.
    threshold = percentage / 100 * feature_importance_df.iloc[0]['Importance']
    
    # Select features that meet the threshold
    selected_features = feature_importance_df[feature_importance_df['Importance'] >= threshold]['Feature'].tolist()
    
    #Print Selected Feature 
    print("Selected Features by {} \n \n at threshold {}%; {}".format(model , percentage,selected_features))
    
    if plot==True:
        # Set seaborn color palette to "viridis"
        sns.set(style="whitegrid", palette="viridis")
    
        # Display or plot the top features
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(top_n))
        plt.title('Feature Importance for {}'.format(type(model).__name__))
        plt.show()
        
    # Add 'Exited' to the list of selected features
    selected_features.append('Resubmit_binary')
        
    return selected_features

In [None]:
# List to store selected features for each model and trial percentage
selected_features_xgb = []

# Initilize AUC List 
auc_scores = []

# List of trial percentages
trial_percentages = [3, 5, 10, 20, 40]

# Loop over each trial percentage
for percentage in trial_percentages:
        # Get selected features for each model
        xgb_selected_features = display_feature_importance(XGBClassifier(random_state=42), percentage=percentage)

        # Append selected features to the respective lists
        selected_features_xgb.append(xgb_selected_features)
 
        X = df_train.drop('Resubmit_binary',axis=1)
        y = df_train['Resubmit_binary']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Fit models on training data
        xgb_model = XGBClassifier()
        xgb_model.fit(X_train[[feature for feature in xgb_selected_features if feature != 'Resubmit_binary']], y_train, verbose=0)

        # Predict probabilities on the test set
        xgb_pred_proba = xgb_model.predict_proba(X_test[[feature for feature in xgb_selected_features if feature != 'Resubmit_binary']])[:, 1]

        # Calculate AUC scores and append to the list
        from sklearn.metrics import roc_auc_score

        auc_xgb = roc_auc_score(y_test, xgb_pred_proba)
        auc_scores.append((auc_xgb,percentage))

        # Sorted AUC 
        sorted_auc = sorted(auc_scores, reverse=True)

# Print Each AUC with Percentage 
for score , percentage in sorted_auc :
        print(f'The AUC for {type(xgb_model).__name__ , } \n with {percentage}% of top features is {score:.4f}')

In [None]:
--

In [None]:
imp_fea = ['solid_flag', 'isRecertified', 'national_flag', 'dot_explosive_flag', 'halogens_flag', 'mgp_flag', 'boiling_point_flag', 'specialpricing_flag', 'flash_point_flag', 'urgent_flag','Resubmit_binary']
df_train = df_train[imp_fea]
df_train.head()

In [None]:
df_train.shape

# *Important Feature of  Dataset Train RF and XGB with Hyperparameter Tuning:*

- **RF**

In [None]:
def train_random_forest(data, target):
    # Dictionary to store LabelEncoders for each categorical column
    label_encoders = {}

    # split the data into X and y
    X = data.drop(target, axis=1)
    y = data[target]

    # split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    # # Scaling Data
    # scaler = MinMaxScaler()
    # X_train = scaler.fit_transform(X_train)
    # X_test = scaler.transform(X_test)

    # Define the Random Forest model
    rf_model = RandomForestClassifier(random_state=0,class_weight='balanced')

    # Define hyperparameters for tuning
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best model and parameters
    best_rf_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Print the best hyperparameters
    print('Best Hyperparameters:')
    print(best_params)

    # Train the model on the full training set
    best_rf_model.fit(X_train, y_train)

    # Evaluate the model on the test set
    y_pred_rf = best_rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_rf)
    precision = precision_score(y_test, y_pred_rf)
    recall = recall_score(y_test, y_pred_rf)

    print(f'Accuracy on Test Set: {accuracy:.2f}')
    print(f'Precision on Test Set: {precision:.2f}')
    print(f'Recall on Test Set: {recall:.2f}')
    
    #visulalizing the confusion matrix
    LABELS = ['Normal', 'Fraud'] 
    from sklearn.metrics import confusion_matrix
    conf_matrix = confusion_matrix(y_test, y_pred_rf) 
    plt.figure(figsize =(12, 12)) 
    sns.heatmap(conf_matrix, xticklabels = LABELS, yticklabels = LABELS, annot = True, fmt ="d"); 
    plt.title("Confusion matrix") 
    plt.ylabel('True class') 
    plt.xlabel('Predicted class') 
    plt.show()

    return best_rf_model, best_params, accuracy


In [None]:
train_random_forest(df_train,'Resubmit_binary')

- **XGB**

In [None]:
def train_xgb_classifier(data, target):
    # split the data into X and y
    X = data.drop(target, axis=1)
    y = data[target]

    # split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    
    # # Scaling Data 
    # scaler = MinMaxScaler()
    # X_train = scaler.fit_transform(X_train)
    # X_test = scaler.transform(X_test)

    # Define the XGBClassifier model
    xgb_model = XGBClassifier(random_state=0)

    # Define hyperparameters for tuning
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 1, 2]
    }

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best model and parameters
    best_xgb_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Print the best hyperparameters
    print('Best Hyperparameters:')
    print(best_params)

    # Train the model on the full training set
    best_xgb_model.fit(X_train, y_train)

    # Evaluate the model on the test set
    y_pred_xgb = best_xgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_xgb)
    precision = precision_score(y_test, y_pred_xgb)
    recall = recall_score(y_test, y_pred_xgb)

    print(f'Accuracy on Test Set: {accuracy:.2f}')
    print(f'Precision on Test Set: {precision:.2f}')
    print(f'Recall on Test Set: {recall:.2f}')
    
    #visulalizing the confusion matrix
    LABELS = ['Normal', 'Fraud'] 
    from sklearn.metrics import confusion_matrix
    conf_matrix = confusion_matrix(y_test, y_pred_xgb) 
    plt.figure(figsize =(12, 12)) 
    sns.heatmap(conf_matrix, xticklabels = LABELS, yticklabels = LABELS, annot = True, fmt ="d"); 
    plt.title("Confusion matrix") 
    plt.ylabel('True class') 
    plt.xlabel('Predicted class') 
    plt.show()


    return best_xgb_model, best_params

In [None]:
train_xgb_classifier(df_train,'Resubmit_binary')

In [None]:
# Selcting Best and Highest Accuracy from Above trained Models 
# XGb Model Classifier
# Random Forest
models = ['XGB Classifier', 'RandomForestClassifier']
accuracy_scores = [accuracy, accuracy]

# Find the index of the maximum accuracy
best_accuracy_index = accuracy_scores.index(max(accuracy_scores))

# Print the best model for accuracy
print(f'Best Accuracy: {accuracy_scores[best_accuracy_index]:.2f} with Model: {models[best_accuracy_index]}')

---

---