###  Predicting resubmit/returned profiles using machine learning 

Importing all required modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from sklearn import metrics 
pd.set_option('display.max_columns', 0)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import  confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

Loading the dataset using pandas

In [2]:
df = pd.read_excel(r"C:\Users\RameshMisale\Downloads\Latest_profile_data.xlsx")

In [3]:
df.shape

(65968, 278)

In [4]:
df1 = df.copy()

The resubmit stage in datetime format so converting into binary and this is our target variable 

In [5]:
df1['Resubmit'] = pd.to_datetime(df1['Resubmit'])
df1['Resubmit_binary'] = df1['queue_desc'].apply(lambda x: 1 if x=='Resubmit' else 0)

In [6]:
df1['Resubmit_binary'].value_counts() 

0    64493
1     1475
Name: Resubmit_binary, dtype: int64

Filling the null values 

In [7]:
columns_to_fill = ['rcra_non_haz_exempt','halogens_flag','no_reactivity_flag','layered','viscosity','odor_flag','ph_flag',
    'flash_point_flag','boiling_point_flag','btu_per_lbs','pumpable_waste_flag','polymerizable_flag','benzene_waste_flag',
    'voc_100_ppm','marine_pollutant_flag','origin_code','sds_attached','specific_gravity','benzene_section_flag',
    'max_benzene_flag','benzene_water','prohibited_land_disposal','uts_waste','voc_500_ppm','specialpricing_flag',
    'intercompany_flag','mgp_flag','pa_waste_catogory','debris','compressed_gas','analytical_ind',
    'generatorknowledge_ind','sds_ind','formulary_attached','analytical_attached','sample_provided','mgplock_flag',
    'naics_flag','federal_universal_waste','generator_state_universal_waste']
df1[columns_to_fill] = df1[columns_to_fill].fillna(0, inplace=False)

In [8]:
col = ['water_percentage', 'toc_percentage']
df1[col] = df1[col].fillna(df1[col].mean(), inplace=False)

Dropping the object and datatime columns

In [9]:
object_columns = df1.select_dtypes(include=['object','datetime64']).columns
df_dropped_objects = df1.drop(object_columns, axis=1)
print(object_columns)

Index(['profile_number', 'profile_name', 'description', 'status_desc',
       'queue_desc', 'Initiated', 'HistoryInitiated', 'Submitted', 'Assigned',
       'Resubmit', 'ReadyForGenSign', 'SentForGenSign', 'DocSignReturned',
       'Approved', 'vendor_name', 'OriginStatus', 'SourceSystem',
       'GeneratorName', 'Customer Service Rep', 'CustomerName', 'Salesrep',
       'SalesrepEmail', 'profile_number.1', 'profile_name.1',
       'rcra_process_generating', 'pcbs_ppm', 'cyanides_ppm', 'sulfides_ppm',
       'pesticides_ppm', 'dioxins_ppm', 'halogens_pct', 'other_reactivity',
       'solid_pct', 'sludges_pct', 'free_liquids_pct', 'dust_pct', 'odor_desc',
       'color_desc', 'density', 'ph_exact_value', 'comments',
       'record_create_datetime', 'approval_letter_notes',
       'helper_approval_date', 'helper_recert_needed_date',
       'helper_last_received_date', 'helper_expired_date', 'approval_date',
       'helper_first_received_date', 'bulk_sample_anniversary_date',
       'outb

Feature selection dropping constant feature

In [10]:
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(df_dropped_objects)

  self.variances_ = np.nanvar(X, axis=0)
  self.variances_ = np.nanmin(compare_arr, axis=0)


Printing the columns with their values which are constants 

In [11]:
constant_columns = [column for column in df_dropped_objects.columns
                    if column not in df_dropped_objects.columns[var_thres.get_support()]]
print(f"Number of constant columns: {len(constant_columns)}")
for feature in constant_columns:
    unique_values = df_dropped_objects[feature].unique()
    print(f"Constant column: {feature}")
    print(f"Unique values: {unique_values}")
    print("\n")


Number of constant columns: 17
Constant column: phenolics_ppm
Unique values: [nan  0.]


Constant column: minimum_packaging_requirements
Unique values: [nan]


Constant column: outbound_profile_taxes
Unique values: [nan]


Constant column: inbound_oubound_id_xref
Unique values: [nan]


Constant column: ky_report_physical_state_ind
Unique values: [nan]


Constant column: ky_report_onsite_ind
Unique values: [nan]


Constant column: contract_id
Unique values: [nan]


Constant column: sic_code
Unique values: [nan]


Constant column: pet_chem_flag
Unique values: [nan  0.]


Constant column: pet_chem_actual
Unique values: [nan]


Constant column: cokeoven_flag
Unique values: [ 0. nan]


Constant column: HCSId
Unique values: [ 0. nan]


Constant column: container_size_flag
Unique values: [ 1. nan]


Constant column: waste_type_id
Unique values: [nan]


Constant column: highly_toxic_flag
Unique values: [nan  0.]


Constant column: incin_prep_flag
Unique values: [nan  1.]


Constant column: no_

In [12]:
data = df_dropped_objects.drop(constant_columns,axis=1) 

Feature_selection by correlation and dropped the features which are having >80% correlation with each other

In [13]:
def correlation(dataset,threshold):
    col_corr=set()
    corr_matrix=dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i,j]>threshold):
                colname=corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

In [14]:
corr_features=correlation(data,0.8)
len(set(corr_features))
print(corr_features)

{'use_eei_rate', 'infectious_flag', 'organic_peroxide_flag', 'no_reactivity_flag', 'CustomerId.1', 'waste_water_flag', 'dust_hazard_flag', 'pesticide_herbicide_flag', 'unused_commercial_product_flag', 'asbestos_friable_flag', 'ProfiledTonnage', 'formulary_attached', 'standard_flag.1', 'cercla_flag', 'JobID', 'layered', 'puso_flag', 'rush_flag.1', 'vendor_id.1', 'isRecertified.1', 'outbound_vendor_id.1', 'subpart_p_flag', 'flammable_solid_flag', 'web_profile_number.1', 'outbound_profile_ind.1', 'presignedstatus', 'radioactive_flag', 'cylinder_flag', 'oxidizer_flag', 'urgent_flag.1', 'metals_flag', 'initial_revision_profile_id', 'OracleLocationCode', 'ppe_code_id', 'ammonia_flag', 'spill_residue_flag', 'company_id', 'RevenueAssignedToVendorId', 'profile_id.1', 'btu_per_lbs', 'none_flag', 'pfas_pfoa_flag', 'loosepack_flag', 'rcra_debris_flag', 'copied_from_Id.1', 'ph_flag', 'oral_toxicity_flag_liquid', 'medical_sharps_needles_flag', 'bulk_solid_flag', 'subject_to_subpart_cc_flag', 'asbest

In [15]:
data1=data.drop(corr_features,axis=1)

Dropping the columns which are having >90 % of null values

In [16]:
threshold_percentage = 90 
null_percentage = (data1.isnull().sum() / len(data1)) * 100
columns_to_drop = null_percentage[null_percentage > threshold_percentage].index
data1_dropped = data1.drop(columns=columns_to_drop)
#data1_dropped.isnull().sum()

In [17]:
data1_dropped = data1_dropped.fillna(0) 

In [18]:
df_final = data1_dropped.drop(columns=['DaysAssignReadyForGenSign','profile_id','ReturnCount',
                                  'DaysSubmitToAssign','DaysAssignReadyForGenSign','DaysReadyForGenSignSentForGenSign',
                            'container_type_id','DaysInitiatedToSubmitted',
'vendor_id','ContractID','ldr_class_id','CustomerId','CollectionId','DaysDocSignReturnedToApproved',
'Recert','is_template_profile_flag','status_code_id','source_code_id','form_code_id',
'management_method_code_id','outbound_profile_id','price_type_code_id','health_chemical_identity_id',
'flammability_chemical_identity_id','reactivity_chemical_identity_id','process_code_id',
'SalesrepID','InternalCoordinatorID','MarketDriverID','InsideSalesRepID','requested_process_code_id'],axis=1)
df_final.shape

(65968, 86)

In [19]:
df_final.columns

Index(['web_profile_number', 'DaysSentForGenSignToDocSignReturned', 'IsHaz',
       'outbound_profile_ind', 'isRecertified', 'standard_flag', 'rush_flag',
       'urgent_flag', 'rcra_non_haz_exempt', 'pcbs_flag', 'cyanides_flag',
       'sulfides_flag', 'pesticides_flag', 'halogens_flag',
       'infectious_bio_waste_flag', 'nrc_regulated_radioactive_flag',
       'pyrophoric_reactivity_flag', 'cyanides_reactivity_flag',
       'sulfides_reactivity_flag', 'water_reactivity_flag',
       'shock_reactivity_flag', 'dot_explosive_flag', 'gas_flag',
       'aerosol_flag', 'lab_pack_flag', 'monolith_flag', 'solid_flag',
       'sludges_flag', 'free_liquids_flag', 'dust_flag', 'viscosity',
       'odor_flag', 'flash_point_flag', 'boiling_point_flag',
       'poisonous_by_inhalation_flag', 'vented_drums_flag',
       'pumpable_waste_flag', 'polymerizable_flag', 'benzene_waste_flag',
       'ozone_depleting_flag', 'large_scape_pieces_flag', 'voc_100_ppm',
       'marine_pollutant_flag', 'parent

In [20]:
selected_columns = [
    'flash_point_flag', 'water_reactivity_flag', 'monolith_flag', 'sulfides_reactivity_flag', 'cyanides_reactivity_flag',
    'intercompany_flag', 'specialpricing_flag', 'benzene_waste_flag', 'sulfides_flag', 'solid_flag', 'hybrid_flag',
    'shock_reactivity_flag', 'nrc_regulated_radioactive_flag', 'lab_pack_flag', 'gas_flag', 'directship_flag',
    'naics_flag', 'infectious_bio_waste_flag', 'mgp_flag', 'aerosol_flag', 'pyrophoric_reactivity_flag', 'sludges_flag',
    'parent_profile_id', 'pcbs_flag', 'halogens_flag', 'cyanides_flag', 'pesticides_flag', 'dot_explosive_flag',
    'boiling_point_flag', 'isRecertified', 'labpack_flag', 'national_flag'
]

new_dataframe = df_final[selected_columns + ['Resubmit_binary']]

new_dataframe.head()


Unnamed: 0,flash_point_flag,water_reactivity_flag,monolith_flag,sulfides_reactivity_flag,cyanides_reactivity_flag,intercompany_flag,specialpricing_flag,benzene_waste_flag,sulfides_flag,solid_flag,hybrid_flag,shock_reactivity_flag,nrc_regulated_radioactive_flag,lab_pack_flag,gas_flag,directship_flag,naics_flag,infectious_bio_waste_flag,mgp_flag,aerosol_flag,pyrophoric_reactivity_flag,sludges_flag,parent_profile_id,pcbs_flag,halogens_flag,cyanides_flag,pesticides_flag,dot_explosive_flag,boiling_point_flag,isRecertified,labpack_flag,national_flag,Resubmit_binary
0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0,0.0,0,0.0,0.0,0
1,2.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,464539.0,0,0.0,0.0,0.0,0,0.0,1,0.0,0.0,0
2,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,560846.0,0,1.0,0.0,0.0,0,0.0,1,0.0,0.0,0
3,5.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,560845.0,0,1.0,0.0,0.0,0,0.0,1,0.0,0.0,0
4,4.0,0,0.0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,329151.0,0,0.0,0.0,0.0,0,2.0,1,0.0,0.0,0


In [21]:
new_dataframe.shape

(65968, 33)

In [22]:
X = new_dataframe.drop('Resubmit_binary', axis=1)
y = new_dataframe['Resubmit_binary']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
print(df_final['Resubmit_binary'].value_counts())


0    64493
1     1475
Name: Resubmit_binary, dtype: int64


Feature selection using entropy typically involves selecting features based on their information gain or entropy.

# Logistic Regression

In [24]:
# X = new_dataframe.drop('Resubmit_binary', axis=1)
# y = new_dataframe['Resubmit_binary']
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import precision_score, recall_score, f1_score
# logistic_regression = LogisticRegression()
# logistic_regression.fit(X_train, y_train)
# y_pred_lr_train = logistic_regression.predict(X_train)
# y_pred_lr_test = logistic_regression.predict(X_test)

# # Accuracy
# accuracy_lr_train = accuracy_score(y_train, y_pred_lr_train)
# accuracy_lr_test = accuracy_score(y_test, y_pred_lr_test)
# print(f'Logistic Regression Training Accuracy: {accuracy_lr_train}')
# print(f'Logistic Regression Testing Accuracy: {accuracy_lr_test}')
# # classification report
# # print(classification_report(y_test, y_pred_lr_test))
# # print(confusion_matrix(y_test, y_pred_lr_test)) 

# precision_lr_test = precision_score(y_test, y_pred_lr_test)
# recall_lr_test = recall_score(y_test, y_pred_lr_test)
# f1_lr_test = f1_score(y_test, y_pred_lr_test)

# # Display precision, recall, and f1-score
# print(f'\nPrecision: {precision_lr_test:.4f}')
# print(f'Recall: {recall_lr_test:.4f}')
# print(f'F1-Score: {f1_lr_test:.4f}')



# DT

In [25]:
# from sklearn.tree import DecisionTreeClassifier
# decision_tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
# decision_tree.fit(X_train, y_train)
# y_pred_dt_train = decision_tree.predict(X_train)
# y_pred_dt_test = decision_tree.predict(X_test)
# accuracy_dt_train = accuracy_score(y_train, y_pred_dt_train)
# accuracy_dt_test = accuracy_score(y_test, y_pred_dt_test)
# print(f'Decision Tree Training Accuracy: {accuracy_dt_train}')
# print(f'Decision Tree Testing Accuracy: {accuracy_dt_test}')
# # # classification report
# # print(classification_report(y_test, y_pred_dt_test))
# # print(confusion_matrix(y_test, y_pred_dt_test))

# precision_dt_test = precision_score(y_test, y_pred_dt_test)
# recall_dt_test = recall_score(y_test, y_pred_dt_test)
# f1_dt_test = f1_score(y_test, y_pred_dt_test)

# # Display precision, recall, and f1-score
# print(f'\nPrecision: {precision_dt_test:.4f}')
# print(f'Recall: {recall_dt_test:.4f}')
# print(f'F1-Score: {f1_dt_test:.4f}')

In [26]:
# from sklearn.naive_bayes import GaussianNB
# naive_bayes = GaussianNB()
# naive_bayes.fit(X_train, y_train)
# y_pred_nb_train = naive_bayes.predict(X_train)
# y_pred_nb_test = naive_bayes.predict(X_test)

# # Accuracy
# accuracy_nb_train = accuracy_score(y_train, y_pred_nb_train)
# accuracy_nb_test = accuracy_score(y_test, y_pred_nb_test)
# print(f'Naive Bayes Training Accuracy: {accuracy_nb_train:.4f}')
# print(f'Naive Bayes Testing Accuracy: {accuracy_nb_test:.4f}')

# # Precision, Recall, and F1-Score
# precision_nb_test = precision_score(y_test, y_pred_nb_test)
# recall_nb_test = recall_score(y_test, y_pred_nb_test)
# f1_nb_test = f1_score(y_test, y_pred_nb_test)

# # Display precision, recall, and f1-score
# print(f'\nPrecision: {precision_nb_test:.4f}')
# print(f'Recall: {recall_nb_test:.4f}')
# print(f'F1-Score: {f1_nb_test:.4f}')


In [27]:
# from sklearn.neighbors import KNeighborsClassifier
# k_neighbors = 7
# knn_classifier = KNeighborsClassifier(n_neighbors=k_neighbors)
# knn_classifier.fit(X_train, y_train)
# y_pred_knn_train = knn_classifier.predict(X_train)

# # Predictions on the test set
# y_pred_knn_test = knn_classifier.predict(X_test)

# # Accuracy
# accuracy_knn_train = accuracy_score(y_train, y_pred_knn_train)
# accuracy_knn_test = accuracy_score(y_test, y_pred_knn_test)
# print(f'KNN Training Accuracy: {accuracy_knn_train:.4f}')
# print(f'KNN Testing Accuracy: {accuracy_knn_test:.4f}')

# # Precision, Recall, and F1-Score
# precision_knn_test = precision_score(y_test, y_pred_knn_test)
# recall_knn_test = recall_score(y_test, y_pred_knn_test)
# f1_knn_test = f1_score(y_test, y_pred_knn_test)

# # Display precision, recall, and f1-score
# print(f'\nPrecision: {precision_knn_test:.4f}')
# print(f'Recall: {recall_knn_test:.4f}')
# print(f'F1-Score: {f1_knn_test:.4f}')


In [28]:
# from sklearn.ensemble import IsolationForest
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# isolation_forest_model = IsolationForest(contamination=0.01, random_state=42)
# isolation_forest_model.fit(X_train)
# y_pred_rf_train = isolation_forest_model.predict(X_train)
# y_pred_rf_test = isolation_forest_model.predict(X_test)
# # Convert predictions to binary labels (1 for outliers, -1 for inliers)
# y_pred_rf_train_binary = [1 if pred == -1 else 0 for pred in y_pred_rf_train]
# y_pred_rf_test_binary = [1 if pred == -1 else 0 for pred in y_pred_rf_test]
# accuracy_rf_train = accuracy_score(y_train, y_pred_rf_train_binary)
# accuracy_rf_test = accuracy_score(y_test, y_pred_rf_test_binary)
# print(f'Isolation Forest Training Accuracy: {accuracy_rf_train}')
# print(f'Isolation Forest Testing Accuracy: {accuracy_rf_test}')

# precision_rf_test = precision_score(y_test, y_pred_rf_test_binary)
# recall_rf_test = recall_score(y_test, y_pred_rf_test_binary)
# f1_rf_test = f1_score(y_test, y_pred_rf_test_binary)

# print(f'\nPrecision: {precision_rf_test:.4f}')
# print(f'Recall: {recall_rf_test:.4f}')
# print(f'F1-Score: {f1_rf_test:.4f}')


# Under Sampling

In [29]:
new_dataframe.shape

(65968, 33)

In [30]:
XX = new_dataframe.drop('Resubmit_binary', axis='columns')
yy = new_dataframe['Resubmit_binary']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
count_0_class,count_1_class = new_dataframe.Resubmit_binary.value_counts()
df_class_0 = new_dataframe[new_dataframe["Resubmit_binary"]==0]
df_class_1 = new_dataframe[new_dataframe["Resubmit_binary"]==1]
XX_train, XX_test, yy_train, yy_test = train_test_split(XX,yy, test_size=0.2,random_state=15,stratify=yy)

In [32]:
# count_0_class_under = int(count_0_class*0.8)
# df_class_0_under = df_class_0.sample(count_0_class_under)
# df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

# print('Random Under-sampling:')
# print(df_test_under.Resubmit_binary.value_counts())

In [33]:
# df_class_0_under = df_class_0.sample(count_1_class)
# df_test_under = pd.concat([df_class_0_under, df_class_1],axis=0)
# print('Random Under-sampling:')
# print(df_test_under.Resubmit_binary.value_counts())

In [34]:
# XX = df_test_under.drop('Resubmit_binary', axis='columns')
# yy= df_test_under['Resubmit_binary']
# #XX_train, XX_test, yy_train, yy_test = train_test_split(XX,yy, test_size=0.2,random_state=42,stratify=yy)

In [35]:
XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size=0.2, random_state=15, stratify=yy)

df_class_0_train = XX_train[yy_train == 0]
df_class_1_train = XX_train[yy_train == 1]
df_class_0_under_train = df_class_0_train.sample(n=len(df_class_1_train), replace=True, random_state=42)
XX_train_under = pd.concat([df_class_0_under_train, df_class_1_train], axis=0)
yy_train_under = yy_train.loc[XX_train_under.index]

print('Random Under-sampling for Training Data:')
print(yy_train_under.value_counts())


Random Under-sampling for Training Data:
0    1180
1    1180
Name: Resubmit_binary, dtype: int64


In [36]:
XX_train_under.shape

(2360, 32)

In [37]:
yy_test.value_counts()

0    12899
1      295
Name: Resubmit_binary, dtype: int64

# Decision_tree

In [38]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import precision_score, recall_score, f1_score
# logistic_regression = LogisticRegression()
# logistic_regression.fit(XX_train_under, yy_train_under)
# y_pred_lr_train = logistic_regression.predict(XX_train_under)
# y_pred_lr_test = logistic_regression.predict(XX_test)
# accuracy_lr_train = accuracy_score(yy_train_under, y_pred_lr_train)
# accuracy_lr_test = accuracy_score(yy_test, y_pred_lr_test)
# print(f'Logistic Regression Training Accuracy: {accuracy_lr_train}')
# print(f'Logistic Regression Testing Accuracy: {accuracy_lr_test}')
# precision_lr_test = precision_score(yy_test, y_pred_lr_test)
# recall_lr_test = recall_score(yy_test, y_pred_lr_test)
# f1_lr_test = f1_score(yy_test, y_pred_lr_test)

# print(f'\nPrecision: {precision_lr_test:.4f}')
# print(f'Recall: {recall_lr_test:.4f}')
# print(f'F1-Score: {f1_lr_test:.4f}')

# # cv_scores = cross_val_score(logistic_regression, XX_train, yy_train, cv=10) 
# # print("Cross-Validation Scores:", cv_scores)

Logistic Regression Training Accuracy: 0.5800847457627119
Logistic Regression Testing Accuracy: 0.5652569349704412

Precision: 0.0288
Recall: 0.5627
F1-Score: 0.0547


In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

decision_tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
decision_tree.fit(XX_train_under, yy_train_under)
y_pred_dt_train = decision_tree.predict(XX_train_under)
y_pred_dt_test = decision_tree.predict(XX_test)
accuracy_dt_train = accuracy_score(yy_train_under, y_pred_dt_train)
accuracy_dt_test = accuracy_score(yy_test, y_pred_dt_test)
print(f'Decision Tree Training Accuracy: {accuracy_dt_train}')
print(f'Decision Tree Testing Accuracy: {accuracy_dt_test}')
# # classification report
# print(classification_report(y_test, y_pred_dt_test))
# print(confusion_matrix(y_test, y_pred_dt_test))

precision_dt_test = precision_score(yy_test, y_pred_dt_test)
recall_dt_test = recall_score(yy_test, y_pred_dt_test)
f1_dt_test = f1_score(yy_test, y_pred_dt_test)

# Display precision, recall, and f1-score
print(f'\nPrecision: {precision_dt_test:.4f}')
print(f'Recall: {recall_dt_test:.4f}')
print(f'F1-Score: {f1_dt_test:.4f}')

# cv_scores = cross_val_score(decision_tree, XX_train, yy_train, cv=10) 
# print("Cross-Validation Scores:", cv_scores)

Decision Tree Training Accuracy: 0.7741525423728813
Decision Tree Testing Accuracy: 0.6609822646657572

Precision: 0.0552
Recall: 0.8780
F1-Score: 0.1038


In [41]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(yy_test, y_pred_dt_test)
tn, fp, fn, tp = conf_matrix.ravel()

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

print(f'Sensitivity (Recall): {sensitivity:.4f}')
print(f'Specificity: {specificity:.4f}')


Sensitivity (Recall): 0.8780
Specificity: 0.6560


In [None]:
# from sklearn.naive_bayes import GaussianNB
# naive_bayes = GaussianNB()
# naive_bayes.fit(XX_train_under, yy_train_under)
# y_pred_nb_train = naive_bayes.predict(XX_train_under)
# y_pred_nb_test = naive_bayes.predict(XX_test)

# # Accuracy
# accuracy_nb_train = accuracy_score(yy_train_under, y_pred_nb_train)
# accuracy_nb_test = accuracy_score(yy_test, y_pred_nb_test)
# print(f'Naive Bayes Training Accuracy: {accuracy_nb_train:.4f}')
# print(f'Naive Bayes Testing Accuracy: {accuracy_nb_test:.4f}')

# precision_nb_test = precision_score(yy_test, y_pred_nb_test)
# recall_nb_test = recall_score(yy_test, y_pred_nb_test)
# f1_nb_test = f1_score(yy_test, y_pred_nb_test)

# print(f'\nPrecision: {precision_nb_test:.4f}')
# print(f'Recall: {recall_nb_test:.4f}')
# print(f'F1-Score: {f1_nb_test:.4f}')


In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# k_neighbors = 5
# knn_classifier = KNeighborsClassifier(n_neighbors=k_neighbors)
# knn_classifier.fit(XX_train_under, yy_train_under)
# y_pred_knn_train = knn_classifier.predict(XX_train_under)

# # Predictions on the test set
# y_pred_knn_test = knn_classifier.predict(XX_test)

# # Accuracy
# accuracy_knn_train = accuracy_score(yy_train_under, y_pred_knn_train)
# accuracy_knn_test = accuracy_score(yy_test, y_pred_knn_test)
# print(f'KNN Training Accuracy: {accuracy_knn_train:.4f}')
# print(f'KNN Testing Accuracy: {accuracy_knn_test:.4f}')

# precision_knn_test = precision_score(yy_test, y_pred_knn_test)
# recall_knn_test = recall_score(yy_test, y_pred_knn_test)
# f1_knn_test = f1_score(yy_test, y_pred_knn_test)

# print(f'\nPrecision: {precision_knn_test:.4f}')
# print(f'Recall: {recall_knn_test:.4f}')
# print(f'F1-Score: {f1_knn_test:.4f}')


In [None]:
# from xgboost import XGBClassifier
# import xgboost
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# xgb_model = XGBClassifier()
# xgb_model.fit(XX_train_under, yy_train_under)

# y_pred_xgb_train = xgb_model.predict(XX_train_under)
# y_pred_xgb_test = xgb_model.predict(XX_test)

# accuracy_xgb_train = accuracy_score(yy_train_under, y_pred_xgb_train)
# accuracy_xgb_test = accuracy_score(yy_test, y_pred_xgb_test)

# print(f'XGBoost Training Accuracy: {accuracy_xgb_train}')
# print(f'XGBoost Testing Accuracy: {accuracy_xgb_test}')

# precision_xgb_test = precision_score(yy_test, y_pred_xgb_test)
# recall_xgb_test = recall_score(yy_test, y_pred_xgb_test)
# f1_xgb_test = f1_score(yy_test, y_pred_xgb_test)

# print(f'\nPrecision (XGBoost): {precision_xgb_test:.4f}')
# print(f'Recall (XGBoost): {recall_xgb_test:.4f}')
# print(f'F1-Score (XGBoost): {f1_xgb_test:.4f}')


# # cv_scores = cross_val_score(xgb_model, XX_train, yy_train, cv=10) 
# # print("Cross-Validation Scores:", cv_scores)

In [42]:
import joblib
model = decision_tree
joblib.dump(model, 'decision_tree.pkl', protocol=4)  # Use a specific protocol version

['decision_tree.pkl']

In [43]:
import joblib
model_case = joblib.load('decision_tree.pkl')

In [None]:
#X_top30.to_csv('your_file_path.csv', index=False)
#new_dataframe.to_csv(r'C:\Users\RameshMisale\Downloads\new_dataframe1.csv', index=False)

# Gaussian Naive Bayes classifier

In [None]:
# from sklearn.naive_bayes import GaussianNB
# naive_bayes = GaussianNB()
# naive_bayes.fit(XX_train, yy_train)
# y_pred_nb_train = naive_bayes.predict(XX_train)
# y_pred_nb_test = naive_bayes.predict(XX_test)

# # Accuracy
# accuracy_nb_train = accuracy_score(yy_train, y_pred_nb_train)
# accuracy_nb_test = accuracy_score(yy_test, y_pred_nb_test)
# print(f'Naive Bayes Training Accuracy: {accuracy_nb_train:.4f}')
# print(f'Naive Bayes Testing Accuracy: {accuracy_nb_test:.4f}')

# # Precision, Recall, and F1-Score
# precision_nb_test = precision_score(yy_test, y_pred_nb_test)
# recall_nb_test = recall_score(yy_test, y_pred_nb_test)
# f1_nb_test = f1_score(yy_test, y_pred_nb_test)

# # Display precision, recall, and f1-score
# print(f'\nPrecision: {precision_nb_test:.4f}')
# print(f'Recall: {recall_nb_test:.4f}')
# print(f'F1-Score: {f1_nb_test:.4f}')


# KNeighborsClassifier

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# k_neighbors = 5
# knn_classifier = KNeighborsClassifier(n_neighbors=k_neighbors)
# knn_classifier.fit(XX_train, yy_train)
# y_pred_knn_train = knn_classifier.predict(XX_train)

# # Predictions on the test set
# y_pred_knn_test = knn_classifier.predict(XX_test)

# # Accuracy
# accuracy_knn_train = accuracy_score(yy_train, y_pred_knn_train)
# accuracy_knn_test = accuracy_score(yy_test, y_pred_knn_test)
# print(f'KNN Training Accuracy: {accuracy_knn_train:.4f}')
# print(f'KNN Testing Accuracy: {accuracy_knn_test:.4f}')

# # Precision, Recall, and F1-Score
# precision_knn_test = precision_score(yy_test, y_pred_knn_test)
# recall_knn_test = recall_score(yy_test, y_pred_knn_test)
# f1_knn_test = f1_score(yy_test, y_pred_knn_test)

# # Display precision, recall, and f1-score
# print(f'\nPrecision: {precision_knn_test:.4f}')
# print(f'Recall: {recall_knn_test:.4f}')
# print(f'F1-Score: {f1_knn_test:.4f}')
