In [5]:
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import plotly_express as px


In [2]:
final_data=joblib.load('modified_adult_data')
final_data.sample(5)

Unnamed: 0,age,fnlwgt,education-num,marital-status,relationship,race,sex,capital-gain,capital-loss,hours-per-week,salary,workclass,occupation,country
10758,63,236338,9,Married-civ-spouse,Wife,White,Female,0,0,35,0,Private,Sales,United-States
16947,34,34848,10,Married-civ-spouse,Husband,White,Male,4064,0,40,0,Private,Transport-moving,United-States
3960,43,174575,10,Divorced,Unmarried,White,Male,0,0,45,0,Private,Prof-specialty,United-States
32436,37,40955,15,Married-civ-spouse,Husband,White,Male,0,0,50,1,Private,Prof-specialty,United-States
8724,52,177727,6,Married-civ-spouse,Husband,White,Male,4064,0,45,0,Self-emp-inc,Sales,United-States


In [3]:
final_data.duplicated().sum() # removed if duplicates are present
final_data.drop_duplicates(inplace=True)

In [4]:
final_data.isna().sum()  # dropped if nan values are present
final_data.dropna(inplace=True)

In [5]:
for col in final_data.select_dtypes(exclude='int64').columns:
   a=final_data[col].unique()  # no presence of ' ?' in the data
   print(a)

[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
[' Male' ' Female']
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']
[' United-States' ' Cuba' ' Jamaica' ' India' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' Laos' ' England' ' Canada' ' Germany'
 ' Iran' ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia'
 ' Thailand' ' Ecuador' ' Taiwan' ' Haiti' ' Portugal'
 ' Dominican-Republic' ' El-Salvador' ' France' ' Guatemala' ' China'
 '

In [6]:
#split the input and output data
x=final_data.drop(['salary'],axis=1)
y=final_data[['salary']]

#fetching category and numeric column varibles
category_data=x.select_dtypes(include='object').columns
numeric_data=x.select_dtypes(exclude='object').columns

In [7]:
x[category_data].sample(5)
for col in category_data:
    print(col,x[col].nunique())

marital-status 7
relationship 6
race 5
sex 2
workclass 8
occupation 14
country 41


In [8]:
#using a list comprehension combined with the strip() method, which removes any leading and trailing whitespace from a string.
#applymap: This is a pandas DataFrame method that applies a function to every element in the DataFrame. 
# It is different from apply, which applies a function along an axis (either rows or columns), and applymap works element-wise.

#removed whitespaces of string data 
x[category_data]=x[category_data].applymap(lambda x: x.strip() if isinstance(x,str) else x)
x['country'].unique()

  x[category_data]=x[category_data].applymap(lambda x: x.strip() if isinstance(x,str) else x)


array(['United-States', 'Cuba', 'Jamaica', 'India', 'Mexico', 'South',
       'Puerto-Rico', 'Honduras', 'Laos', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [15]:
# Initialise the OneHotEncoder instance
encoder_instance1 = OneHotEncoder(drop='first')

# here we also have to fit the instance so there we wont face issue in getting feature names properly

# pipeline 
pipeline_p = Pipeline(steps=[('onehot', encoder_instance1.fit(x[category_data]))])

# applying column transformer 
preprocessor1_p = ColumnTransformer(transformers=[('cat_encoding', pipeline_p, category_data)])

# fit the pipeline to the training data 
fit_preprocessor_p = preprocessor1_p.fit(x[category_data])

# dump the preprocessing model
joblib.dump(fit_preprocessor_p, "1hot_model_p.joblib")

# Transform the data
transformed_data1_p = fit_preprocessor_p.transform(x[category_data]).toarray()

# Get feature names
feature_names1_p = encoder_instance1.get_feature_names_out(category_data)

# Create a DataFrame with the transformed data and feature names
encoded_data1_p = pd.DataFrame(transformed_data1_p, columns=feature_names1_p)
encoded_data1_p


Unnamed: 0,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,...,country_Portugal,country_Puerto-Rico,country_Scotland,country_South,country_Taiwan,country_Thailand,country_Trinadad&Tobago,country_United-States,country_Vietnam,country_Yugoslavia
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32531,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32532,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32533,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
for col in category_data:
    a=x[col].nunique()
    print(f'the {col} has {a}\n')

the marital-status has 7

the relationship has 6

the race has 5

the sex has 2

the workclass has 8

the occupation has 14

the country has 41



In [16]:
# Initialise the OneHotEncoder instance
encoder_instance1 = OneHotEncoder(drop='first')

# here we also have to fit the instance so there we  ]wont face issue in getting feature names properly

# pipeline 
pipeline1 = Pipeline(steps=[('onehot', encoder_instance1.fit(x[category_data]))])

# applying column transformer 
preprocessor1 = ColumnTransformer(transformers=[('cat_encoding', pipeline1, category_data)])

# fit the pipeline to the training data 
fit_preprocessor1 = preprocessor1.fit(x[category_data])

# dump the preprocessing model
joblib.dump(fit_preprocessor1, "1hot_model")

# Transform the data
transformed_data1 = fit_preprocessor1.transform(x[category_data]).toarray()

# Get feature names
feature_names1 = encoder_instance1.get_feature_names_out(category_data)

# Create a DataFrame with the transformed data and feature names
encoded_data1 = pd.DataFrame(transformed_data1, columns=feature_names1)
encoded_data1 


Unnamed: 0,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,...,country_Portugal,country_Puerto-Rico,country_Scotland,country_South,country_Taiwan,country_Thailand,country_Trinadad&Tobago,country_United-States,country_Vietnam,country_Yugoslavia
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32531,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32532,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32533,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [12]:
category_data.nunique()#7

category_data

Index(['marital-status', 'relationship', 'race', 'sex', 'workclass',
       'occupation', 'country'],
      dtype='object')

In [13]:
x[numeric_data].describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32535.0,32535.0,32535.0,32535.0,32535.0,32535.0
mean,38.586783,189782.9,10.081943,1078.510035,87.373598,40.44174
std,13.637488,105559.3,2.571612,7388.179666,403.113641,12.345943
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117821.5,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,236993.5,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [14]:

for col in x[numeric_data].columns:
    px.histogram(x[numeric_data][col]).show()

KeyboardInterrupt: 

In [None]:
for col in x[category_data].columns:
    px.histogram(x[category_data][col]).show()

In [17]:
# scale the numeric data 

from sklearn.preprocessing import MinMaxScaler

scale_instance=MinMaxScaler()
scale_pipeline=Pipeline(steps=[('scale',scale_instance)])
scaled_fit_model=scale_pipeline.fit(x[numeric_data])
#save the pipeline
joblib.dump(scaled_fit_model,'scale_model')

# transform the numeric data 

scaled_data=pd.DataFrame(scaled_fit_model.transform(x[numeric_data]),columns=numeric_data)
scaled_data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32535.0,32535.0,32535.0,32535.0,32535.0,32535.0
mean,0.295709,0.120548,0.605463,0.010785,0.020058,0.402467
std,0.186815,0.071691,0.171441,0.073883,0.092542,0.125979
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.071676,0.533333,0.0,0.0,0.397959
50%,0.273973,0.112788,0.6,0.0,0.0,0.397959
75%,0.424658,0.152612,0.733333,0.0,0.0,0.44898
max,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
x_final=pd.concat([scaled_data,encoded_data1],axis=1)
x_final

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,...,country_Portugal,country_Puerto-Rico,country_Scotland,country_South,country_Taiwan,country_Thailand,country_Trinadad&Tobago,country_United-States,country_Vietnam,country_Yugoslavia
0,0.301370,0.044302,0.800000,0.021740,0.0,0.397959,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.452055,0.048238,0.800000,0.000000,0.0,0.122449,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.287671,0.138113,0.533333,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.493151,0.151068,0.400000,0.000000,0.0,0.397959,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.150685,0.221488,0.800000,0.000000,0.0,0.397959,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,0.136986,0.166404,0.733333,0.000000,0.0,0.377551,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32531,0.315068,0.096500,0.533333,0.000000,0.0,0.397959,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32532,0.561644,0.094827,0.533333,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32533,0.068493,0.128499,0.533333,0.000000,0.0,0.193878,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [2]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc,accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


In [20]:
X_train_f1, X_test_f1, y_train_f1, y_test_f1=train_test_split(x_final,y, test_size=0.2,random_state=42,stratify=y)

In [None]:

# BASELINE Model  On the  final data 

instance_f3=LogisticRegression()
pre_model_f3=instance_f3.fit(X_train_f1, y_train_f1)
prediction_f3=pre_model_f3.predict(X_test_f1) 
accuracy_score(y_test_f1,prediction_f3)
report_f3=classification_report(y_test_f1,prediction_f3) # #86% ; f1(macro avg)--0.79 !! f1score(wgt avg)--0.85
print("Classification Report:")
print(report_f3)

#####################################################################################################################################

instance_f1=DecisionTreeClassifier()
pre_model_f1=instance_f1.fit(X_train_f1, y_train_f1)
prediction_f1=pre_model_f1.predict(X_test_f1)

accuracy_score(y_test_f1,prediction_f1) ##82%; f1(macro avg)--0.76 !! f1score(wgt avg)--0.82
report_f1=classification_report(y_test_f1,prediction_f1)
#print("Classification Report:")
print(report_f1)

##############################################################################################################################

instance_f2=KNeighborsClassifier()
pre_model_f2=instance_f2.fit(X_train_f1, y_train_f1)
prediction_f2=pre_model_f2.predict(X_test_f1)  

accuracy_score(y_test_f1,prediction_f2)           
report_f2=classification_report(y_test_f1,prediction_f2) # #83% ; f1(macro avg)--0.76!! f1score(wgt avg)--0.83
print("Classification Report:")
print(report_f2)

#####################################################################################################
svc1=SVC()
svc1.fit(X_train_f1,y_train_f1)
prediction_s=svc1.predict(X_test_f1)
     
accuracy_score(y_test_f1,prediction_s)     
report_s=classification_report(y_test_f1,prediction_s) # #84% ; f1(macro avg)--0.77 !! f1score(wgt avg)--0.84
print("Classification Report:")
print(report_s)
########################################################################################################



  y = column_or_1d(y, warn=True)


Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      4939
           1       0.73      0.62      0.68      1568

    accuracy                           0.86      6507
   macro avg       0.81      0.78      0.79      6507
weighted avg       0.85      0.86      0.85      6507

              precision    recall  f1-score   support

           0       0.88      0.87      0.88      4939
           1       0.61      0.64      0.63      1568

    accuracy                           0.82      6507
   macro avg       0.75      0.75      0.75      6507
weighted avg       0.82      0.82      0.82      6507



  return self._fit(X, y)


Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      4939
           1       0.67      0.60      0.63      1568

    accuracy                           0.83      6507
   macro avg       0.77      0.75      0.76      6507
weighted avg       0.83      0.83      0.83      6507



  y = column_or_1d(y, warn=True)


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      4939
           1       0.71      0.57      0.63      1568

    accuracy                           0.84      6507
   macro avg       0.79      0.75      0.77      6507
weighted avg       0.83      0.84      0.83      6507

              precision    recall  f1-score   support

           0       0.90      0.93      0.92      4939
           1       0.77      0.68      0.72      1568

    accuracy                           0.87      6507
   macro avg       0.83      0.81      0.82      6507
weighted avg       0.87      0.87      0.87      6507



In [21]:
#xgboostclassifier

instance_xg=XGBClassifier()
pre_model_xg=instance_xg.fit(X_train_f1, y_train_f1)
joblib.dump(pre_model_xg,'final_model')
prediction_xg=pre_model_xg.predict(X_test_f1)

#metrics 
accuracy_score(y_test_f1,prediction_xg)  #88%; f1(macro avg)--0.82 !! f1score(wgt avg)--0.88  #(1;recall(67%))
report_xg=classification_report(y_test_f1,prediction_xg)
#print("Classification Report:")
print(report_xg)


              precision    recall  f1-score   support

           0       0.90      0.93      0.92      4939
           1       0.77      0.68      0.72      1568

    accuracy                           0.87      6507
   macro avg       0.83      0.81      0.82      6507
weighted avg       0.87      0.87      0.87      6507



In [None]:
from catboost import CatBoostClassifier
pre_model_cb=CatBoostClassifier()
pre_model_cb=pre_model_cb.fit(X_train_f1, y_train_f1)
prediction_cb=pre_model_cb.predict(X_test_f1)

#metrics 
accuracy_score(y_test_f1,prediction_cb)  #87%; f1(macro avg)--0.82 !! f1score(wgt avg)--0.87  #(1;recall(67%))
report_cb=classification_report(y_test_f1,prediction_cb)
#print("Classification Report:")
print(report_cb)

Learning rate set to 0.041431
0:	learn: 0.6545916	total: 199ms	remaining: 3m 18s
1:	learn: 0.6184825	total: 217ms	remaining: 1m 48s
2:	learn: 0.5834168	total: 230ms	remaining: 1m 16s
3:	learn: 0.5525328	total: 242ms	remaining: 1m
4:	learn: 0.5264649	total: 256ms	remaining: 50.9s
5:	learn: 0.5033225	total: 268ms	remaining: 44.5s
6:	learn: 0.4834201	total: 284ms	remaining: 40.3s
7:	learn: 0.4661179	total: 297ms	remaining: 36.9s
8:	learn: 0.4515895	total: 315ms	remaining: 34.7s
9:	learn: 0.4364260	total: 329ms	remaining: 32.6s
10:	learn: 0.4245537	total: 342ms	remaining: 30.8s
11:	learn: 0.4145439	total: 356ms	remaining: 29.3s
12:	learn: 0.4051277	total: 371ms	remaining: 28.2s
13:	learn: 0.3959256	total: 386ms	remaining: 27.2s
14:	learn: 0.3885555	total: 400ms	remaining: 26.2s
15:	learn: 0.3819758	total: 414ms	remaining: 25.5s
16:	learn: 0.3760790	total: 436ms	remaining: 25.2s
17:	learn: 0.3702184	total: 450ms	remaining: 24.5s
18:	learn: 0.3652852	total: 465ms	remaining: 24s
19:	learn: 0.

In [None]:
svc2=SVC(kernel="poly",gamma=0.35,C=0.1)
svc2.fit(X_train_f1,y_train_f1)
prediction_svc2=svc2.predict(X_test_f1)
     
accuracy_score(y_test_f1,prediction_svc2)     

report_svc2=classification_report(y_test_f1,prediction_svc2) # #85% ; f1(macro avg)--0.77 !! f1score(wgt avg)--0.84--poly
print("Classification Report:")
print(report_svc2)

In [None]:
#hypertuning the logistic regression parameters
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from scipy.stats import uniform,randint
import numpy as np
param_dist={ 
            'C':uniform(0,1),
             'penalty':['l1','l2'],
             'solver':['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
             'random_state':[42]
            }

instance_opt_f3a=RandomizedSearchCV(LogisticRegression(n_jobs=-1),param_dist,cv=5)

#fit the model with randomization
instance_opt_f3a.fit(X_train_f1,y_train_f1)
print("best_parameters found :",instance_opt_f3a.best_params_)
print("best_scores found :",instance_opt_f3a.best_score_)


#################################################################################################
instance_=LogisticRegression(max_iter=1000,C= 0.33635647132487345,penalty= 'l1', solver= 'liblinear',n_jobs=-1,random_state=42) 
pre_model_=instance_.fit(X_train_f1, y_train_f1)
prediction_=pre_model_.predict(X_test_f1)

#metrics
accuracy_score(y_test_f1,prediction_)  #86%; f1(macro avg)--0.79 !! f1score(wgt avg)--0.85
report_=classification_report(y_test_f1,prediction_)
#print("Classification Report:")
print(report_)

In [None]:
#Accuracy represents the number of correctly classified data instances over the total number of data instances
#Precision is the number of correct positive results divided by the number of positive results predicted by the classifier.
#Recall gives a measure of how accurately our model can identify the relevant data.
# f1-Score is used to measure a test’s accuracy. (harmonic mean between precision and recall )

In [1]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

In [None]:
matrix=confusion_matrix(y_test_f1,prediction_xg)
sns.heatmap(matrix,annot=True,fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
matrix

In [None]:
#xgboostclassifier
from xgboost import XGBClassifier
instance_xg=XGBClassifier()
pre_model_xg=instance_xg.fit(X_train_f1, y_train_f1)
joblib.dump(pre_model_xg,'final_model1')

['final_model1']

In [22]:
final_model=joblib.load('final_model1')
predicted_values=final_model.predict(X_test_f1)
predicted_values

array([0, 1, 0, ..., 0, 1, 0])

In [None]:
X_test_f1

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,...,country_Portugal,country_Puerto-Rico,country_Scotland,country_South,country_Taiwan,country_Thailand,country_Trinadad&Tobago,country_United-States,country_Vietnam,country_Yugoslavia
20326,0.013699,0.098373,0.400000,0.0,0.0,0.091837,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23861,0.465753,0.254659,0.800000,0.0,0.0,0.500000,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1445,0.452055,0.012335,0.666667,0.0,0.0,0.448980,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25856,0.287671,0.224359,0.800000,0.0,0.0,0.704082,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5874,0.273973,0.211129,0.066667,0.0,0.0,0.857143,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13449,0.123288,0.320246,0.200000,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10321,0.315068,0.044194,0.933333,0.0,0.0,0.397959,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12239,0.136986,0.271434,0.800000,0.0,0.0,0.602041,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25805,0.301370,0.161032,0.866667,0.0,0.0,0.724490,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
