In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder 
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import uniform,stats,chi2_contingency
import joblib 


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from sklearn.model_selection import KFold,RandomizedSearchCV,GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:

# Load the Adult census CSV from the author's local download folder.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
csv_path = r"C:\Users\Tippu\Downloads\adult_census\adult.csv"
data = pd.read_csv(csv_path)

data.sample(n=10)   # random peek at the rows (not rendered: not the last expression)
data.info()         # dtypes and non-null counts

# Snapshot of the frame taken before any column is removed.
data2 = data.copy()

# Remove the 'education' column from the working frame only.
data.drop(columns=['education'], inplace=True)
data.describe()

In [None]:
# Binarise the target: 1 when the label is exactly ' >50K' (leading space
# included), 0 for every other value.
data['salary'] = data['salary'].eq(' >50K').astype('int64')
data['salary'].value_counts()

# Alternative kept for reference (explicit replacement dictionary):
#   replacements = {' >50K': 1, ' <=50K': 0}
#   data['salary'] = data['salary'].replace(replacements)
# The target is imbalanced: more than 75% of the rows are in the <=50K class.

In [None]:
# Target against weekly working hours.
fig = px.scatter(data, x='hours-per-week', y='salary')
fig.show()

#data[data['country']==' ?']

# Frequency of each age value, drawn as a bar chart.
a = data['age'].value_counts()
fig = px.bar(a)
fig.show()
# Observation: the age distribution is slightly right skewed.

In [None]:
# (disabled) histogram of every numeric column:
#   for col in num_data.columns:
#       fig = px.histogram(num_data[col])
#       fig.show()

# Histograms of 'country', 'workclass' and 'occupation'.
c = ['country', 'workclass', 'occupation']
for col in c:
    fig = px.histogram(data[col])
    fig.show()

# Absolute and relative frequency of each country label, then the label set.
data['country'].value_counts()
data['country'].value_counts(normalize=True)
data['country'].unique()

# Idea (disabled): impute ' ?' with 'other_nation', since it is only 1.7% of rows.
#   data['country'].replace(' ?','other_nation',inplace=True)
#   data['country'].unique()
data['country'].value_counts(normalize=True)  # ' ?' share: 1.7%

In [None]:
# One histogram per demographic column, in the same order as before.
for column_name in ['sex', 'race', 'relationship', 'marital-status']:
    fig = px.histogram(data, column_name)
    fig.show()

# Distribution of the census sampling weight, followed by its maximum.
fnlwgt_hist = px.histogram(data['fnlwgt'], nbins=50)
fnlwgt_hist.show()
data['fnlwgt'].max()

In [None]:
'''
Relationship and marital status (an individual's life circumstances) both refer to aspects of an
individual's personal life. Employers may consider marital status when offering benefits such as health
insurance, whereas relationship status may not have the same legal implications but could still 
influence an individual's lifestyle and priorities.
individuals who are married or in long-term relationships may be perceived as more stable
and reliable, both partners contribute to the household income potentially higher living standards. 
work-life balance ,financial responsibilities may influence their career choices ,
Gender dynamics within relationships can also play a role in salary differences. 


Feature importance techniques, such as  permutation importance or model-specific feature importance, 
can help identify the influence of these variables on salary predictions.

 the effect of marital status on salary may differ based on gender, education level, or industry.
 Interaction terms can be included in the model to capture these complex relationships.
 : After training the predictive model, it's essential to interpret the results to understand
 how relationship status and marital status contribute to salary predictions. 
  Techniques such as partial dependence plots, SHAP (SHapley Additive exPlanations) values,or coefficient
  analysis in linear models can help interpret the impact of these variables on salary predictions
  
  
Accounting for Bias: Analyzing the influence of relationship status and marital status on salary 
predictions should consider potential biases. It's crucial to ensure that the predictive model does not 
perpetuate or amplify existing biases related to these variables. Techniques such as fairness-aware
machine learning or bias mitigation strategies can be applied to address bias in the predictive model.

maintaining the categorical nature of the variable
  
'''

In [None]:
'''  
The output of the fit_transform() method of the OneHotEncoder is a sparse matrix by default,
unless you specify sparse_output=False (spelled sparse=False before scikit-learn 1.2). Sparse matrices are often used for efficiency when dealing
with large datasets with many categories, as they only store non-zero values.
#when you create a DataFrame directly from the output of fit_transform(), it retains the
sparse matrix format, resulting in a DataFrame with a single column containing the sparse 
matrix objects.

#To convert the sparse matrix to a dense array and create a DataFrame with multiple columns
representing the one-hot encoded features, you can use the toarray() method of the sparse 
matrix. '''

In [None]:
# Split the frame into model inputs (x) and the binary salary target (y).
x = data.drop('salary', axis=1)
y = data['salary']
x.info()

# Split the inputs by dtype. The two selections are exact complements
# (object vs. non-object), so a column can never end up in both frames --
# the previous `exclude='int64'` would also have kept float/bool columns
# in the "categorical" frame.
num_data = x.select_dtypes(exclude='object')
print(num_data.columns)

cat_data = x.select_dtypes(include='object')
print(cat_data.columns)

num_data.sample(5)

# Number of ' ?' placeholders per categorical column. Compare first, then
# sum the booleans: masking with the comparison and calling isnull() counts
# the cells that are NOT ' ?'.
print((cat_data == ' ?').sum())
cat_data.sample(5)

# Encoding plan noted by the author:
#   workclass, country, occupation -> leave-one-out / target encoding
#   sex, race, marital-status (7), relationship (6) -> one-hot encoding
data['country'].unique()
data['relationship'].unique()

# Print the distinct labels of every categorical column.
for col in cat_data.columns:
    c = cat_data[col].unique()
    print(c)

In [None]:
# Cardinalities recorded from an earlier run:
#   occupation 15, relationship 6, marital-status 7, race 5
# Report the number of distinct labels for every categorical column.
for col in cat_data.columns:
    a = data[col].nunique()
    print(f'{col} unique values :  {a}')

In [None]:
# probplot is a public function of scipy.stats; import it from there instead
# of going through the legacy `scipy.stats.stats` submodule bound to `stats`
# at the top of the notebook, which does not provide it.
from scipy.stats import probplot

# Missing values per numeric column. (Masking with `num_data == ' ?'` and
# then calling isnull() reported the row count for every column, because the
# mask turns every non-matching cell into NaN.)
print(num_data.isnull().sum())

# A QQ plot compares two probability distributions by plotting their
# quantiles against each other. Here each numeric column is compared with a
# normal distribution.
for col in num_data.columns:
    probplot(num_data[col], dist='norm', plot=plt)
    # Include the column name so the figures can be told apart.
    plt.title(f'QQ Plot - Normal Distribution ({col})')
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel('Sample Quantiles')
    plt.grid(True)
    plt.show()

# Author's observation: most numeric columns are close to normally distributed.

In [None]:
# Box plot of every numeric column (iterating a DataFrame yields its column labels).
for col in num_data:
    fig = px.box(num_data[col])
    fig.show()

In [None]:
# One-hot encode the low-cardinality categorical columns, dropping the first
# level of each to avoid redundant indicator columns.
encoder_instance = OneHotEncoder(drop='first')

# Columns to be encoded.
encoding_variables = ['sex', 'race', 'marital-status', 'relationship']

# Fit the standalone encoder as well: it is used below to produce the output
# column names.
encoder_instance.fit(x[encoding_variables])

# Pipeline wrapped in a ColumnTransformer so the step can be persisted and
# re-applied later.
pipe = Pipeline(steps=[('onehot', encoder_instance)])
preprocessor = ColumnTransformer(transformers=[('cat_encoding1', pipe, encoding_variables)])

# Fit the preprocessing step.
fit_preprocessor = preprocessor.fit(x[encoding_variables])

# dump the preprocessing model
#joblib.dump(fit_preprocessor, "1hot_processor")

# Transform the data. ColumnTransformer may return either a sparse matrix or
# a dense array, so only densify when needed.
transformed_data = fit_preprocessor.transform(x[encoding_variables])
if hasattr(transformed_data, 'toarray'):
    transformed_data = transformed_data.toarray()

# Output column names, e.g. '<column>_<level>'.
feature_names = encoder_instance.get_feature_names_out(encoding_variables)

# Keep x's row index so the concat below aligns row-for-row even when the
# index is not the default 0..n-1 range.
encoded_data = pd.DataFrame(transformed_data, columns=feature_names, index=x.index)

# Replace the original four columns with their encoded indicators.
enc_data = pd.concat([cat_data, encoded_data], axis=1).drop(columns=encoding_variables)
enc_data

In [None]:
# Rescale every numeric column with a MinMaxScaler wrapped in a one-step pipeline.
scale_instance = MinMaxScaler()
scale_pipeline = Pipeline(steps=[('scale_num', scale_instance)])
scaled_fit_model = scale_pipeline.fit(num_data)

# To persist the fitted pipeline:
#joblib.dump(scaled_fit_model,'scale_num')

# Apply the fitted scaler and put the result back into a labelled frame.
scaled_values = scaled_fit_model.transform(num_data)
scaled_data = pd.DataFrame(scaled_values, columns=num_data.columns)
scaled_data.describe()

In [None]:
# Combine scaled numerics, encoded categoricals and the target side by side.
# workclass, occupation and country are still raw strings at this point.
a1 = pd.concat([scaled_data, enc_data, y], axis=1)

country_hist = px.histogram(a1['country'])
country_hist.show()

In [None]:
# Chi-squared test of independence between pairs of categorical features;
# p < 0.05 indicates that the two features are associated.
# Related measures noted by the author:
#   tetrachoric correlation -- for binary categorical variables
#   polychoric correlation  -- for ordinal categorical variables
feature_pairs = [
    ('occupation', 'workclass'),
    ('occupation', 'country'),
    ('country', 'workclass'),
]
for row_feature, col_feature in feature_pairs:
    contingency_table = pd.crosstab(a1[row_feature], a1[col_feature])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print("chi_squared statistic,pvalue", chi2, p)

# Author's reading: a p-value of 0 is strong evidence against independence,
# i.e. country, occupation and workclass are strongly associated with one another.

In [None]:
# Row labels that carry the ' ?' placeholder in each of the three columns.
d_indexes1 = a1[a1['workclass'] == ' ?'].index
d_indexes2 = a1[a1['occupation'] == ' ?'].index
d_indexes3 = a1[a1['country'] == ' ?'].index

# A row with ' ?' in several columns appears in several of the indexes above;
# .unique() keeps each label once (first-appearance order), so no row is
# selected twice.
d = d_indexes1.append([d_indexes2, d_indexes3]).unique()

# Rows that contain at least one ' ?'. De-duplicating the labels (instead of
# calling drop_duplicates on the selected rows) keeps distinct records that
# merely happen to share identical values. .copy() makes the frame safe to
# modify in later cells.
data_qc = a1.loc[d].copy()
assert data_qc.index.is_unique

# Rows without any ' ?'.
data_wc = a1.drop(d)

# Variance of the (unscaled) numeric columns, shown for reference.
num_data.var()

In [None]:
# a3: the three raw categorical columns that still need handling.
# .copy() detaches it from data_wc, so columns can be added to a3 later
# without a SettingWithCopyWarning.
a3 = data_wc[['workclass', 'occupation', 'country']].copy()

# a2: everything else (scaled numerics, one-hot indicators and salary).
a2 = data_wc.drop(columns=a3.columns)
a2  #encoded data without the country,occupation ,workclass

In [None]:
# Hold out 20% of the ' ?'-free rows; the target here is `country`.
# NOTE(review): the split is stratified on `occupation`, not on the target --
# confirm this is intentional.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    a2,
    a3['country'],
    test_size=0.2,
    random_state=42,
    stratify=a3['occupation'],
)

In [None]:
#approach 1: decision tree predicting `country`

# --- Baseline tree with default hyper-parameters -------------------------
instance4 = DecisionTreeClassifier()
pre_model4 = instance4.fit(X_train2, y_train2)
prediction4 = pre_model4.predict(X_test2)

print("accuracy:", accuracy_score(y_test2, prediction4))  # 86% in the author's run
report4 = classification_report(y_test2, prediction4)
print(report4)

# --- Exhaustive grid search ----------------------------------------------
# max_features only accepts an int, a float, 'sqrt', 'log2' or None; the
# literal strings 'int' and 'float' are not valid values and made every fit
# that used them fail, so they are left out of the grid.
param_dist = {'criterion': ['gini', 'entropy', 'log_loss'],
              'splitter': ['best', 'random'],
              'min_samples_split': [2, 5, 7, 11, 15, 25, 50],
              'min_samples_leaf': [1, 3, 5, 7, 10, 15],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0, 21, 42, 61, 100, 250]
              }

instance_opt4 = GridSearchCV(DecisionTreeClassifier(), param_dist, cv=5)
instance_opt4.fit(X_train2, y_train2)

print("best_scores found :", instance_opt4.best_score_)
print("best_parameters found :", instance_opt4.best_params_)
# recorded earlier: max_features 'sqrt', min_samples_leaf 15, min_samples_split 2,
# random_state 61, splitter 'best'

# --- Tree rebuilt from the first recorded parameter set ------------------
instance5 = DecisionTreeClassifier(criterion='gini', max_features='sqrt', min_samples_leaf=15,
                                   min_samples_split=2, random_state=61, splitter='best')
pre_model5 = instance5.fit(X_train2, y_train2)
# Evaluate the model that was just fitted (pre_model5, not the baseline pre_model4).
prediction5 = pre_model5.predict(X_test2)

print("accuracy:", accuracy_score(y_test2, prediction5))
report5 = classification_report(y_test2, prediction5)
print(report5)

print("best_parameters found :", instance_opt4.best_params_)

# --- Tree rebuilt from the second recorded parameter set -----------------
instance4 = DecisionTreeClassifier(criterion='gini', max_features='log2', min_samples_leaf=10,
                                   min_samples_split=5, splitter='best', random_state=0)
pre_model4 = instance4.fit(X_train2, y_train2)
prediction4 = pre_model4.predict(X_test2)

print("accuracy:", accuracy_score(y_test2, prediction4))
report4 = classification_report(y_test2, prediction4)
print(report4)

# Author's note: both tuned trees give similar results (about 92% accuracy)
# but are weak on the macro-averaged scores.


In [None]:
## Distance-weighted KNN classifier for `country`
k = 5  # number of neighbours
knn_classifier_final = KNeighborsClassifier(n_neighbors=k, weights='distance', p=2, n_jobs=-1)

# Fit on the training split, then score on the held-out split.
knn_classifier_final.fit(X_train2, y_train2)
prediction2 = knn_classifier_final.predict(X_test2)

accuracy = accuracy_score(y_test2, prediction2)
print("Accuracy:", accuracy)  # recorded: 91.32%; f1 macro-avg 0.08; weighted-avg 0.89

report1 = classification_report(y_test2, prediction2)
print(report1)

In [None]:
# Predict `country` for the rows where it is ' ?' (582 rows in the author's run).
missing_country = data_qc['country'] == ' ?'
d1 = data_qc[missing_country]
d2 = d1.drop(['workclass', 'occupation', 'country'], axis=1)

# Predictions, labelled with the row index they belong to. (The previous
# `a.set_index(d1.index)` returned a new frame that was thrown away.)
a = pd.DataFrame(index=d1.index, columns=['country'])

# Guard against re-running the cell after the placeholders have already been
# replaced: predicting on zero rows would raise.
if not d1.empty:
    a['country'] = knn_classifier_final.predict(d2)
    # Boolean-mask assignment: one predicted label per ' ?' row, in row order.
    data_qc.loc[missing_country, 'country'] = a['country'].to_numpy()

In [None]:
'''
a1['country'].unique()
note :
    
data_qc.loc[data_qc['country']==' ?']=a.values   #a.values is ndarray type 

#ValueError: "Must have equal len keys and value when setting with an ndarray" -- raised when the column name is missing after the boolean mask;
# suggests that the length of the keys (indices) and the length of the values
# (array from a.values) do not match when you're trying to replace values in data_qc.


#When you use .loc[] to select rows in a DataFrame, you need to ensure that 
# the replacement values have the same length and alignment as the selected 
# rows. '''

In [None]:
'''
checking the country column:
d3=data_qc[data_qc['country']!=' ?']
d3_y=pd.DataFrame()
d3_y['country']=d3['country']
s2=d3.drop(['workclass','occupation','country'],axis=1)
s2_predicted=knn_classifier_final.predict(s2)
accuracy = accuracy_score(d3_y['country'], s2_predicted)
print("Accuracy:", accuracy) #91.79%
report1=classification_report(d3_y['country'], s2_predicted)
print(report1) 

#performance is good for the major class and it is underperformed for less record class which is not good
'''


In [None]:
# Logistic regression predicting `country` (uses the split created earlier).

# --- Baseline ------------------------------------------------------------
instance3 = LogisticRegression()

pre_model3 = instance3.fit(X_train2, y_train2)
prediction3 = pre_model3.predict(X_test2)
print("accuracy:", accuracy_score(y_test2, prediction3))  # 92% in the author's run

report3 = classification_report(y_test2, prediction3)
print(report3)

# --- Grid search ---------------------------------------------------------
# max_iter is the maximum number of iterations the solver may take to converge.
# NOTE(review): not every penalty/solver pair in this grid is a valid
# combination; invalid pairs fail to fit and are scored as NaN by GridSearchCV.
param_dist = {'penalty': ['l1', 'l2', 'elasticnet'],
              'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
              'max_iter': [50, 100, 250, 500, 1000],
              }

instance_opt3 = GridSearchCV(LogisticRegression(n_jobs=-1), param_dist, cv=5)
instance_opt3.fit(X_train2, y_train2)
#91.45%

print("best_parameters found :", instance_opt3.best_params_)
print("best_scores found :", instance_opt3.best_score_)
# recorded earlier: {'C': 0.8751328233611145, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}

## Evaluate the model on the test set
print("Test set score with best parameters: ", instance_opt3.score(X_test2, y_test2))

# --- Model rebuilt from the recorded parameters --------------------------
# other recorded set: C=0.8751328233611145, max_iter=100, penalty='l2', solver='saga'
instance3 = LogisticRegression(max_iter=50, penalty='l1', solver='saga', n_jobs=-1)
pre_model3 = instance3.fit(X_train2, y_train2)
prediction3 = pre_model3.predict(X_test2)

print("accuracy:", accuracy_score(y_test2, prediction3))  # 92% in the author's run
# Report on this model's own predictions (prediction3); the KNN predictions
# (prediction2) were used here by mistake.
report3 = classification_report(y_test2, prediction3)
print(report3)

In [None]:
# Out-of-fold target encoding of `country` on data_wc (the rows without ' ?').
# For each of the 5 shuffled folds the encoder is fitted on the training part
# (country -> salary) and applied to the held-out part only.
kf=KFold(n_splits=5,shuffle=True,random_state=42)
encoded_fold_values=[]
# Initialise the target encoder.
target_encoder=TargetEncoder(cols=['country'],smoothing=0.1)
for train_index,test_index in kf.split(data_wc):
    
    train_data,val_data=data_wc.iloc[train_index],data_wc.iloc[test_index]
    target_encoder.fit(train_data['country'],train_data['salary'])
    
    # Apply the encoder fitted on the training part to the held-out part.
    val_encoded=target_encoder.transform(val_data['country'])
    
    # Fill any value the encoder left missing with the training-part mean of
    # the target (intended for categories unseen during fit).
    global_mean = train_data['salary'].mean()
    val_encoded.fillna(global_mean, inplace=True)
    
    # Store this fold's encoded values.
    encoded_fold_values.append(val_encoded)
# Concatenate the encoded values from every fold into a single DataFrame.
all_encoded_values=pd.concat(encoded_fold_values)

# NOTE(review): both `by` and `level` are passed to groupby. The result is
# concatenated with a2 along the row index below, so it is presumably one
# encoded value per row rather than one per category -- confirm.
final_scores=all_encoded_values.groupby(by='country',level=-1).mean()
final_scores
# The encoder is fitted inside each fold on the training part only, which is
# meant to prevent target leakage into the encoded values.

# Encoded data for data_wc, still without workclass and occupation.
hnd_data=pd.concat([a2,final_scores],axis=1)
hnd_data

In [None]:
#To handle rare or unseen categories, smoothing techniques are often applied in target encoding
#target encoding implementation and ensure that it is properly isolated within each fold of the cross-validation process. 
# Additionally, you may need to adjust parameters such as smoothing factor or regularization strength to control the number of unique encoded values generated.

''' k-nearest neighbors (KNN) imputation or predictive modeling.encode them separately from other categories
Apply target encoding with smoothing techniques to handle potential overfitting and reduce the impact of rare categories.
Smoothing techniques like Laplace smoothing (additive smoothing) or James-Stein estimator
can help mitigate the risk of overfitting by adding a small value to the frequency counts of each category.
one-hot encoding, ordinal encoding, or binary encoding.'''


In [None]:
# Next target: `workclass` (needed to replace its ' ?' values).
# 80/20 split of the ' ?'-free rows, stratified on the target itself.
X_traina, X_testa, y_traina, y_testa = train_test_split(
    hnd_data,
    a3['workclass'],
    test_size=0.2,
    random_state=42,
    stratify=a3['workclass'],
)

In [None]:

                    # baseline model


# Baseline logistic regression for `workclass`.
# recorded: 74% accuracy; f1 macro avg 0.13; f1 weighted avg 0.63
instance1 = LogisticRegression()
instance1.fit(X_traina, y_traina)
pre_model = instance1          # fit() returns the estimator itself
prediction = pre_model.predict(X_testa)

# Metrics.
accuracy_score(y_testa, prediction)
report_a = classification_report(y_testa, prediction)
print(report_a)

from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

# ---- search for the best parameters ----
param_dist = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'max_iter': [50, 100, 250, 500, 1000, 2500],
}

base_logreg = LogisticRegression(random_state=42, n_jobs=-1)
instance_opta = GridSearchCV(base_logreg, param_dist, cv=5)
instance_opta.fit(X_traina, y_traina)

print("best_parameters found :", instance_opta.best_params_)
print("best_scores found :", instance_opta.best_score_)
# recorded: {'max_iter': 50, 'penalty': 'l2', 'solver': 'liblinear'}, best score 0.738447536981248

# Score of the refitted best estimator on the held-out split.
print("Test set score with best parameters: ", instance_opta.score(X_testa, y_testa))

# ---- model rebuilt from the parameters found by the grid search ----
instance1 = LogisticRegression(
    max_iter=50,
    penalty='l2',
    solver='liblinear',
    n_jobs=-1,
    random_state=42,
)
instance1.fit(X_traina, y_traina)
pre_model = instance1
prediction = pre_model.predict(X_testa)

# Metrics. recorded: 74%; 4 classes scored, the other 3 have zero scores;
# f1 macro avg 0.13; f1 weighted avg 0.63
accuracy_score(y_testa, prediction)
report_a = classification_report(y_testa, prediction)
print(report_a)

In [None]:

# Exhaustive grid search for a decision tree predicting `workclass`.
# max_features only accepts an int, a float, 'sqrt', 'log2' or None; the
# literal strings 'int' and 'float' are not valid values and made every fit
# that used them fail, so they are left out of the grid.
param_dist = {'criterion': ['gini', 'entropy', 'log_loss'],
              'splitter': ['best', 'random'],
              'min_samples_split': [2, 5, 7, 11, 15, 25, 50],
              'min_samples_leaf': [1, 3, 5, 7, 10, 15],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0, 21, 42, 61, 100, 250]
              }

instance_optb = GridSearchCV(DecisionTreeClassifier(), param_dist, cv=5)

# Fit every parameter combination with 5-fold cross-validation.
instance_optb.fit(X_traina, y_traina)

print("best_scores found :", instance_optb.best_score_)
print("best_parameters found :", instance_optb.best_params_)
# recorded earlier (74%): max_features 'sqrt', min_samples_leaf 10,
# min_samples_split 50, random_state 100, splitter 'random'

In [None]:
# Trial 1: hand-picked tree parameters.
print('trail1')
instanceb = DecisionTreeClassifier(
    criterion='gini',
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=2,
    random_state=42,
    splitter='best',
)
instanceb.fit(X_traina, y_traina)
pre_modelb = instanceb         # fit() returns the estimator itself
predictionb = pre_modelb.predict(X_testa)

predictionb
# recorded: 73%; 6 classes scored and 1 has zero scores;
# f1 macro avg 0.17; f1 weighted avg 0.65
accuracy_score(y_testa, predictionb)

reportb = classification_report(y_testa, predictionb)
print(reportb)

# Trial 2 (disabled): the grid-search parameters
#   criterion='gini', max_features='sqrt', min_samples_leaf=10,
#   min_samples_split=50, random_state=100, splitter='random'
# recorded: 73%; 4 classes scored, the other 3 have zero scores;
# f1 macro avg 0.12; f1 weighted avg 0.63
#
# Trial 1 shows comparatively better results.


In [None]:

# KNN for `workclass` (uses the split created earlier).

# ---- baseline with default parameters (recorded: 71%) ----
instancec = KNeighborsClassifier()
instancec.fit(X_traina, y_traina)
pre_modelc = instancec         # fit() returns the estimator itself
predictionc = pre_modelc.predict(X_testa)

# Metrics. recorded: 74%; f1 macro avg 0.18; f1 weighted avg 0.65
accuracy_score(y_testa, predictionc)
reportc = classification_report(y_testa, predictionc)
print(reportc)

# TODO: set the zero_division parameter of classification_report.

# ---- grid search to improve the model ----
param_dist = {
    'weights': ['distance', 'uniform'],
    'n_neighbors': [5, 10, 15, 25],
    'p': [1, 2],
}
instance_optc = GridSearchCV(KNeighborsClassifier(n_jobs=-1), param_dist, cv=5)
instance_optc.fit(X_traina, y_traina)
print("best_parameters found :", instance_optc.best_params_)
print("best_scores found :", instance_optc.best_score_)

# Score of the refitted best estimator on the held-out split.
print("Test set score with best parameters: ", instance_optc.score(X_testa, y_testa))
# recorded best parameters: {'n_neighbors': 25, 'p': 1, 'weights': 'uniform'} (74%)

# ---- model rebuilt from the recorded best parameters ----
k = 25  # number of neighbours
knn_classifier = KNeighborsClassifier(n_neighbors=k, weights='uniform', p=1, n_jobs=-1)
knn_classifier.fit(X_traina, y_traina)
predictionc = knn_classifier.predict(X_testa)

# recorded: 74%; 3 classes scored, the others mostly zero;
# f1 macro avg 0.14; f1 weighted avg 0.64
accuracy_C = accuracy_score(y_testa, predictionc)
print("Accuracy:", accuracy_C)

reportc = classification_report(y_testa, predictionc)
print(reportc)

In [None]:
# Polynomial-kernel SVC for `workclass`.
instanceb_svc = SVC(kernel='poly', gamma=0.1, C=0.05)
instanceb_svc.fit(X_traina, y_traina)
pre_model_Svc = instanceb_svc  # fit() returns the estimator itself
prediction_svc = pre_model_Svc.predict(X_testa)

# recorded: 74%; only 1 class has non-zero scores, the other 6 are absent;
# f1 macro avg 0.12; f1 weighted avg 0.63
accuracy_score(y_testa, prediction_svc)

report_svc = classification_report(y_testa, prediction_svc)
print(report_svc)

In [None]:
# Out-of-fold target encoding of `country` on data_qc (the rows that had a
# ' ?'), so that `workclass` can be predicted for them.
kf1 = KFold(n_splits=5, shuffle=True, random_state=42)

encoded_fold_values1 = []
# Initialise the target encoder.
target_encoder1 = TargetEncoder(cols=['country'], smoothing=0.1)

# Use the splitter defined in this cell (kf1), not `kf` left over from an
# earlier cell.
for train_index, test_index in kf1.split(data_qc):

    train_data1, val_data1 = data_qc.iloc[train_index], data_qc.iloc[test_index]

    target_encoder1.fit(train_data1['country'], train_data1['salary'])

    # Apply the encoder fitted on the training part to the held-out part.
    val_encoded1 = target_encoder1.transform(val_data1['country'])

    # Fill any value the encoder left missing with THIS fold's training mean
    # of the target. (`global_mean` from the data_wc cell was used here by
    # mistake, leaving `global_mean1` unused.)
    global_mean1 = train_data1['salary'].mean()
    val_encoded1 = val_encoded1.fillna(global_mean1)

    # Store this fold's encoded values.
    encoded_fold_values1.append(val_encoded1)

# Concatenate the encoded values from every fold into a single DataFrame.
all_encoded_values1 = pd.concat(encoded_fold_values1)
# NOTE(review): both `by` and `level` are passed to groupby; the result is
# later concatenated along the row index, so it is presumably one value per
# row -- confirm.
final_scores1 = all_encoded_values1.groupby(by='country', level=-1).mean()
final_scores1

In [None]:
# Build the model input for data_qc: drop the raw categorical columns and
# attach the target-encoded country values.
d_qc = data_qc.drop(columns=['country', 'occupation', 'workclass'])
data_qc1 = pd.concat([d_qc, final_scores1], axis=1)

# Rows whose workclass is still the ' ?' placeholder.
w_indexes = data_qc[data_qc['workclass'] == ' ?'].index
pred_data = data_qc1.loc[w_indexes]
pred_data

# Predict the missing workclass labels with the decision tree (pre_modelb).
predict_workclass = pd.DataFrame({'workclass': pre_modelb.predict(pred_data)})
predict_workclass

# Replace the ' ?' placeholders with the predicted labels.
data_qc.loc[data_qc['workclass'] == ' ?', 'workclass'] = predict_workclass.values

data_qc

In [None]:
# Out-of-fold target encoding of `workclass` on data_wc, added to the
# modelling frame as an extra feature.
kf1 = KFold(n_splits=5, shuffle=True, random_state=42)

encoded_fold_values2 = []

# Initialise the target encoder.
target_encoder2 = TargetEncoder(cols=['workclass'], smoothing=0.1)

# Use the splitter defined in this cell (kf1), not `kf` left over from an
# earlier cell.
for train_index, test_index in kf1.split(data_wc):

    train_data2, val_data2 = data_wc.iloc[train_index], data_wc.iloc[test_index]

    target_encoder2.fit(train_data2['workclass'], train_data2['salary'])

    # Apply the encoder fitted on the training part to the held-out part.
    val_encoded2 = target_encoder2.transform(val_data2['workclass'])

    # Fill any value the encoder left missing with THIS fold's training mean
    # of the target. (The stale `global_mean` from another cell was used here
    # by mistake, leaving `global_mean2` unused.)
    global_mean2 = train_data2['salary'].mean()
    val_encoded2 = val_encoded2.fillna(global_mean2)

    # Store this fold's encoded values.
    encoded_fold_values2.append(val_encoded2)

# Concatenate the encoded values from every fold into a single DataFrame.
all_encoded_values2 = pd.concat(encoded_fold_values2)
# NOTE(review): both `by` and `level` are passed to groupby; the result is
# concatenated along the row index below, so it is presumably one value per
# row -- confirm.
final_scores2 = all_encoded_values2.groupby(by='workclass', level=-1).mean()
final_scores2

# Append the encoded workclass column to hnd_data.
hnd_data2 = pd.concat([hnd_data, final_scores2], axis=1)
hnd_data2

In [None]:
# 80/20 split for predicting `occupation`, stratified on the target itself.
X_train2a, X_test2a, y_train2a, y_test2a = train_test_split(
    hnd_data2,
    a3['occupation'],
    test_size=0.2,
    random_state=42,
    stratify=a3['occupation'],
)

In [None]:
 # baseline model
# Three untuned baselines for `occupation`: logistic regression, KNN, decision tree.

# Logistic regression.
# recorded: 32.26%; f1 macro avg 0.16; f1 weighted avg 0.26; 3/14 classes with zero scores
instance_2a = LogisticRegression()
instance_2a.fit(X_train2a, y_train2a)
pre_model2a = instance_2a      # fit() returns the estimator itself
prediction2a = pre_model2a.predict(X_test2a)

accuracy_score(y_test2a, prediction2a)
report_2a = classification_report(y_test2a, prediction2a)
print(report_2a)

# K-nearest neighbours.
# recorded: 27%; f1 macro avg 0.18; f1 weighted avg 0.25; 1/14 classes with zero scores
instance_2b = KNeighborsClassifier()
instance_2b.fit(X_train2a, y_train2a)
pre_model2b = instance_2b
prediction2b = pre_model2b.predict(X_test2a)

accuracy_score(y_test2a, prediction2b)
report_2b = classification_report(y_test2a, prediction2b)
print(report_2b)

# Decision tree.
# recorded: 23%; f1 macro avg 0.18; f1 weighted avg 0.24; 1/14 classes with zero scores
instance_2c = DecisionTreeClassifier()
instance_2c.fit(X_train2a, y_train2a)
pre_model2c = instance_2c
prediction2c = pre_model2c.predict(X_test2a)

accuracy_score(y_test2a, prediction2c)
report_2c = classification_report(y_test2a, prediction2c)
print(report_2c)


In [None]:
# Grid search over logistic-regression settings for `occupation`.
param_dist = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'max_iter': [50, 100, 250, 500, 1000, 2500],
}

occupation_logreg = LogisticRegression(random_state=42, n_jobs=-1)
instance_opt2a = GridSearchCV(occupation_logreg, param_dist, cv=5)

# Fit every parameter combination with 5-fold cross-validation.
instance_opt2a.fit(X_train2a, y_train2a)

print("best_parameters found :", instance_opt2a.best_params_)
print("best_scores found :", instance_opt2a.best_score_)
# NOTE(review): the figures below look copied from the workclass search -- confirm.
#   best_parameters found : {'max_iter': 50, 'penalty': 'l2', 'solver': 'liblinear'}
#   best_scores found : 0.738447536981248

# Score of the refitted best estimator on the held-out split.
print("Test set score with best parameters: ", instance_opt2a.score(X_test2a, y_test2a))

In [None]:
# Logistic regression rebuilt from the recorded best parameters
# ('max_iter': 50, 'penalty': 'l2', 'solver': 'lbfgs'; 32.32%).
# recorded: 32.26%; f1 macro avg 0.16; f1 weighted avg 0.26 (improved); 4/14 classes with zero scores
instance_2a = LogisticRegression(max_iter=50, penalty='l2', solver='lbfgs')
instance_2a.fit(X_train2a, y_train2a)
pre_model2a = instance_2a      # fit() returns the estimator itself
# Precision and recall for some classes improved noticeably over the baseline.
prediction2a = pre_model2a.predict(X_test2a)

# Metrics.
accuracy_score(y_test2a, prediction2a)
report_2a = classification_report(y_test2a, prediction2a)
print(report_2a)


In [None]:
# Exhaustive grid search for a decision tree predicting `occupation`.
# max_features only accepts an int, a float, 'sqrt', 'log2' or None; the
# literal strings 'int' and 'float' are not valid values and made every fit
# that used them fail, so they are left out of the grid.
param_dist = {'criterion': ['gini', 'entropy', 'log_loss'],
              'splitter': ['best', 'random'],
              'min_samples_split': [2, 5, 7, 11, 15, 25, 50],
              'min_samples_leaf': [1, 3, 5, 7, 10, 15],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0, 21, 42, 61, 100, 250]
              }

instance_opt2b = GridSearchCV(DecisionTreeClassifier(), param_dist, cv=5)

# Fit every parameter combination with 5-fold cross-validation.
instance_opt2b.fit(X_train2a, y_train2a)

print("best_scores found :", instance_opt2b.best_score_)
print("best_parameters found :", instance_opt2b.best_params_)
# recorded earlier:
#   best_scores found : 0.3144351172083855
#   best_parameters found : {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 10,
#                            'min_samples_split': 25, 'random_state': 61, 'splitter': 'random'}



In [None]:
# Decision tree rebuilt from the recorded grid-search parameters.
instance_2c = DecisionTreeClassifier(
    criterion='gini',
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=25,
    random_state=61,
    splitter='random',
)
instance_2c.fit(X_train2a, y_train2a)
pre_model2c = instance_2c      # fit() returns the estimator itself
# recorded: 31%; f1 macro avg 0.17; f1 weighted avg 0.27; 1/14 classes with zero scores
prediction2c = pre_model2c.predict(X_test2a)

# Metrics.
accuracy_score(y_test2a, prediction2c)
report_2c = classification_report(y_test2a, prediction2c)
print(report_2c)

In [None]:
# Grid search over KNN settings for `occupation`.
param_dist = {'weights': ['distance', 'uniform'],
              'n_neighbors': [5, 10, 15, 25],
              'p': [1, 2]
              }
instance_opt2c = GridSearchCV(KNeighborsClassifier(n_jobs=-1), param_dist, cv=5)
instance_opt2c.fit(X_train2a, y_train2a)
print("best_parameters found :", instance_opt2c.best_params_)
print("best_scores found :", instance_opt2c.best_score_)

# Score of the refitted best estimator on the held-out split.
print("Test set score with best parameters: ", instance_opt2c.score(X_test2a, y_test2a))

# KNN rebuilt from the chosen parameters. Fit and evaluate THIS estimator:
# the default-parameter baseline (instance_2b / pre_model2b) was fitted and
# scored here by mistake, so the tuned model was never actually used.
# (The figures recorded earlier -- 27%, f1 macro 0.18, weighted 0.25 -- came
# from the baseline.)
instance_2bb = KNeighborsClassifier(n_neighbors=25, p=1, weights='uniform')
pre_model2bb = instance_2bb.fit(X_train2a, y_train2a)
prediction2bb = pre_model2bb.predict(X_test2a)

# Metrics.
print("accuracy:", accuracy_score(y_test2a, prediction2bb))
report_2bb = classification_report(y_test2a, prediction2bb)
print(report_2bb)

In [None]:

# Support-vector classifier with a polynomial kernel.
# Recorded result of a previous run: 33% accuracy, macro avg F1 0.16, weighted avg F1 0.26, 4/14 (zeros).
instance_svc=SVC(kernel='poly',gamma=0.45,C=0.35)
instance_svc.fit(X_train2a, y_train2a)
prediction_svc=instance_svc.predict(X_test2a)

#metrics
# Printed explicitly; a bare expression mid-cell is evaluated and discarded.
print("Accuracy:",accuracy_score(y_test2a,prediction_svc))
report_svc=classification_report(y_test2a,prediction_svc)
print(report_svc)

In [None]:
#Label encoding the occupation column
from sklearn.preprocessing import LabelEncoder
label_encode=LabelEncoder()

# Encode the occupation names as integers; this was done because the training
# target is expected to be numeric for xgboost.
a3['occ_encoded']=label_encode.fit_transform(a3['occupation'])
#a3['occupation'].nunique() #14

#display the name and encoded value
# `classes_` holds the fitted labels and each label's position is its encoded
# value, so enumerating it prints the mapping once per class instead of once
# per row of `a3`.
for label,name in enumerate(label_encode.classes_):
    print(f"{name} -> {label}")


In [None]:
# Split dataset: 80/20, stratified on the label-encoded occupation so both
# splits keep the same class proportions.
X_train2aa, X_test2aa, y_train2aa, y_test2aa = train_test_split(
    hnd_data2,
    a3['occ_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=a3['occ_encoded'],
)

#xgboostclassifier with default hyper-parameters
# Recorded result of a previous run: 34% accuracy, macro avg F1 0.25, weighted avg F1 0.32, 1/14.
instance_xaa=XGBClassifier()
pre_model_xaa=instance_xaa.fit(X_train2aa, y_train2aa)
prediction_xaa=pre_model_xaa.predict(X_test2aa)

#metrics
# Printed explicitly; a bare expression mid-cell is evaluated and discarded.
print("Accuracy:",accuracy_score(y_test2aa,prediction_xaa))
report_xaa=classification_report(y_test2aa,prediction_xaa)
print(report_xaa)

In [None]:
#target encode the workclass variable and model the data

kf1=KFold(n_splits=5,shuffle=True,random_state=42)
encoded_fold_values3=[]

#initialise the target encoder
target_encoder3=TargetEncoder(cols=['workclass'],smoothing=0.1)

# Iterate over the splitter defined in this cell (`kf1`). The loop previously
# used `kf`, a name this cell never defines, so it only ran when another cell
# had left a `kf` in the kernel.
for train_index,test_index in kf1.split(data_qc):

    train_data3,val_data3=data_qc.iloc[train_index],data_qc.iloc[test_index]

    # fit on the training fold only, so the held-out rows never see their own target
    target_encoder3.fit(train_data3['workclass'],train_data3['salary'])

    #apply target encoder on the held-out fold using the trained encoder
    val_encoded3=target_encoder3.transform(val_data3['workclass'])

    # Handle unseen categories by filling with global mean
    global_mean3= train_data3['salary'].mean()
    val_encoded3.fillna(global_mean3, inplace=True)

    # store encoded values
    encoded_fold_values3.append(val_encoded3)

# concatenate the encoded values from each fold into a single DataFrame
all_encoded_values3=pd.concat(encoded_fold_values3)
# NOTE(review): both `by` and `level` are passed here; on a flat index pandas
# appears to group on the index level and ignore `by` -- confirm this is intended.
final_scores3=all_encoded_values3.groupby(by='workclass',level=-1).mean()

#concating the encoded values with the data to make prediction
data_qc2=pd.concat([data_qc1,final_scores3],axis=1)

#indexes and the ' ?' data for predicting the occupation variable
w_indexes=data_qc[data_qc['occupation']==' ?'].index
pred_data3=data_qc2.loc[w_indexes]
pred_data3

In [None]:
#predicting the missing occupation values (the ' ?' rows selected into pred_data3)
predict_occ=pd.DataFrame()
predict_occ['occupation']=pre_model_xaa.predict(pred_data3)

#inversing the label encoder values into their original form
# Pass the 1-D column rather than the whole one-column DataFrame: the encoder
# works on a 1-D array of labels.
predict_occ['occupation']=label_encode.inverse_transform(predict_occ['occupation'])
predict_occ

In [None]:
#replacing the ' ?' with predicted values
# Assign the predictions as a 1-D array (one value per selected row) instead of
# the 2-D `(n, 1)` array that `predict_occ.values` produces.
# NOTE(review): this mutates `data_qc` in place, so a second run of this cell
# finds no ' ?' rows left -- re-run from the cell that builds `data_qc`.
data_qc.loc[data_qc['occupation']==' ?','occupation']=predict_occ['occupation'].to_numpy()

# concatenate the imputed rows with the other part of the data (row-wise)
data_final=pd.concat([data_wc,data_qc],axis=0)
data_final

In [None]:
# Stack `data_wc` and `data_qc` row-wise into a single frame and display it.
data_f = pd.concat((data_wc, data_qc), axis='index')
data_f

In [None]:
# Keep only the three columns that went through missing-value handling.
data_f1 = data_f.loc[:, ['workclass', 'occupation', 'country']]
# Work on a copy so `data` itself is left untouched.
data1 = data.copy()

# Drop those same three columns from the copy.
data1_ = data1.drop(columns=['workclass', 'occupation', 'country'])

In [None]:

# Join the handled-missing-value columns back onto the remaining columns, side by side.
data_final = pd.concat((data1_, data_f1), axis='columns')



In [None]:
# Count the remaining ' ?' placeholders per column (all zeros means none are left).
# Summing the boolean comparison counts the matches; the earlier form masked the
# frame and summed the surviving values themselves, which is not a count.
(data_final == ' ?').sum()

In [None]:
# Persist the cleaned adult census frame to disk with joblib.
joblib.dump(data_final, filename='modified_adult_data')