In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [74]:
data = pd.read_csv('adult_dataset.csv')
data.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'Target']
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [75]:
data.shape

(48842, 15)

In [76]:
data.isna().sum().sum()

np.int64(0)

In [77]:
(data.values == '?').sum()

np.int64(6465)

In [78]:
data = data.replace('?', np.nan)
data.dropna(inplace=True)

In [79]:
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
Target            0
dtype: int64

In [80]:
data.shape

(45222, 15)

In [81]:
# data['Target'].unique()
data['marital-status'].unique()
# data['sex'].unique()

array(['Never-married', 'Married-civ-spouse', 'Widowed', 'Separated',
       'Divorced', 'Married-spouse-absent', 'Married-AF-spouse'],
      dtype=object)

In [82]:
data = data.replace(' <=50K', 1)
data = data.replace(' >50K', 2)

data = data.replace(' Male', 1)
data = data.replace(' Female', 2)

In [83]:
data['Target'].unique()

array(['<=50K', '>50K'], dtype=object)

In [84]:
data_t = data[['age', 'fnlwgt', 'sex', 'hours-per-week', 'Target']]
data_t.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,25,226802,Male,40,<=50K
1,38,89814,Male,50,<=50K
2,28,336951,Male,40,>50K
3,44,160323,Male,40,>50K
5,34,198693,Male,30,<=50K


## **Error correcting (Outlier Detection and Removal)**



In [85]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for column in data_t.columns:
    if data_t[column].dtype == 'object':
        data_t[column] = le.fit_transform(data_t[column])

data_t = data_t.astype(float)
data_t.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_t[column] = le.fit_transform(data_t[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_t[column] = le.fit_transform(data_t[column])


(45222, 5)

In [86]:
def remove_outliers_zscore(data_t, threshold=3):
  zscore = np.abs((data_t - data_t.mean()) / data_t.std())
  out = zscore > 3
  data_t = data_t[~out.any(axis=1)]

  return data_t

filtered_data = remove_outliers_zscore(data_t)

In [87]:
filtered_data.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,25.0,226802.0,1.0,40.0,0.0
1,38.0,89814.0,1.0,50.0,0.0
2,28.0,336951.0,1.0,40.0,1.0
3,44.0,160323.0,1.0,40.0,1.0
5,34.0,198693.0,1.0,30.0,0.0


In [88]:
filtered_data.shape

(43978, 5)

In [89]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [100]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Replace numeric values with strings
filtered_data.loc[filtered_data['Target'] == 1, 'Target'] = 'Y'
filtered_data.loc[filtered_data['Target'] == 2, 'Target'] = 'N'

# ✅ Step 2: Force all values in 'Target' to be strings
filtered_data['Target'] = filtered_data['Target'].astype(str)

# Step 3: Apply LabelEncoder
le = LabelEncoder()
filtered_data['Target'] = le.fit_transform(filtered_data['Target'])

# Step 4: View the classes
print(le.classes_)  # Output should be ['N' 'Y']


['0' 'Y']


  filtered_data.loc[filtered_data['Target'] == 1, 'Target'] = 'Y'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Target'] = filtered_data['Target'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Target'] = le.fit_transform(filtered_data['Target'])


In [101]:
filtered_data['Target'] = le.fit_transform(filtered_data['Target'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Target'] = le.fit_transform(filtered_data['Target'])


In [92]:
filtered_data.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,25.0,226802.0,1.0,40.0,0
1,38.0,89814.0,1.0,50.0,0
2,28.0,336951.0,1.0,40.0,1
3,44.0,160323.0,1.0,40.0,1
5,34.0,198693.0,1.0,30.0,0


In [93]:
filtered_data['Target'].unique()
data1 = filtered_data

In [94]:
X = data1.drop('Target', axis=1)
y = data1['Target']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [95]:
reg = LogisticRegression()
reg.fit(x_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [96]:
y_pred_reg = reg.predict(x_test)
print('Accuracy - Logistic Regression : ', accuracy_score(y_test, y_pred_reg))

Accuracy - Logistic Regression :  0.7514021524935577


In [97]:
navi = GaussianNB()
navi.fit(x_train, y_train)

In [98]:
y_pred_navi = navi.predict(x_test)
print('Accuracy - Naive Bayes : ', accuracy_score(y_test, y_pred_navi))

Accuracy - Naive Bayes :  0.7590571471881158


In [99]:

print('Accuracy - Logistic Regression : ', accuracy_score(y_test, y_pred_reg))
print('Accuracy - Naive Bayes : ', accuracy_score(y_test, y_pred_navi))

Accuracy - Logistic Regression :  0.7514021524935577
Accuracy - Naive Bayes :  0.7590571471881158
