In [72]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [73]:
df=pd.read_csv('adult_dataset.csv')

print(df.head(5))
df.columns


   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [74]:
# a. Data cleaning(Remove NA, ?, Negative values etc.)  

df.replace('?',pd.NA,inplace=True)

df.dropna(inplace=True)


In [75]:
# b. Error correcting(Outlier detection and removal)  

num_cols=df.select_dtypes(include=['int64','float64']).columns

for col in num_cols:
    low=df[col].quantile(0.01)
    high=df[col].quantile(0.99)
    df=df[(df[col]>=low) & (df[col]<=high)]

In [76]:
# c. Data Transformation

scaler=StandardScaler()
df[num_cols]=scaler.fit_transform(df[num_cols])

from sklearn.preprocessing import LabelEncoder

# Label encode all object (string) columns
label = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    if col != 'income':
        df[col] = label.fit_transform(df[col])


In [79]:
# d. Build Data model using regression and Naïve Bayes methods for prediction of income category (>=50k or <=50k) and compare accuracy Prediction. 

X=df.drop('income',axis=1)
y=df['income'].apply(lambda x:1 if x=='>50K' else 0)

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)


# Logistic Regression
lr=LogisticRegression(max_iter=1000)
lr.fit(X_train,y_train)
lr_pred=lr.predict(X_test)

# Naive Bayes
nb=GaussianNB()
nb.fit(X_train,y_train)
nb_pred=nb.predict(X_test)

print('Accuracy of Logistic Regression:', accuracy_score(y_test,lr_pred))
print('Accuracy of Naive Bayes:', accuracy_score(y_test,nb_pred))



Accuracy of Logistic Regression: 0.819268552491419
Accuracy of Naive Bayes: 0.8057758314593443
