In [1]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("adult.csv",header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income_category']
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income_category
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


**Data cleaning(Remove NA, ?, Negative values etc.)**

In [7]:
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
education-num      0
marital-status     0
occupation         0
relationship       0
race               0
sex                0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income_category    0
dtype: int64

In [8]:
(df.values == '?').sum()

0

In [9]:
(df.values == np.NaN).sum()

0

In [11]:
df = df.replace("?",np.nan)
df.dropna(inplace=True)

**Data Transformation**

In [12]:
df.shape

(32561, 15)

In [15]:
from sklearn.preprocessing import LabelEncoder


In [16]:
df['sex'].unique()

array([' Male', ' Female'], dtype=object)

In [55]:
df.loc[df['sex']==' Male','sex']=0
df.loc[df['sex']==' Female','sex']=1
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income_category
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,2,0,0,40,Cuba,0


In [56]:
df['income_category'].unique()

array([0, 1], dtype=object)

In [57]:
df.loc[df['income_category']==' <=50K','income_category']=0
df.loc[df['income_category']==' >50K','income_category']=1
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income_category
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,2,0,0,40,Cuba,0


**Error correcting(Outlier detection and removal)**

In [60]:
def outlier_remover(data,threshold=3):
  numeric_cols = data.select_dtypes(include=[np.number])
  zscore = np.abs((numeric_cols-numeric_cols.mean())/numeric_cols.std())
  out = zscore > 3
  data = data[~out.any(axis=1)]

  return data
df = outlier_remover(df)

In [61]:
df.shape

(27476, 15)

**Build Data model using regression and Naïve Bayes methods and compare accuracy
of benign and malignant tumors in Breast Cancer Dataset.**

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [62]:
x = df[['age','fnlwgt','education-num','sex','capital-gain','capital-loss','hours-per-week']]
y = df['income_category']

In [63]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42)


In [64]:
y_train = y_train.astype(int)


In [45]:
reg = LogisticRegression()
reg.fit(x_train,y_train)


In [46]:
pred_lr = reg.predict(x_test)

In [51]:
y_test = y_test.astype(int)

# Convert pred_lr to integer type if necessary
pred_lr = pred_lr.astype(int)

# Now calculate accuracy
print("Accuracy by Logistic Regression is: ", accuracy_score(y_test, pred_lr))

Accuracy by Logistic Regression is:  0.8133869706112414


In [52]:
naive = GaussianNB()
naive.fit(x_train,y_train)

In [53]:
pred_nb = naive.predict(x_test)

In [54]:
print("Accuracy by Naive Bayes is: ", accuracy_score(y_test, pred_nb))

Accuracy by Naive Bayes is:  0.8035534696614147
