In [12]:
#import important libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import warnings
warnings.filterwarnings('ignore')

In [13]:
#Reading file
df = pd.read_csv('classification_2.csv', names=[
    "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
    "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
    "Hours per week", "Country", "Target"],
                 sep=r'\s*,\s*',
                 engine='python',
                 na_values="?")

In [14]:
#getting shape of our data
df.shape

(32561, 15)

In [15]:
#checking data
df.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [16]:
#getting details of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
Age               32561 non-null int64
Workclass         30725 non-null object
fnlwgt            32561 non-null int64
Education         32561 non-null object
Education-Num     32561 non-null int64
Martial Status    32561 non-null object
Occupation        30718 non-null object
Relationship      32561 non-null object
Race              32561 non-null object
Sex               32561 non-null object
Capital Gain      32561 non-null int64
Capital Loss      32561 non-null int64
Hours per week    32561 non-null int64
Country           31978 non-null object
Target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [17]:
#looking details of data
df.describe()

Unnamed: 0,Age,fnlwgt,Education-Num,Capital Gain,Capital Loss,Hours per week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [18]:
#finding null values if any
df.isna().any()

Age               False
Workclass          True
fnlwgt            False
Education         False
Education-Num     False
Martial Status    False
Occupation         True
Relationship      False
Race              False
Sex               False
Capital Gain      False
Capital Loss      False
Hours per week    False
Country            True
Target            False
dtype: bool

In [23]:
#finding null values
df.isnull().sum()

Age               0
Workclass         0
fnlwgt            0
Education         0
Education-Num     0
Martial Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital Gain      0
Capital Loss      0
Hours per week    0
Country           0
Target            0
dtype: int64

In [24]:
df.fillna("Missing", inplace=True)

for name in df.columns:
    if df[name].dtype != "int64":
        df[name] = pd.get_dummies(df[name])

for name in df.columns:
            if 'Target' == name :
                pass
            else:
                temp = 0
                temp_arr = np.array([])
                df[name] = (df[name] - df[name].mean()) / np.nanstd(df[name])



In [27]:
#again checking for null values
df.isnull().any()

Age               False
Workclass         False
fnlwgt            False
Education         False
Education-Num     False
Martial Status    False
Occupation        False
Relationship      False
Race              False
Sex               False
Capital Gain      False
Capital Loss      False
Hours per week    False
Country           False
Target            False
dtype: bool

In [25]:
#dividing data in x and y
x_data_set = np.array(pd.DataFrame(df, columns = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
    "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
    "Hours per week", "Country"]))
y_data_set = np.array(pd.DataFrame(df, columns = ['Target']))

In [26]:

train_size = int(np.ceil((len(y_data_set) * 0.80)))
test_size = int(len(y_data_set) - train_size)

#Training data
x_train_data = np.array(x_data_set[:train_size])
y_train_data = np.array(y_data_set[:train_size])

#Testing data
x_test_data = np.array(x_data_set[train_size:])
y_test_data = np.array(y_data_set[train_size:])
# print(y_train_data)
size = [x_train_data.shape]
size = size[0][1] + 1


x_train_data = x_train_data[:2000]
y_train_data = y_train_data[:2000]

x_test_data = x_test_data[:1000]
y_test_data = y_test_data[:1000]


y_pred = np.array([])

for i in range(int(len(x_test_data))):
    distance = np.array([])
    length = x_train_data.shape[1]
    for j in range(int(len(x_train_data))):
        temp = 0
        for k in range(length):
            temp += np.square(x_test_data[i][k] - x_train_data[j][k])
        distance = np.append(distance, temp)
    # print("distance.shape : ",distance.shape)
    sorted_data = np.sort(distance)

    k = 3
    neighbors = np.array([])
    for i in range(k):
        neighbors = np.append(neighbors, distance[i])

    count_0 = 0
    count_1 = 0
    for i in range(len(neighbors)):
        if neighbors[i] == 1:
            count_0 += 1
        elif neighbors[i] == 0:
            count_1 += 1


    if count_0 > count_1:
        y_pred = np.append(y_pred, 1)
    else:
        y_pred = np.append(y_pred, 0)


acc = 0
count_1 = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test_data[i]:
        count_1 +=1

acc = (count_1 / len(y_test_data)) * 100
print("accuracy = ", acc )

accuracy =  74.8
