In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

class DataPack:
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

## Điều kiện sử dụng: Cả 2 tập dữ liệu train/test đều phải cùng 1 kiểu (cùng 1 bộ hoặc là cùng cấu trúc, quy tắc...)
class DataReader:

    def __init__(self):
        self.train = None
        self.test = None
        self.encoders = []                      ## các encoder cho tập dữ liệu này
        self.atrNames = []                     ## tên các thuộc tính
        self.atrVals = []                        ## tập giá trị của từng thuộc tính (-1 nếu là biến liên tục)

    ## đọc file ban đầu, nạp vào các biến
    def ReadDataDescription(self, url):
        f = open(url)
        lines = f.readlines()
        names = []
        vals = []
        for line in lines:
            _line = line.split(':')
            _line[0] = _line[0].strip(' \t\n\r')        # xóa hết dấu cách và enter
            names.append(_line[0])
            val = _line[1].split(',')
            for i in range(0, len(val)):
                val[i] = [val[i].strip(' \t\n\r')]
            vals.append(val)
        print("Data attributes found: ", names)
        self.__Initialize(names, vals)

    def __Initialize(self, atrNames, atrVals):
        self.atrNames = atrNames
        self.atrVals = atrVals
        ## self.__EncodeData()

    def __IsContinuous(self, idx):
        return (self.atrVals[idx] == [['continuous']])

    ## trả về dataPack 
    def readData(self, url):
        df = pd.read_csv(url, header=0)
        df.dropna()
        print("Data shape = ", df.shape)
        self.__TrimStrings(df)
        self.__FindNull(df)
        print(df.columns)

        encodedColumns = pd.DataFrame()
        ## encode các cột theo dạng category => one hot encoding
        for i in range(0, len(self.atrNames) ):
            #if (self.encoders[i] != None):          # không phải None => có encode rồi
            if not (self.__IsContinuous(i)):
                #enc = self.encoders[i]              # lấy encoder tương ứng với cột này
                col = df.columns[i]
                
                # duyệt từng phần tử trong cột và encode
                # newDF = pd.DataFrame()
                # for entry in df[col]:
                #     tmpDF = pd.DataFrame( enc.transform([ [entry] ]).toarray() )
                #     pd.concat([newDF, tmpDF], axis=0)
                newDF = pd.get_dummies(df[col], prefix=df.columns[i])
                ## print(newDF, '\n\n')

                encodedColumns = pd.concat([encodedColumns, newDF], axis=1)
                print("Column", df.columns[i], "encoded.")
        
        dropIdx = []
        for i in range(0, len(self.atrNames) ):
            if not (self.__IsContinuous(i)):
                col = df.columns[i]
                print("Deleting column : ", col)
                dropIdx.append(col)
        df.drop(columns=dropIdx, inplace=True)
        
        ##print(encodedColumns)
        result = pd.concat( [df, encodedColumns], axis=1 )   

        labels = result["label"]
        result.drop(columns="label", inplace=True)            #gỡ ra khỏi dataframe để khỏi bị trùng lắp
       ## labels_encoded = pd.get_dummies(labels)

        features = result
        dataPack = DataPack(features, labels)
        print("Read file succesfully.")
        print( pd.concat([result, labels], axis=1) )
        return dataPack

    def readTrainData(self, url):
        print("[!] Train data reading...\n\n")
        pack = self.readData(url)
        self.train = pack
        print("[!] Train data read successfully!\n\n")

    def readTestData(self, url):
        print("[!] Test data reading...\n\n")
        pack = self.readData(url)
        self.test = pack
        print("[!] Test data read successfully!\n\n")

    ## dữ liệu có dấu '?' tức là bị thiếu/mất
    def __FindNull(self, df):
        rows = df.shape[0]
        cols = df.shape[1]
        print("Finding missing values... ")
        for i in range(0, rows):
            for j in range(0, cols):
                if (df.iat[i,j] == '?'):
                    df.iat[i,j] = np.nan


    def __TrimStrings(self, df: pd.DataFrame()):
        rows = df.shape[0]
        cols = df.shape[1]
        print("Trimming strings...")
        for i in range(0, rows):
            for j in range(0, cols):
                if (type( df.iat[i,j] ) == str):
                    df.iat[i,j] = df.iat[i,j].strip(' \t\n\r')

    ## encode nó sang dạng one-hot để có thể chạy được trên các mô hình
    def __EncodeData(self):
        print("Encoding categories...")
        for i in range(0, len(self.atrNames) ):
            if (self.atrVals[i] == [['continuous']]):
                self.encoders.append(None)
            else:
                vals = self.atrVals[i]            ## 2D array
                encoder = OneHotEncoder(handle_unknown = 'ignore')
                encoder.fit(vals)
                self.encoders.append(encoder)
                print("Attribute ", self.atrNames[i], " encoded.")


In [2]:
from sklearn import preprocessing

In [3]:
dataReader = DataReader()
# dataReader.ReadDataDescription('data/adult.names.txt')
dataReader.readTrainData(r'C:\Users\Thinh\Desktop\TUDvTK_1712052_1712244_Classification-master\data\adult.train.csv')
dataReader.readTestData(r'C:\Users\Thinh\Desktop\TUDvTK_1712052_1712244_Classification-master\data\adult.test.csv')

[!] Train data reading...


Data shape =  (32561, 15)
Trimming strings...
Finding missing values... 
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'label'],
      dtype='object')
Read file succesfully.
       age         workclass  fnlwgt   education  education-num  \
0       39         State-gov   77516   Bachelors             13   
1       50  Self-emp-not-inc   83311   Bachelors             13   
2       38           Private  215646     HS-grad              9   
3       53           Private  234721        11th              7   
4       28           Private  338409   Bachelors             13   
...    ...               ...     ...         ...            ...   
32556   27           Private  257302  Assoc-acdm             12   
32557   40           Private  154374     HS-grad              9   
32558   58           P

In [4]:
X = dataReader.train.features
testX = dataReader.test.features

In [5]:
# X = pd.get_dummies(data = X, columns = ['workclass', 'fnlwgt', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])
X = pd.get_dummies(data = X, columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])
testX = pd.get_dummies(data = testX, columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])
col_list = (X.append([testX])).columns.tolist()

X = X.loc[:, col_list].fillna(0)
testX = testX.loc[:, col_list].fillna(0)

fnlwgt = {}
marital_status = {}
occupation = {}
relationship = {}
workclass = {}
race = {}
sex = {}
native_country = {}
columns = ['occupation', 'workclass', 'sex']
dict_list = [occupation, workclass, sex]

# delete_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
# delete_features = ['education']
def preprocessingData(X):
#     for i in delete_features:
#         X.drop(columns = i, axis=1, inplace = True)
#     for index, row in X.iterrows():
#         for i in range(len(columns)):
#             if row[columns[i]] not in dict_list[i]:
#                 dict_list[i][row[columns[i]]] = len(dict_list[i])
#             X.loc[index, columns[i]] = dict_list[i][row[columns[i]]]

#         if row['marital-status'] not in marital_status:
#             marital_status[row['marital-status']] = len(marital_status)
#         X.loc[index, 'marital-status'] = marital_status[row['marital-status']]

#         if row['native-country'] not in native_country:
#             native_country[row['native-country']] = len(native_country)
#         X.loc[index, 'native-country'] = native_country[row['native-country']]

    needScale = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    for i in needScale:
        X[i] = X[i]/(X[i].max())
    return X

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [6]:
X = preprocessingData(X)


In [7]:
X = X.to_numpy()

In [8]:
y = dataReader.train.labels

In [9]:
for i in range(len(y)):
    if(y[i] == "<=50K"):
        y[i] = 0
    else:
        y[i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
X = X.T
y = y.T
X = np.concatenate((np.ones((1, X.shape[1])), X), axis = 0)

In [11]:
def sigmoid(s):
    return 1/(1 + np.exp(-s))

def logistic_sigmoid_regression(X, y, w_init, eta, tol = 1e-4, max_count = 10000):
    w = [w_init]    
    it = 0
    N = X.shape[1]
    d = X.shape[0]
    count = 0
    check_w_after = 20
    while count < max_count:
        # mix data 
        mix_id = np.random.permutation(N)
        for i in mix_id:
            xi = X[:, i].reshape(d, 1)
            yi = y[i]
            zi = sigmoid(np.dot(w[-1].T, xi))
            w_new = w[-1] + eta*(yi - zi)*xi
            count += 1
            # stopping criteria
            if count%check_w_after == 0:                
                if np.linalg.norm(w_new - w[-check_w_after]) < tol:
                    return w
            w.append(w_new)
    return w

np.random.seed(2)
eta = .05 
d = X.shape[0]
w_init = np.random.randn(d, 1)
w = logistic_sigmoid_regression(X, y, w_init, eta)
np.save("LogisticRegression", w[-1])

In [12]:
res1 = np.dot(w[-1].T, X)
res1 = (res1-res1.mean())/res1.std()
result = sigmoid(res1)[0]

cnt = 0
# print(result)
threshold = [0]
while len(threshold) <= 500:
    threshold.append(threshold[-1] + 0.002)
    

maxAcc = -1
maxThreshold = -1
for j in threshold:
    cnt = 0
    for i in range(len(result)):
        if result[i] < j:
            cnt += (y[i] == 0)
        else:
            cnt += (y[i] == 1)
    if cnt > maxAcc:
        maxAcc = cnt
        maxThreshold = j
        
cnt = 0
print(maxThreshold)
for i in range(len(result)):
        if result[i] < maxThreshold:
            cnt += (y[i] == 0)
        else:
            cnt += (y[i] == 1)
            
print("Accuracy:", (cnt/len(result)) * 100, "%")

0.7160000000000005
Accuracy: 83.99004944565584 %


In [13]:
model = np.load("LogisticRegression.npy")
model

array([[-2.82848064],
       [ 1.78661481],
       [ 3.38132172],
       [ 2.36459297],
       [ 0.68547538],
       [-1.06064604],
       [-1.03659969],
       [-0.95548822],
       [-1.32112009],
       [-1.30389932],
       [-1.51184101],
       [-0.90108398],
       [ 0.09013051],
       [ 0.02496596],
       [ 0.73889474],
       [ 1.80408098],
       [-0.65217306],
       [ 0.91780443],
       [-0.92628362],
       [ 1.64873596],
       [-0.26922246],
       [ 0.24397896],
       [ 2.4380664 ],
       [-1.4412857 ],
       [-0.23416763],
       [ 0.07866051],
       [-1.32391147],
       [-2.03023065],
       [-1.47363428],
       [-0.95091834],
       [-0.16641284],
       [ 1.1655004 ],
       [-2.05298348],
       [-0.10105059],
       [ 0.37350253],
       [ 0.89170025],
       [ 0.45310446],
       [-0.72653837],
       [ 0.17752326],
       [ 0.58803157],
       [ 0.1552537 ],
       [ 0.53576036],
       [-1.75997019],
       [ 1.39886011],
       [ 1.46312815],
       [-0

In [14]:
testX = preprocessingData(testX)
testX = testX.to_numpy()
testY = dataReader.test.labels

In [15]:
testX = testX.T
# testY = testY.T
testX = np.concatenate((np.ones((1, testX.shape[1])), testX), axis = 0)

In [16]:
W = np.array(model)
W

array([[-2.82848064],
       [ 1.78661481],
       [ 3.38132172],
       [ 2.36459297],
       [ 0.68547538],
       [-1.06064604],
       [-1.03659969],
       [-0.95548822],
       [-1.32112009],
       [-1.30389932],
       [-1.51184101],
       [-0.90108398],
       [ 0.09013051],
       [ 0.02496596],
       [ 0.73889474],
       [ 1.80408098],
       [-0.65217306],
       [ 0.91780443],
       [-0.92628362],
       [ 1.64873596],
       [-0.26922246],
       [ 0.24397896],
       [ 2.4380664 ],
       [-1.4412857 ],
       [-0.23416763],
       [ 0.07866051],
       [-1.32391147],
       [-2.03023065],
       [-1.47363428],
       [-0.95091834],
       [-0.16641284],
       [ 1.1655004 ],
       [-2.05298348],
       [-0.10105059],
       [ 0.37350253],
       [ 0.89170025],
       [ 0.45310446],
       [-0.72653837],
       [ 0.17752326],
       [ 0.58803157],
       [ 0.1552537 ],
       [ 0.53576036],
       [-1.75997019],
       [ 1.39886011],
       [ 1.46312815],
       [-0

In [17]:
res1 = np.dot(W.T, testX)
res1 = (res1-res1.mean())/res1.std()
result = sigmoid(res1)[0]

In [18]:
cnt = 0
for i in range(len(result)):
    if result[i] < maxThreshold:
        cnt += (testY[i] == "<=50K." or testY[i] == "<=50K")
    else:
        cnt += (testY[i] == ">50K" or testY[i] == ">50K.")
print("Accuracy:", (cnt/len(result)) * 100, "%")

Accuracy: 84.09188624777347 %
