CSCE 4143: Data Mining\
Xintao Wu, Ph.D.\
Practice Project\
Group 7: Levi Crider, Caleb Holmes, Spencer Smith, and Ethan Weems\
Step 2: Clustering

Import Packages

In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

Import Data from Adult Data File

In [97]:
labels = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "actual-value"]
numeric_labels = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
df = pd.read_table("./adult/adult.data", names=labels, delimiter=",")

Clean Data\
-Remove unknown values\
-Split numerical attributes into binary attributes

In [98]:
df = df.astype(str)

# using list comprehension
curr_labels = [i for i in labels if i not in numeric_labels]

substring = '\?' # character indicating null value
for label in curr_labels:
   filter = df[label].str.contains(substring) 
   df = df[~filter]

df = df.reset_index(drop=True)

df[numeric_labels] = df[numeric_labels].astype(float)

mean_values = df[numeric_labels].mean()

for num_label, threshold in mean_values.items():
    df[num_label] = df[num_label].apply(lambda x: 1 if x >= threshold else 0)

df[numeric_labels]

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,1,0,1,1,0,0
1,1,0,1,0,0,0
2,0,1,0,0,0,0
3,1,1,0,0,0,0
4,0,1,1,0,0,0
...,...,...,...,...,...,...
30157,0,1,1,0,0,0
30158,1,0,0,0,0,0
30159,1,0,0,0,0,0
30160,0,1,0,0,0,0


Perform One-Hot Encoding and Split Data

In [99]:
X = df[curr_labels[0 : len(curr_labels) - 1]]
y = df["actual-value"].values

one_hot = OneHotEncoder()
label_encoder = LabelEncoder()
x_encode = one_hot.fit_transform(X).toarray()
y_encode = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(x_encode, y_encode, test_size=0.2)


Build K-means Clustering Algorithm 

In [100]:
def createKMeansCluster(num):
    # create and train model
    kmeans = KMeans(n_clusters=num, random_state=42)
    kmeans.fit(X_train)
    
    centroids = kmeans.cluster_centers_
    
    return centroids

3 Clusters

In [101]:
createKMeansCluster(3)



array([[ 2.64169068e-02,  4.94716619e-02,  7.89305155e-01,
         2.36951649e-02,  7.23663144e-02,  3.82644893e-02,
         4.80307397e-04,  3.47422350e-02,  4.94716619e-02,
         2.11335255e-02,  4.64297150e-03,  1.10470701e-02,
         1.64905540e-02,  1.80915786e-02,  3.26609030e-02,
         3.18603907e-02,  1.54498879e-01,  7.36471342e-03,
         3.36375280e-01,  3.73038745e-02,  2.40153698e-03,
         1.24879923e-02,  2.29426833e-01,  2.14377201e-01,
         3.68628739e-18,  9.60614793e-04,  2.25744476e-02,
         6.95965418e-01,  4.75504323e-02,  1.85718860e-02,
         7.54082613e-02,  8.00512328e-04,  1.68908101e-01,
         9.51008646e-02,  4.67499199e-02,  9.73422991e-02,
         6.86839577e-02,  1.17195005e-01,  8.00512328e-04,
         1.00064041e-01,  2.43355748e-02,  1.07588857e-01,
         2.91386487e-02,  6.78834454e-02,  2.60902411e-15,
         5.34582133e-01,  5.63560679e-02,  3.14921550e-01,
         9.41402498e-02,  5.55111512e-17,  1.10470701e-0

5 Clusters

In [102]:
createKMeansCluster(5)



array([[ 2.89369479e-02,  5.23911057e-02,  7.17636308e-01,
         4.41669205e-02,  1.27931770e-01,  2.80231496e-02,
         9.13798355e-04,  2.08166817e-17, -6.93889390e-18,
        -1.38777878e-17, -1.64798730e-17, -2.94902991e-17,
        -4.51028104e-17,  3.64291930e-17, -1.87350135e-16,
        -4.85722573e-17,  2.49800181e-16,  2.60208521e-17,
         1.00000000e+00, -1.17961196e-16, -6.28837260e-18,
         7.28583860e-17,  1.08246745e-15,  1.11022302e-16,
         1.21839781e-03,  9.98781602e-01,  2.77555756e-17,
        -1.16573418e-15,  6.24500451e-17,  4.85722573e-17,
         5.48279013e-02,  3.04599452e-04,  2.86323485e-01,
         9.83856229e-02,  5.33049041e-02,  4.87359123e-02,
         1.16661590e-01,  5.23911057e-02,  0.00000000e+00,
         2.28449589e-02,  2.65001523e-02,  1.05696010e-01,
         1.52299726e-02,  1.18793786e-01,  9.91166616e-01,
         3.04599452e-04,  3.65519342e-03,  4.87359123e-03,
        -2.22044605e-16,  2.08166817e-16,  1.06609808e-0

10 Clusters

In [103]:
createKMeansCluster(10)



array([[ 3.57336221e-02,  5.68489442e-02,  7.15213860e-01,
         4.06063887e-02,  1.14239307e-01,  3.73578776e-02,
        -7.04731412e-19,  3.35679480e-02,  3.19436925e-02,
         1.19112074e-02,  5.95560368e-03,  1.46182999e-02,
         2.21981592e-02,  1.78668110e-02,  4.00649702e-02,
         3.89821332e-02,  1.35354629e-01,  1.02869518e-02,
         3.70871684e-01,  4.76448295e-02,  2.16567407e-03,
         1.84082296e-02,  1.98159177e-01,  7.23876557e-01,
        -4.33680869e-19,  2.70709258e-03,  7.03844071e-02,
        -2.22044605e-16,  1.40768814e-01,  6.22631294e-02,
         5.46832702e-02, -2.16840434e-19,  2.24688684e-01,
         1.32647537e-01,  4.11478073e-02,  6.22631294e-02,
         7.57985923e-02,  6.60530590e-02,  0.00000000e+00,
         9.69139145e-02,  3.19436925e-02,  9.63724959e-02,
         2.43638332e-02,  9.31239848e-02,  9.99200722e-16,
         6.92474283e-01,  3.68164591e-02,  7.03844071e-02,
         2.00324851e-01,  4.85722573e-17,  1.08283703e-0

K-Nearest Neighbor Algorithm

In [104]:
def kNN_Wrapper(num):
    # use last 10 records from test data
    data = X_test[len(X_test) - 10: len(X_test)]
    
    knn = KNeighborsClassifier(n_neighbors=num)
    
    knn.fit(X_train, y_train)
    
    predictions = knn.predict(data)
    predictions
    
    accuracy = accuracy_score(y_test[len(y_test) - 10 : len(y_test)], predictions)
    print(f"Accuracy: {accuracy:.2f}")
    
    print(classification_report(y_test[len(y_test) - 10 : len(y_test)], predictions))

kNN 3

In [105]:
kNN_Wrapper(3)

Accuracy: 0.60
              precision    recall  f1-score   support

           0       0.67      0.86      0.75         7
           1       0.00      0.00      0.00         3

    accuracy                           0.60        10
   macro avg       0.33      0.43      0.38        10
weighted avg       0.47      0.60      0.53        10



kNN 5

In [106]:
kNN_Wrapper(5)

Accuracy: 0.60
              precision    recall  f1-score   support

           0       0.67      0.86      0.75         7
           1       0.00      0.00      0.00         3

    accuracy                           0.60        10
   macro avg       0.33      0.43      0.38        10
weighted avg       0.47      0.60      0.53        10



kNN 10

In [107]:
kNN_Wrapper(10)

Accuracy: 0.60
              precision    recall  f1-score   support

           0       0.67      0.86      0.75         7
           1       0.00      0.00      0.00         3

    accuracy                           0.60        10
   macro avg       0.33      0.43      0.38        10
weighted avg       0.47      0.60      0.53        10

