In [4]:
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn import datasets
from sklearn.cluster import KMeans
import numpy as np

def kmeanalgor():
    
    X = np.array([[1,2],[1,4],[1,0],[10,2],[10,4],[10,0]])
    kmeans = KMeans(n_clusters = 2, random_state = 0).fit(X)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_
    predicted = kmeans.predict([[0,0],[12,3]])
    print("Labels: ",labels)
    print("Cluster centers: ", centers)
    print("Predicted class: ",predicted)
    
kmeanalgor()

Labels:  [1 1 1 0 0 0]
Cluster centers:  [[10.  2.]
 [ 1.  2.]]
Predicted class:  [1 0]


In [5]:
from sklearn import metrics

def RandIndex():
    
    labels_true = [0,0,0,1,1,1]
    labels_pred = [0,0,1,1,2,2]
    
    rand_index = metrics.rand_score(labels_true, labels_pred)
    adjusted_rand_index = metrics.adjusted_rand_score(labels_true, labels_pred)
    print("Rand_index: ",rand_index)
    print("Adjusted_rand_index: ", adjusted_rand_index)
    
RandIndex()

Rand_index:  0.6666666666666666
Adjusted_rand_index:  0.24242424242424243


In [6]:
def MutualInfo():
    
    labels_true = [0,0,0,1,1,1]
    labels_pred = [0,0,1,1,2,2]
    
    ami = metrics.adjusted_mutual_info_score(labels_true,labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true,labels_pred)
    mis = metrics.mutual_info_score(labels_true, labels_pred)
    
    print("Adjusted Mutual Info Score: ", ami)
    print("Normalized Mutual Info Score: ", nmi)
    print("Mutual Info Score: ",mis)
    
MutualInfo()

Adjusted Mutual Info Score:  0.2987924581708901
Normalized Mutual Info Score:  0.5158037429793889
Mutual Info Score:  0.4620981203732969


In [7]:
def Multiscores():
    
    labels_true = [0,0,0,1,1,1]
    labels_pred = [0,0,1,1,2,2]
    
    h = metrics.homogeneity_score(labels_true, labels_pred)
    c = metrics.completeness_score(labels_true, labels_pred)
    v = metrics.v_measure_score(labels_true, labels_pred)
    hcv = metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
    
    print("Homogeneity Score: ", h)
    print("Completeness Score: ", c)
    print("V_measure Score: ", v)
    print("Homogeneity, comleteness, and V-measure: ", hcv)
    
Multiscores()

Homogeneity Score:  0.6666666666666669
Completeness Score:  0.420619835714305
V_measure Score:  0.5158037429793889
Homogeneity, comleteness, and V-measure:  (0.6666666666666669, 0.420619835714305, 0.5158037429793889)


In [9]:
def FMIscore():
    
    labels_true = [0,0,0,1,1,1]
    labels_pred = [0,0,1,1,2,2]
    
    fmi = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    
    print("Fowlkes-Mallows score: ", fmi)
    
FMIscore()

Fowlkes-Mallows score:  0.4714045207910317


In [11]:
def Silhouettescore():
    
    X,y = datasets.load_iris(return_X_y=True)
    kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
    labels = kmeans_model.labels_
    s = metrics.silhouette_score(X,labels)
    print("Silhouette Score: ",s)

Silhouettescore()

Silhouette Score:  0.5528190123564091


In [12]:
def CHscore():
    
    X,y = datasets.load_iris(return_X_y=True)
    kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
    labels = kmeans_model.labels_
    ch = metrics.calinski_harabasz_score(X,labels)
    print("Calinski-Harabasz Score: ",ch)

CHscore()

Calinski-Harabasz Score:  561.62775662962


In [13]:
def DBscore():
    
    X,y = datasets.load_iris(return_X_y=True)
    kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
    labels = kmeans_model.labels_
    db = metrics.davies_bouldin_score(X,labels)
    print("Davies Bouldin Score: ",db)

DBscore()

Davies Bouldin Score:  0.6619715465007528


In [19]:
from os import listdir
from os.path import isfile, join
import random

def read_data(train_path_class0, train_path_class1,test_path_class0, test_path_class1):
    #read X_train, Y_train
    X_train_class0 = [f for f in listdir(train_path_class0) if isfile(join(train_path_class0,f))]
    Y_train_class0 = [0]*len(X_train_class0)
    X_train_class1 = [f for f in listdir(train_path_class1) if isfile(join(train_path_class1,f))]
    Y_train_class1 = [0]*len(X_train_class1)
    X_train = X_train_class0 + X_train_class1
    Y_train = Y_train_class0 + Y_train_class1
    
    #shuffle X_train and Y_train
    Z = list(zip(X_train, Y_train))
    random.shuffle(Z)
    X_train, Y_train = zip(*Z)
    
    #read X_test, Y_test
    X_test_class0 = [f for f in listdir(test_path_class0) if isfile(join(test_path_class0,f))]
    Y_test_class0 = [0]*len(X_test_class0)
    X_test_class1 = [f for f in listdir(test_path_class1) if isfile(join(test_path_class1,f))]
    Y_test_class1 = [0]*len(X_test_class1)
    X_test = X_test_class0 + X_test_class1
    Y_test = Y_test_class0 + Y_test_class1
    
    #shuffle X_test and Y_test
    Z = list(zip(X_test, Y_test))
    random.shuffle(Z)
    X_test,Y_test = zip(*Z)
    
    return X_train, Y_train, X_test, Y_test

train_path_class0 = "C:/Users/Joseph Yau/Projects_Friday/11.12/20news-bydate/20news-bydate-train/alt.atheism"

train_path_class1 = "C:/Users/Joseph Yau/Projects_Friday/11.12/20news-bydate/20news-bydate-train/comp.graphics"

test_path_class0 = "C:/Users/Joseph Yau/Projects_Friday/11.12/20news-bydate/20news-bydate-test/alt.atheism"

test_path_class1 = "C:/Users/Joseph Yau/Projects_Friday/11.12/20news-bydate/20news-bydate-test/comp.graphics"

X_train, Y_train, X_test, Y_test = read_data(train_path_class0, train_path_class1, test_path_class0, test_path_class1)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

def train(X_train, Y_trian, X_test, Y_test):
    #Tokenizing the texts
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    
    #Calculate TfidfTransformer
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
    #Learning
    clf = MultinomialNB().fit(X_train_tfidf,Y_train)
    
    #Predicting
    X_test_counts = count_vect.transform(X_test)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    Y_predicted = clf.predict(X_test_tfidf)
    
    print(accuracy_score(Y_test,Y_predicted))

train(X_train, Y_train, X_test, Y_test)

1.0
