In [50]:
from numpy.random import choice
from math import sqrt

class MyKMeans():
    def __init__(self, nr_centroizi) -> None:
        self.nr_centroizi = nr_centroizi
        self.centroizi = []

    def initialize_centroizi(self, input):
        index = [i for i in range(input.shape[0])]
        indexes = choice(index, self.nr_centroizi)
        self.centroizi = [input[i] for i in indexes]

    def distance(self, v1, v2):
        values = [(v1[0,i] - v2[0,i])*(v1[0,i] - v2[0,i])for i in range(v1.shape[1])]
        return sqrt(sum(values))

    def minimum_centroid_index(self, x):
        indice = 0
        minim_distance = self.distance(x,self.centroizi[indice])
        for i in range(len(self.centroizi)):
            distance = self.distance(x,self.centroizi[i])
            if distance < minim_distance:
                minim_distance = distance
                indice = i
        
        return indice
    
    def _upper_sum(self, input, c, j):
        return sum([input[i] for i in range(input.shape[0]) if c[i] == j])
    
    def _lowwer_sum(self, c, j):
        return sum([1 for i in range(len(c)) if c[i] == j])

    def train(self, training_input):
        self.initialize_centroizi(training_input)

        convergent = False

        while not convergent:
            c = []
            for i in range(training_input.shape[0]):
                x = training_input[i]
                indice = self.minimum_centroid_index(x)
                c.append(indice)

            max_change = -1
            for j in range(0, self.nr_centroizi):
                new_centroid = self._upper_sum(training_input,c,j) / self._lowwer_sum(c,j)
                distance = self.distance(self.centroizi[j],new_centroid)
                if distance > max_change:
                    max_change = distance
                self.centroizi[j]=new_centroid

            if max_change < 0.04:
                convergent = True
        
    def predict(self, input):
        indexes = [self.minimum_centroid_index(x) for x in input]
        return indexes

In [51]:
import pandas as pd
import numpy as np

In [52]:
def read_datas(file_path:str):
    df = pd.read_csv(file_path)
    return df

def get_training_and_validation_datas(df: pd.DataFrame, training_size = 0.8):
    data_size = df.shape[0]
    indexes = [i for i in range(data_size)]
    training_index = np.random.choice(indexes,int(data_size*training_size))
    validation_index = [i for i in range(data_size) if not i in training_index]
    training_input = [df['Text'].iloc[index] for index in training_index]
    training_output = [df['Sentiment'].iloc[index] for index in training_index]
    validation_input = [df['Text'].iloc[index] for index in validation_index]
    validation_output = [df['Sentiment'].iloc[index] for index in validation_index]
    return training_input, training_output, validation_input, validation_output

def get_TP_TN_FP_FN(computed_output, ground_truth, positive_label):
    TP, TN, FP, FN = 0, 0, 0, 0
    for i in range(len(computed_output)):
        if computed_output[i] == positive_label:
            if ground_truth[i] == positive_label:
                TP += 1
            else:
                FP += 1
        else:
            if ground_truth[i] == positive_label:
                FN += 1
            else:
                TN += 1
    return TP, TN, FP, FN

def get_accuracy(TP, TN, FP, FN):
    return (TP + TN) / (TP + TN + FP + FN)

def get_precision(TP, TN, FP, FN):
    return TP/(TP+FP)

def get_recall(TP, TN, FP, FN):
    return TP/(TP+FN)

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
def get_bags_of_words(training_input, validation_input):
    vectorizer = CountVectorizer()
    train_features = vectorizer.fit_transform(training_input)
    validation_features = vectorizer.transform(validation_input)
    return train_features, validation_features

In [55]:
def get_classifier(training_input, number_of_clusters:int):
    unsupervisedClassifier = MyKMeans(nr_centroizi=number_of_clusters)
    unsupervisedClassifier.train(training_input)
    return unsupervisedClassifier

In [56]:
def test_classifier(classifier:MyKMeans,validation_input, validation_output, label_names,positive_label):
    computedTestIndexes = classifier.predict(validation_input)
    computed_outputs = [label_names[value] for value in computedTestIndexes]
    TP, TN, FP, FN = get_TP_TN_FP_FN(computed_outputs,validation_output,positive_label)
    accuracy = get_accuracy(TP, TN, FP, FN)
    precision = get_precision(TP, TN, FP, FN)
    recall = get_recall(TP, TN, FP, FN)
    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}")

In [58]:
dataframe = read_datas('reviews_mixed.csv')
training_input,training_output,validation_input, validation_output = get_training_and_validation_datas(dataframe)
test_feats, validation_feats = get_bags_of_words(training_input,validation_input)
label_names = [name for name in set(training_output)]
classifier = get_classifier(test_feats,len(label_names))
test_classifier(classifier,validation_feats,validation_output,label_names,'positive')

Accuracy: 0.3010752688172043
Precision: 0.29347826086956524
Recall: 1.0
