In [1]:
import pandas as pd
import numpy as np

In [2]:
def read_datas(file_path:str):
    df = pd.read_csv(file_path)
    return df

def get_training_and_validation_datas(df: pd.DataFrame, training_size = 0.8):
    data_size = df.shape[0]
    indexes = [i for i in range(data_size)]
    training_index = np.random.choice(indexes,int(data_size*training_size))
    validation_index = [i for i in range(data_size) if not i in training_index]
    training_input = [df['Text'].iloc[index] for index in training_index]
    training_output = [df['Sentiment'].iloc[index] for index in training_index]
    validation_input = [df['Text'].iloc[index] for index in validation_index]
    validation_output = [df['Sentiment'].iloc[index] for index in validation_index]
    return training_input, training_output, validation_input, validation_output

def get_TP_TN_FP_FN(computed_output, ground_truth, positive_label):
    TP, TN, FP, FN = 0, 0, 0, 0
    for i in range(len(computed_output)):
        if computed_output[i] == positive_label:
            if ground_truth[i] == positive_label:
                TP += 1
            else:
                FP += 1
        else:
            if ground_truth[i] == positive_label:
                FN += 1
            else:
                TN += 1
    return TP, TN, FP, FN

def get_accuracy(TP, TN, FP, FN):
    return (TP + TN) / (TP + TN + FP + FN)

def get_precision(TP, TN, FP, FN):
    if TP + FP == 0:
        return 0  # Return zero precision when denominator is zero
    else:
        return TP / (TP + FP)


def get_recall(TP, TN, FP, FN):
    return TP/(TP+FN)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

#BAG OF WORDS
def get_bags_of_words(training_input, validation_input):
    vectorizer = CountVectorizer()
    train_features = vectorizer.fit_transform(training_input)
    validation_features = vectorizer.transform(validation_input)
    return train_features, validation_features

In [4]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
def get_tf_idf(training_input, validation_input, max_feats=50):
    vectorizer = TfidfVectorizer(max_features=max_feats)
    train_features = vectorizer.fit_transform(training_input)
    validation_features = vectorizer.transform(validation_input)
    return train_features, validation_features

In [5]:
from sklearn.cluster import KMeans

def get_classifier(training_input, number_of_clusters:int):
    unsupervisedClassifier = KMeans(n_clusters=number_of_clusters, random_state=0)
    unsupervisedClassifier.fit(training_input)
    return unsupervisedClassifier

def test_classifier(classifier:KMeans,validation_input, validation_output, label_names,positive_label):
    computedTestIndexes = classifier.predict(validation_input)
    computed_outputs = [label_names[value] for value in computedTestIndexes]
    TP, TN, FP, FN = get_TP_TN_FP_FN(computed_outputs,validation_output,positive_label)
    accuracy = get_accuracy(TP, TN, FP, FN)
    precision = get_precision(TP, TN, FP, FN)
    recall = get_recall(TP, TN, FP, FN)
    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}")
    
dataframe = read_datas('reviews_mixed.csv')
training_input,training_output,validation_input, validation_output = get_training_and_validation_datas(dataframe)
test_feats, validation_feats = get_bags_of_words(training_input,validation_input)
label_names = [name for name in set(training_output)]
classifier = get_classifier(test_feats,len(label_names))
test_classifier(classifier,validation_feats,validation_output,label_names,'positive')

Accuracy: 0.3469387755102041
Precision: 0.3469387755102041
Recall: 1.0
