In [5]:
import pandas as pd
import numpy as np

In [7]:
def read_datas(file_path:str):
    df = pd.read_csv(file_path)
    return df

def get_training_and_validation_datas(df: pd.DataFrame, training_size = 0.8):
    data_size = df.shape[0]
    indexes = [i for i in range(data_size)]
    training_index = np.random.choice(indexes,int(data_size*training_size))
    validation_index = [i for i in range(data_size) if not i in training_index]
    training_input = [df['Text'].iloc[index] for index in training_index]
    training_output = [df['Sentiment'].iloc[index] for index in training_index]
    validation_input = [df['Text'].iloc[index] for index in validation_index]
    validation_output = [df['Sentiment'].iloc[index] for index in validation_index]
    return training_input, training_output, validation_input, validation_output

def get_TP_TN_FP_FN(computed_output, ground_truth, positive_label):
    TP, TN, FP, FN = 0, 0, 0, 0
    for i in range(len(computed_output)):
        if computed_output[i] == positive_label:
            if ground_truth[i] == positive_label:
                TP += 1
            else:
                FP += 1
        else:
            if ground_truth[i] == positive_label:
                FN += 1
            else:
                TN += 1
    return TP, TN, FP, FN

def get_accuracy(TP, TN, FP, FN):
    return (TP + TN) / (TP + TN + FP + FN)

def get_precision(TP, TN, FP, FN):
    return TP/(TP+FP)

def get_recall(TP, TN, FP, FN):
    return TP/(TP+FN)

#  🗟 Extragerea de caracteristici din text

## Bag of Words

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
def get_bags_of_words(training_input, validation_input):
    vectorizer = CountVectorizer()
    train_features = vectorizer.fit_transform(training_input)
    validation_features = vectorizer.transform(validation_input)
    return train_features, validation_features

## TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
def get_tf_idf(training_input, validation_input, max_feats=50):
    vectorizer = TfidfVectorizer(max_features=max_feats)
    train_features = vectorizer.fit_transform(training_input)
    validation_features = vectorizer.transform(validation_input)
    return train_features, validation_features

## Alte caracteristici

### Stemming

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Irimieş
[nltk_data]     Vasile\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [26]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def get_stemmed(training_input, validation_input):
    training_input_tokens = [ word_tokenize(text) for text in training_input ]
    validation_input_tokens = [word_tokenize(text) for text in validation_input]
    stemmer = PorterStemmer()
    
    training_input_stemmed_words = [[stemmer.stem(word) for word in words] for words in training_input_tokens]
    validation_input_stemmed_words = [[stemmer.stem(word) for word in words] for words in validation_input_tokens]

    training_input_stemmed_sentences = [' '.join(words) for words in training_input_stemmed_words]
    validation_input_stemmed_sentences = [' '.join(words) for words in validation_input_stemmed_words]
    
    return training_input_stemmed_sentences,validation_input_stemmed_sentences

In [19]:
s1_initial = ["We had requested two queen beds and got a room with 1 queen and 2 twins, we were advised that there were not any other rooms and could put the (2) beds together, which we did and it was no issue as far as sleeping."]
s2_initial = ["Waited over 40 minutes to use shower."]
v1,v2 = get_stemmed(s1_initial,s2_initial)
print("Before: ")
print(s1_initial)
print("After: ")
print(v1)
print()

print("Before: ")
print(s2_initial)
print("After: ")
print(v2)

Before: 
['We had requested two queen beds and got a room with 1 queen and 2 twins, we were advised that there were not any other rooms and could put the (2) beds together, which we did and it was no issue as far as sleeping.']
After: 
['we had request two queen bed and got a room with 1 queen and 2 twin , we were advis that there were not ani other room and could put the ( 2 ) bed togeth , which we did and it wa no issu as far as sleep .']

Before: 
['Waited over 40 minutes to use shower.']
After: 
['wait over 40 minut to use shower .']


### Stop words removal

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Irimieş
[nltk_data]     Vasile\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to C:\Users\Irimieş
[nltk_data]     Vasile\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def get_stop_words_removal(training_input, validation_input):
    training_input_tokens = [ word_tokenize(text) for text in training_input ]
    validation_input_tokens = [word_tokenize(text) for text in validation_input]
    stop_words = set(stopwords.words('english'))

    training_input_words_without_stop = [[word for word in words if word not in stop_words] for words in training_input_tokens]
    validation_input_words_without_stop = [[word for word in words if word not in stop_words] for words in validation_input_tokens]

    training_input_without_stop = [' '.join(words) for words in training_input_words_without_stop]
    validation_input_without_stop = [' '.join(words) for words in validation_input_words_without_stop]

    return training_input_without_stop,validation_input_without_stop


In [21]:
s1_initial = ["We had requested two queen beds and got a room with 1 queen and 2 twins, we were advised that there were not any other rooms and could put the (2) beds together, which we did and it was no issue as far as sleeping."]
s2_initial = ["Waited over 40 minutes to use shower."]
v1,v2 = get_stop_words_removal(s1_initial,s2_initial)
print("Before: ")
print(s1_initial)
print("After: ")
print(v1)
print()

print("Before: ")
print(s2_initial)
print("After: ")
print(v2)

Before: 
['We had requested two queen beds and got a room with 1 queen and 2 twins, we were advised that there were not any other rooms and could put the (2) beds together, which we did and it was no issue as far as sleeping.']
After: 
['We requested two queen beds got room 1 queen 2 twins , advised rooms could put ( 2 ) beds together , issue far sleeping .']

Before: 
['Waited over 40 minutes to use shower.']
After: 
['Waited 40 minutes use shower .']


### Lemmatization

In [25]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Irimieş
[nltk_data]     Vasile\AppData\Roaming\nltk_data...


True

In [27]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def get_lemmatization(training_input, validation_input):
    training_input_tokens = [ word_tokenize(text) for text in training_input ]
    validation_input_tokens = [word_tokenize(text) for text in validation_input]
    lemmatizer = WordNetLemmatizer()
    
    training_input_stemmed_words = [[lemmatizer.lemmatize(word) for word in words] for words in training_input_tokens]
    validation_input_stemmed_words = [[lemmatizer.lemmatize(word) for word in words] for words in validation_input_tokens]

    training_input_stemmed_sentences = [' '.join(words) for words in training_input_stemmed_words]
    validation_input_stemmed_sentences = [' '.join(words) for words in validation_input_stemmed_words]
    
    return training_input_stemmed_sentences,validation_input_stemmed_sentences

In [33]:
s1_initial = ["We had requested two queen beds and got a room with 1 queen and 2 twins, we were advised that there were not any other rooms and could put the (2) beds together, which we did and it was no issue as far as sleeping."]
s2_initial = ["My room was immaculate and smelled so fresh and clean"]
v1,v2 = get_lemmatization(s1_initial,s2_initial)
print("Before: ")
print(s1_initial)
print("After: ")
print(v1)
print()

print("Before: ")
print(s2_initial)
print("After: ")
print(v2)

Before: 
['We had requested two queen beds and got a room with 1 queen and 2 twins, we were advised that there were not any other rooms and could put the (2) beds together, which we did and it was no issue as far as sleeping.']
After: 
['We had requested two queen bed and got a room with 1 queen and 2 twin , we were advised that there were not any other room and could put the ( 2 ) bed together , which we did and it wa no issue a far a sleeping .']

Before: 
['My room was immaculate and smelled so fresh and clean']
After: 
['My room wa immaculate and smelled so fresh and clean']


# 🤖 kMeans

In [26]:
from sklearn.cluster import KMeans

In [37]:
def get_classifier(training_input, number_of_clusters:int):
    unsupervisedClassifier = KMeans(n_clusters=number_of_clusters, random_state=0)
    unsupervisedClassifier.fit(training_input)
    return unsupervisedClassifier

In [38]:
def test_classifier(classifier:KMeans,validation_input, validation_output, label_names,positive_label):
    computedTestIndexes = classifier.predict(validation_input)
    computed_outputs = [label_names[value] for value in computedTestIndexes]
    TP, TN, FP, FN = get_TP_TN_FP_FN(computed_outputs,validation_output,positive_label)
    accuracy = get_accuracy(TP, TN, FP, FN)
    precision = get_precision(TP, TN, FP, FN)
    recall = get_recall(TP, TN, FP, FN)
    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}")

In [40]:
dataframe = read_datas('reviews_mixed.csv')
training_input,training_output,validation_input, validation_output = get_training_and_validation_datas(dataframe)
test_feats, validation_feats = get_bags_of_words(training_input,validation_input)
label_names = [name for name in set(training_output)]
classifier = get_classifier(test_feats,len(label_names))
test_classifier(classifier,validation_feats,validation_output,label_names,'positive')

Accuracy: 0.32967032967032966
Precision: 0.3448275862068966
Recall: 0.8823529411764706


Predictia pentru mesajul:

 > By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.

In [44]:
def predict_text(classifier:KMeans, label_names, text:str, training_input):
    input = [text]
    _, input = get_bags_of_words(training_input, input)
    output = classifier.predict(input)
    label = label_names[output[-1]]
    return label

In [45]:
text = "By choosing a bike over a car, I'm reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I'm proud to be part of that movement."
label = predict_text(classifier,label_names, text,training_input)
print(f"Label: {label}")

Label: positive


# Alternative la k-means si analiza performanta

## DBSCAN

In [11]:
from sklearn.cluster import DBSCAN

In [13]:
def get_classifier(training_input, number_of_clusters:int):
    dbscan = DBSCAN(eps=0.5, min_samples=number_of_clusters)
    dbscan.fit(training_input)
    return dbscan

def test_classifier(classifier:DBSCAN,validation_input, validation_output, label_names,positive_label):
    computedTestIndexes = classifier.fit_predict(validation_input)
    computed_outputs = [label_names[value] for value in computedTestIndexes]
    TP, TN, FP, FN = get_TP_TN_FP_FN(computed_outputs,validation_output,positive_label)
    accuracy = get_accuracy(TP, TN, FP, FN)
    precision = get_precision(TP, TN, FP, FN)
    recall = get_recall(TP, TN, FP, FN)
    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}")

dataframe = read_datas('reviews_mixed.csv')
training_input,training_output,validation_input, validation_output = get_training_and_validation_datas(dataframe)
test_feats, validation_feats = get_bags_of_words(training_input,validation_input)
label_names = [name for name in set(training_output)]
classifier = get_classifier(test_feats,len(label_names))
test_classifier(classifier,validation_feats,validation_output,label_names,'positive')

Accuracy: 0.3225806451612903
Precision: 0.3111111111111111
Recall: 0.9655172413793104


## Agglomerative

In [14]:
from sklearn.cluster import AgglomerativeClustering

In [30]:
def get_classifier(training_input, number_of_clusters:int):
    agg_clustering = AgglomerativeClustering(n_clusters=number_of_clusters)
    agg_clustering.fit(training_input.toarray())
    return agg_clustering

def test_classifier(classifier:AgglomerativeClustering,validation_input, validation_output, label_names,positive_label):
    computedTestIndexes = classifier.fit_predict(validation_input.toarray())
    computed_outputs = [label_names[value] for value in computedTestIndexes]
    TP, TN, FP, FN = get_TP_TN_FP_FN(computed_outputs,validation_output,positive_label)
    accuracy = get_accuracy(TP, TN, FP, FN)
    precision = get_precision(TP, TN, FP, FN)
    recall = get_recall(TP, TN, FP, FN)
    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}")

dataframe = read_datas('reviews_mixed.csv')
training_input,training_output,validation_input, validation_output = get_training_and_validation_datas(dataframe)
test_feats, validation_feats = get_bags_of_words(training_input,validation_input)
label_names = [name for name in set(training_output)]
classifier = get_classifier(test_feats,len(label_names))
test_classifier(classifier,validation_feats,validation_output,label_names,'positive')

Accuracy: 0.44565217391304346
Precision: 0.38666666666666666
Recall: 0.8529411764705882
