In [21]:
import pandas as pd
import numpy as np

df = pd.read_csv('review_mixed.csv')

data_size = df.shape[0]
indexes = [i for i in range(data_size)]

training_index = np.random.choice(indexes, int(data_size * 0.8))
validation_index = [i for i in range(data_size) if not i in training_index]

training_input = [df['Text'].iloc[index] for index in training_index]
training_output = [df['Sentiment'].iloc[index] for index in training_index]

validation_input = [df['Text'].iloc[index] for index in validation_index]
validation_output = [df['Sentiment'].iloc[index] for index in validation_index]

### Extragere caracteristici

#### Bag of Words

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

def get_Bag_of_Words(training_input, validation_input):

    vectorizer = CountVectorizer()
    train_features = vectorizer.fit_transform(training_input)
    validation_features = vectorizer.transform(validation_input)

    # print(train_features)
    # print(validation_features)

    return train_features, validation_features

### TF-IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_TF_IDF(training_input, validation_input):
    
    vectorizer = TfidfVectorizer()

    train_features = vectorizer.fit_transform(training_input)
    validation_features = vectorizer.transform(validation_input)

    print(train_features.toarray())
    print(validation_features.toarray())

    return train_features, validation_features

### Doc2Vec

In [24]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def get_Doc2Vec(training_input, validation_input):

    tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(training_input)]

    model = Doc2Vec(tagged_data, vector_size=100, window=5, min_count=1, workers=4)

    train_features = [model.infer_vector(doc.split()) for doc in training_input]
    validation_features = [model.infer_vector(doc.split()) for doc in validation_input]

    return train_features, validation_features

## kMeans

In [25]:
from sklearn.cluster import KMeans

In [26]:
train_features, validation_features = get_Bag_of_Words(training_input, validation_input)

unsupervisedClassifier = KMeans(n_clusters=2)
unsupervisedClassifier.fit(train_features)

In [27]:
labelNames = ['negative', 'positive']

computedTestIndexes = unsupervisedClassifier.predict(validation_features)
print(computedTestIndexes)
computedTestOutputs = [labelNames[value] for value in computedTestIndexes]
for i in range(0, len(validation_input)):
    print(validation_input[i], " -> ", computedTestOutputs[i])

[1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1
 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1
 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1]
someone must have been smoking in the room next door.  ->  positive
Very spacious rooms, quiet and very comfortable.  ->  positive
Air conditioning working fine.  ->  positive
So if you're the type that likes to let water run a bit before getting wet or it takes a minute to figure out how to make it hot, you're gonna get wet.  ->  positive
the windows are only single glazed so the heat could escape- although to be fair it was -6 outside!  ->  negative
Terrible, small cubbyholes, which are marketed as rooms.  ->  positive
Corridors filthy
Room filthy
Electrical cables in room not safe
Whole building smelly
Shower repulsive  ->  positive
Shows some wear and tear.  ->  positive
Microwave needed!  ->  positive
Room wasn't cleaned or bed made up  ->  positive
The heat in the room fluctuated -- at t

In [28]:
from sklearn.metrics import accuracy_score

print("acc: ", accuracy_score(validation_output, computedTestOutputs))

acc:  0.4166666666666667


### Predictie mesaj:  
    -   By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement..

In [29]:
text = 'By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement..'

train_features, validation_features = get_Bag_of_Words(training_input, [text])

computedTestIndexes = unsupervisedClassifier.predict(validation_features)
print(computedTestIndexes)
computedTestOutputs = [labelNames[value] for value in computedTestIndexes]
print(computedTestOutputs)

[1]
['positive']


### Alternative la kMeans

#### DBSCAN

In [30]:
from sklearn.cluster import DBSCAN

train_features, validation_features = get_Bag_of_Words(training_input, validation_input)

model = DBSCAN(eps=0.5, min_samples=5)
clusters = model.fit(train_features)

computedTestIndexes = unsupervisedClassifier.predict(validation_features)
# print(computedTestIndexes)
computedTestOutputs = [labelNames[value] for value in computedTestIndexes]

# print(clusters)
print("acc: ", accuracy_score(validation_output, computedTestOutputs))

acc:  0.4166666666666667
