In [55]:
import pandas as pd
import numpy as np
from numpy.random import choice
from math import sqrt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics import accuracy_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
from collections import Counter

In [91]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Ensure necessary NLTK resources are downloaded
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Load the dataset
df = pd.read_csv('reviews_mixed.csv')

# Define lemmatization function
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = text.lower().split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(lemmatized_words)

# Apply lemmatization to the dataset
df['Text_Lemmatized'] = df['Text'].apply(lemmatize_text)

# Split the dataset into training and validation sets
training_input, validation_input, training_output, validation_output = train_test_split(df['Text_Lemmatized'], df['Sentiment'], test_size=0.2, random_state=42)

# Bag of Words
vectorizer = CountVectorizer()

# Transform the text data to feature vectors
training_embeddings = vectorizer.fit_transform(training_input).toarray()
validation_embeddings = vectorizer.transform(validation_input).toarray()

# Example text for prediction
text = ["By choosing a bike over a car, I'm reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I'm proud to be part of that movement."]
text_lemmatized = [lemmatize_text(t) for t in text]
text_embedding = vectorizer.transform(text_lemmatized).toarray()

print("Training Embeddings Shape:", training_embeddings.shape)
print("Validation Embeddings Shape:", validation_embeddings.shape)
print("Text Embedding Shape:", text_embedding.shape)


Training Embeddings Shape: (165, 467)
Validation Embeddings Shape: (42, 467)
Text Embedding Shape: (1, 467)


In [82]:
# pip install transformers
# pip install torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertModel
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

# Ensure necessary NLTK resources are downloaded
# nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('reviews_mixed.csv')

# Define function to preprocess text (lemmatization + stop words removal)
def preprocess_text(text):
    lemmatizer = nltk.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = text.lower().split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(lemmatized_words)

# Apply preprocessing to the dataset
df['Text_Preprocessed'] = df['Text'].apply(preprocess_text)

# Split the dataset into training and validation sets
training_input, validation_input, training_output, validation_output = train_test_split(
    df['Text_Preprocessed'], df['Sentiment'], test_size=0.2, random_state=42)

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Define function to get BERT embeddings
def get_bert_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.numpy()

# Get BERT embeddings for training and validation sets
training_embeddings = get_bert_embeddings(training_input.tolist())
validation_embeddings = get_bert_embeddings(validation_input.tolist())

# Example text for prediction
example_text = ["By choosing a bike over a car, I'm reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I'm proud to be part of that movement."]
example_embeddings = get_bert_embeddings([preprocess_text(t) for t in example_text])

print("Training Embeddings Shape:", training_embeddings.shape)
print("Validation Embeddings Shape:", validation_embeddings.shape)
print("Example Embedding Shape:", example_embeddings.shape)



Training Embeddings Shape: (165, 768)
Validation Embeddings Shape: (42, 768)
Example Embedding Shape: (1, 768)


In [97]:
df = pd.read_csv('reviews_mixed.csv')

def get_training_and_validation_datas(df: pd.DataFrame, training_size = 0.8):
    data_size = df.shape[0]
    indexes = [i for i in range(data_size)]
    training_index = np.random.choice(indexes,int(data_size*training_size))
    validation_index = [i for i in range(data_size) if not i in training_index]
    training_input = [df['Text'].iloc[index] for index in training_index]
    training_output = [df['Sentiment'].iloc[index] for index in training_index]
    validation_input = [df['Text'].iloc[index] for index in validation_index]
    validation_output = [df['Sentiment'].iloc[index] for index in validation_index]
    return training_input, training_output, validation_input, validation_output

training_input,training_output,validation_input, validation_output = get_training_and_validation_datas(df)

#Bag of Words
vectorizer = CountVectorizer()

#TF-IDF
# vectorizer = TfidfVectorizer(max_features=50)

text = ["By choosing a bike over a car, I'm reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I'm proud to be part of that movement."]
training_embeddings = vectorizer.fit_transform(training_input).toarray()
validation_embeddings = vectorizer.transform(validation_input).toarray()


print("Training Embeddings Shape:", training_embeddings.shape)
print("Validation Embeddings Shape:", validation_embeddings.shape)


Training Embeddings Shape: (165, 446)
Validation Embeddings Shape: (98, 446)


## KMEANS

In [99]:
kmeans = KMeans(n_clusters=2, n_init=10)
kmeans.fit(training_embeddings)

label_names = [name for name in set(training_output)]
validation_indexes = kmeans.predict(validation_embeddings)
computed_outputs = [label_names[value] for value in validation_indexes]

accuracy = accuracy_score(validation_output, computed_outputs)
print(f'Accuracy: {accuracy*100}%')

Accuracy: 64.28571428571429%


In [100]:
text = ["By choosing a bike over a car, I'm reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I'm proud to be part of that movement."]
input =  vectorizer.transform(text).toarray()
label = kmeans.predict(input)
print('Label:', label_names[label[0]])

Label: negative


In [81]:
from sklearn.cluster import AgglomerativeClustering

agglomerative = AgglomerativeClustering(n_clusters=2)
agglomerative.fit(training_embeddings)

label_names = [name for name in set(training_output)]
validation_indexes = agglomerative.fit_predict(validation_embeddings)
computed_outputs = [label_names[value] for value in validation_indexes]

accuracy = accuracy_score(validation_output, computed_outputs)
print(f'Accuracy: {accuracy * 100}%')

Accuracy: 64.8936170212766%


In [None]:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2)
gmm.fit(training_embeddings)

label_names = [name for name in set(training_output)]
validation_indexes = gmm.predict(validation_embeddings)
computed_outputs = [label_names[value] for value in validation_indexes]

accuracy = accuracy_score(validation_output, computed_outputs)
print(f'Accuracy: {accuracy * 100}%')

Accuracy: 71.42857142857143%
