In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
import torch

# Report whether CUDA is usable. The device name is queried only when a GPU is
# actually present, because torch.cuda.get_device_name fails on CPU-only machines.
cuda_available = torch.cuda.is_available()
print("GPU доступний:", cuda_available)
if cuda_available:
    print("Назва GPU:", torch.cuda.get_device_name(0))

GPU доступний: True
Назва GPU: Quadro P2000


In [3]:
# Seed NumPy's global random generator so that code drawing from it is repeatable.
np.random.seed(seed=2025)

In [4]:
def _read_stripped_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace removed.

    Blank lines are kept (as empty strings), so the result has one entry per
    line of the file.
    """
    # The context manager guarantees the handle is closed once the lines are read.
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm the dataset's encoding and pin it explicitly.
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


positive_examples = _read_stripped_lines("data/rt-polarity.pos")
negative_examples = _read_stripped_lines("data/rt-polarity.neg")

In [5]:
# Assemble one labelled frame: negative reviews (label 0) first, then positive
# reviews (label 1). The 'lable' column name is spelled this way on purpose,
# because the train/test split reads the column under that exact name.
all_texts = negative_examples + positive_examples
all_labels = [0] * len(negative_examples) + [1] * len(positive_examples)
df = pd.DataFrame({'text': all_texts, 'lable': all_labels}).reset_index(drop=True)

In [6]:
# Stratified split so both parts keep the same class balance as the full data.
# random_state is pinned so that re-running this cell always reproduces the
# same split instead of depending on how much of NumPy's global random stream
# has already been consumed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['lable'],
    stratify=df['lable'],
    random_state=2025,
)


<div align="center">

# **FAISS**

</div>



In [7]:
# Use the GPU when one is available and fall back to the CPU otherwise, rather
# than failing on a hard-coded device='cuda'.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=embedding_device)

# normalize_embeddings=True makes encode() return unit-length vectors, so the
# inner product between two embeddings is their cosine similarity and no
# separate faiss.normalize_L2 pass is needed.
train_embeddings = model.encode(X_train.tolist(), normalize_embeddings=True, show_progress_bar=True)
test_embeddings = model.encode(X_test.tolist(), normalize_embeddings=True, show_progress_bar=True)

Batches:   0%|          | 0/250 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Batches:   0%|          | 0/84 [00:00<?, ?it/s]

In [8]:
# Sanity check: the two numbers printed should match (one embedding per training text).
n_train_texts, n_train_vectors = len(X_train), len(train_embeddings)
print(n_train_texts, n_train_vectors)

7996 7996


In [9]:
# Build an inner-product FAISS index over the training embeddings.
# d is the embedding dimensionality (number of columns of the matrix).
_, d = train_embeddings.shape
index = faiss.IndexFlatIP(d)
index.add(np.asarray(train_embeddings, dtype=np.float32))  # FAISS works on float32
print(f"Number of vectors in the index: {index.ntotal}")

Number of vectors in the index: 7996


In [10]:
# Number of nearest training neighbours retrieved for every test review.
k = 5

# distances[i] / indices[i] hold the scores and the positions (in insertion
# order of the index) of the k best matches for test embedding i.
distances, indices = index.search(
    np.asarray(test_embeddings, dtype=np.float32),
    k,
)


In [11]:
# Majority vote over the labels of the retrieved neighbours of each test review.
# FAISS returns positions in the order vectors were added to the index, so the
# labels are looked up positionally. They are materialised once as an array
# instead of rebuilding y_train.tolist() for every single neighbour.
train_labels = y_train.to_numpy()

predictions = []
for neighbor_idxs in indices:
    neighbor_labels = train_labels[neighbor_idxs].tolist()
    # Most frequent label among the neighbours wins.
    pred = max(set(neighbor_labels), key=neighbor_labels.count)
    predictions.append(pred)


In [12]:
# Empty comparison table; each evaluated method appends one row of scores to it.
metric_columns = ['Method', 'Accuracy', 'Precision', 'Recall', 'F1']
metrics_df = pd.DataFrame(columns=metric_columns)

In [13]:
# Score the FAISS nearest-neighbour predictions against the held-out labels.
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions, average='binary')
rec = recall_score(y_test, predictions, average='binary')
f1 = f1_score(y_test, predictions, average='binary')

new_row = pd.DataFrame([{
    'Method': 'FAISS',
    'Accuracy': acc,
    'Precision': prec,
    'Recall': rec,
    'F1': f1
}])

# Replace any earlier 'FAISS' row instead of appending blindly, so re-running
# this cell does not duplicate it. pd.concat is skipped when no other rows
# exist, because concatenating with an empty frame emits a pandas FutureWarning.
previous_rows = metrics_df[metrics_df['Method'] != 'FAISS']
if previous_rows.empty:
    metrics_df = new_row
else:
    metrics_df = pd.concat([previous_rows, new_row], ignore_index=True)

display(metrics_df)

  metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)


Unnamed: 0,Method,Accuracy,Precision,Recall,F1
0,FAISS,0.6988,0.671189,0.779445,0.721277



<div align="center">

# **BART Zero-shot**

</div>

In [14]:
from transformers import pipeline

In [15]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning,
                        module="torch.utils.data")

In [16]:
# Zero-shot sentiment classification: the MNLI-tuned BART model scores every
# test review against the two candidate label names, with no training on this data.
zero_shot_classifier = pipeline("zero-shot-classification",
                                model="facebook/bart-large-mnli")

candidate_labels = ["positive", "negative"]
all_results = zero_shot_classifier(
    X_test.tolist(),
    candidate_labels,
    multi_label=False
)

# Each result lists the candidate labels ordered by likelihood; the first one
# is the prediction. Map it onto the dataset's 0/1 encoding (1 = positive).
preds = [1 if res['labels'][0] == 'positive' else 0 for res in all_results]

acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)
rec = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

new_row = pd.DataFrame([{
    'Method': 'BART Zero-shot',
    'Accuracy': acc,
    'Precision': prec,
    'Recall': rec,
    'F1': f1
}])

# Replace any earlier 'BART Zero-shot' row instead of appending blindly, so
# re-running this (slow) cell does not duplicate it in the comparison table.
previous_rows = metrics_df[metrics_df['Method'] != 'BART Zero-shot']
if previous_rows.empty:
    metrics_df = new_row
else:
    metrics_df = pd.concat([previous_rows, new_row], ignore_index=True)

display(metrics_df)

# Show the first five test reviews with their predicted label and its score.
for review, res in zip(X_test.iloc[:5], all_results[:5]):
    print("-"*50)
    print(f"Review: {review[:100]}...")
    print(
        f"Predicted: {res['labels'][0]} with confidence {res['scores'][0]:.4f}")

Device set to use cuda:0


Unnamed: 0,Method,Accuracy,Precision,Recall,F1
0,FAISS,0.6988,0.671189,0.779445,0.721277
1,BART Zero-shot,0.810953,0.872417,0.728432,0.793949


--------------------------------------------------
Review: the title helpfully offers the most succinct review of it you'll read anywhere ....
Predicted: positive with confidence 0.8786
--------------------------------------------------
Review: even on those rare occasions when the narrator stops yammering , miller's hand often feels unsure ....
Predicted: negative with confidence 0.9184
--------------------------------------------------
Review: a really funny fifteen-minute short stretched beyond its limits to fill an almost feature-length fil...
Predicted: positive with confidence 0.9608
--------------------------------------------------
Review: equlibrium could pass for a thirteen-year-old's book report on the totalitarian themes of 1984 and f...
Predicted: negative with confidence 0.9778
--------------------------------------------------
Review: the filmmakers try to balance pointed , often incisive satire and unabashed sweetness , with results...
Predicted: positive with confidenc


<div align="center">

# **TF-IDF + Logistic Regression**

</div>

In [17]:
# Learn the TF-IDF vocabulary from the training texts only (min_df=5 drops
# terms found in fewer than 5 training documents), then map both splits into it.
tfidf_vectorizer = TfidfVectorizer(min_df=5)
X_train_tfidf_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf_vectorized = tfidf_vectorizer.transform(X_test)

vocabulary_size = len(tfidf_vectorizer.get_feature_names_out())
print('Number of features = {:,}'.format(vocabulary_size))
print('Shape of X_train_vectorized:', X_train_tfidf_vectorized.shape)

Number of features = 3,635
Shape of X_train_vectorized: (7996, 3635)


In [18]:
# Rank vocabulary terms by the largest TF-IDF weight each one reaches in any
# training document, then show the ten lowest- and ten highest-ranked terms.
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
max_weight_per_term = X_train_tfidf_vectorized.max(axis=0).toarray()[0]
sorted_tfidf_index = max_weight_per_term.argsort()

print('Smallest tfidf:\n', feature_names[sorted_tfidf_index[:10]])
print('Largest tfidf:\n', feature_names[sorted_tfidf_index[-10:]])

Smallest tfidf:
 ['cia' 'private' 'willis' 'ponder' 'specific' 'pulling' 'carefully'
 'homes' 'cost' 'came']
Largest tfidf:
 ['reality' 'silly' 'tasty' 'calculated' 'retro' 'satisfying' 'out'
 'disappointment' 'shallow' 'cinematic']


In [19]:
# Train logistic regression on the TF-IDF features, then label the test split.
# Note: this reassigns `predictions`, which previously held the FAISS results.
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf_vectorized, y_train)
predictions = clf.predict(X_test_tfidf_vectorized)

In [20]:
# Score the TF-IDF + logistic regression predictions against the held-out labels.
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

new_row = pd.DataFrame([{
    'Method': 'TF-IDF + Logistic Regression',
    'Accuracy': acc,
    'Precision': prec,
    'Recall': rec,
    'F1': f1
}])

# Replace any earlier row for this method instead of appending blindly, so
# re-running the cell does not duplicate it in the comparison table.
previous_rows = metrics_df[metrics_df['Method'] != 'TF-IDF + Logistic Regression']
if previous_rows.empty:
    metrics_df = new_row
else:
    metrics_df = pd.concat([previous_rows, new_row], ignore_index=True)

display(metrics_df)

Unnamed: 0,Method,Accuracy,Precision,Recall,F1
0,FAISS,0.6988,0.671189,0.779445,0.721277
1,BART Zero-shot,0.810953,0.872417,0.728432,0.793949
2,TF-IDF + Logistic Regression,0.764816,0.759178,0.775694,0.767347


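BART Zero-shot showed the best results in terms of accuracy (0.81) and F1-score (0.79), driven by the highest precision (0.87). Overall, this makes it the best at identifying whether a review is positive or negative, although its recall (0.73) is the lowest of the three methods, so it misses more positive reviews than the others. This method was also the slowest because the model is large and processing texts takes a long time.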

TF-IDF with Logistic Regression showed slightly lower accuracy, but it works very quickly, making it convenient for processing large numbers of reviews.

The FAISS approach (a majority vote over the k = 5 nearest training reviews in sentence-embedding space) had the lowest accuracy, although it is also quite fast. It can be used if you need a quick search for similar texts rather than maximum classification accuracy.

Overall, if accuracy is a priority, BART is the best choice. If speed is important, TF-IDF with Logistic Regression is the best option.
