In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sentence_transformers import SentenceTransformer
from utils import read_text_file, cosine_distance

In [2]:
st = SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens")

In [3]:
imdb = pd.read_csv("imdb/imdb.csv").rename(columns={"sentiment": "polarity"}).assign(polarity=lambda df: df["polarity"] == "positive")
imdb

Unnamed: 0,review,polarity
0,One of the other reviewers has mentioned that ...,True
1,A wonderful little production. <br /><br />The...,True
2,I thought this was a wonderful way to spend ti...,True
3,Basically there's a family where a little boy ...,False
4,"Petter Mattei's ""Love in the Time of Money"" is...",True
...,...,...
49995,I thought this movie did a down right good job...,True
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",False
49997,I am a Catholic taught in parochial elementary...,False
49998,I'm going to have to disagree with the previou...,False


In [4]:
allo = pd.read_json("allocine/train_00.jsonl", lines=True)[["review", "polarity"]].astype({"polarity": "bool"}).sample(10000, random_state=42)
allo

Unnamed: 0,review,polarity
47044,c'est rare mais ca arrive cette suite est carr...,True
44295,"Avec ""Green Room"", Jeremy SAULNIER livre un su...",False
74783,"J'adore, comme vous le voyez le cinéma américa...",False
70975,définitivement un chef d'oeuvre du genre.,True
46645,"Avec ""La proie"" Eric Valette réussit un polar/...",True
...,...,...
58934,"Un merveilleux film, tout simplement. Une hist...",True
9155,Un des meilleurs films de son réalisateur! Con...,True
56757,Un film assez sombre qui est malheureusement a...,False
50663,"Un Aladin à la fois virevoltant et touchant, p...",False


In [None]:
# X_imdb = st.encode(imdb.review, batch_size=64, show_progress_bar=True)
# np.save("X_imdb.npy", X_imdb)
X_imdb = np.load("X_imdb.npy")

Batches:   0%|          | 0/782 [00:00<?, ?it/s]

In [None]:
# X_allo = st.encode(allo.review.values, batch_size=64, show_progress_bar=True)
# np.save("X_allo.npy", X_allo)
X_allo = np.load("X_allo.npy")

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_imdb, imdb.polarity, test_size=0.3, shuffle=True)

In [9]:
clf = LogisticRegression().fit(X_train, y_train)

In [25]:
from sklearn.metrics import roc_auc_score

In [26]:
print("train acc:", roc_auc_score(y_train, clf.predict(X_train)))
print("test acc:", roc_auc_score(y_test, clf.predict(X_test)))
print("french acc:", roc_auc_score(allo.polarity, clf.predict(X_allo)))

train acc: 0.8312599252924923
test acc: 0.825928204480808
french acc: 0.831191299060785


In [15]:
embedding_names = [
   "distiluse-base-multilingual-cased-v2",
   "xlm-r-distilroberta-base-paraphrase-v1",
   "xlm-r-bert-base-nli-stsb-mean-tokens",
   "distilbert-multilingual-nli-stsb-quora-ranking",
]

In [14]:
imdb_sample = imdb.sample(5000)

In [19]:
# embs = [SentenceTransformer(name).encode(imdb_sample.review.tolist(), show_progress_bar=True, device="cpu") for name in embedding_names]