In [1]:
from transformers import AutoModel
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Two sample activity descriptions; the cells below encode both with each model
# and print the cosine similarity of the resulting pair of vectors.
# NOTE(review): text1 mentions "Madrid" while text2 mentions "Paris" — confirm the
# city mismatch is intentional for this comparison.
text1 = "Belleville is home to many of Madrid's most famous artists and musicians. With an art guide, walk the tourist-friendly streets to see murals and other art on the sides of buildings."
text2 = "The Orsay Museum is one of Paris’s most popular museums. Enjoy skip-the-line entrance and follow your guide on a tour of the museum's extensive exhibitions. See masterpieces like Monet's ‘Blue Water Lilies,’ Cézanne’S ‘Apples and Oranges’ and Renoir’’"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Candidate 1: jina-embeddings-v2-base-en loaded through transformers' AutoModel.
# NOTE(review): trust_remote_code=True lets model code shipped in the Hub repo run
# locally — consider pinning a `revision`; confirm this is acceptable here.
model = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v2-base-en",
    trust_remote_code=True,
)
embeddings = model.encode([text1, text2])

# Cosine similarity between the two sample descriptions.
first_vec, second_vec = embeddings[0], embeddings[1]
print(cos_sim(first_vec, second_vec))

tensor([[0.8036]])


In [3]:
# Candidate 2: thenlper/gte-large via sentence-transformers, scored on the same pair.
# This rebinds `model` and `embeddings` from the previous cell, so later cells see
# whichever of the two cells ran last.
model = SentenceTransformer("thenlper/gte-large")
embeddings = model.encode([text1, text2])

# Cosine similarity between the two sample descriptions.
first_vec, second_vec = embeddings[0], embeddings[1]
print(cos_sim(first_vec, second_vec))

tensor([[0.8191]])


In [4]:
# Shape of the embedding matrix from the most recently executed encode cell
# (one row per input text). `embeddings` is assigned by both model cells above,
# so the value depends on which of them ran last.
embeddings.shape

(2, 1024)

In [1]:
import pandas as pd
# Load the pickled DataFrame from ../tmp.
# NOTE(review): the variable is named "embeddings", but the columns displayed below
# are text fields (translated / summarized descriptions) — confirm the naming.
# Unpickling can execute arbitrary code: only read pickle files from a trusted source.
combined_embeddings = pd.read_pickle("../tmp/product_textual_lang_summarized.pickle")


In [2]:
# Display the frame; pandas truncates the rendering to the first and last rows.
# NOTE(review): `.head()` would keep the saved output smaller if only a preview is needed.
combined_embeddings

Unnamed: 0,PRODUCTCODE,pdt_inclexcl_ENG_CONTENT,pdt_product_detail_PRODUCTDESCRIPTION_translated,pdt_product_detail_PRODUCTDESCRIPTION_SUMMARIZED
0,100123P1,,Our adventure starts off in La Cerdanya valley...,This route through Catalonia will be accompani...
1,100123P2,,"In La Cerdanya, one of the largest valleys in ...",We offer you a week of active holidays and swi...
2,100123P3,,We have planned this trip for those who love c...,"The route runs through La Garrotxa, the famous..."
3,100123P4,,"In La Cerdanya, one of the largest valleys in ...",We offer you a week of active holidays and swi...
4,100123P5,,"In La Cerdanya, one of the largest valleys in ...",We offer you a week of active holidays and swi...
...,...,...,...,...
21257,9973P33,,Get Ready to Sparkle in Paris: Your New Year'...,Paris NYE Bar Crawl is the ultimate way to wel...
21258,9973P35,VIP Entrance to the Club,Get ready for the ultimate party experience i...,Party Bus Paris France will take you on a nigh...
21259,9973P4,Skip the line; Visit 3/4 bars Clubs ( 4 Venues...,Riviera Bar Crawl Paris presents the ultimate ...,The Paris pub crawl Latin Quarter is great fun...
21260,9974P1,The Ice Bar Experience Admission Ticket ; One ...,Experience the first ice bar at the beach in t...,Experience the first ice bar at the beach in t...


In [1]:
#!/usr/bin/env python
# coding: utf-8

import gc

import pandas as pd
from deep_translator import GoogleTranslator
from langdetect import detect
from tqdm import tqdm

# Force a garbage-collection pass before any data is loaded in this cell.
# NOTE(review): nothing has been allocated here yet — presumably a leftover from an
# interactive session where earlier objects were still alive; confirm it is needed.
gc.collect()


def detect_language(text):
    """Return the language code that ``detect`` reports for ``text``.

    Missing values (``None``, ``NaN``, ``pd.NA``) and blank strings yield ``""``
    without calling the detector. Converting them with ``str`` first would hand
    the literal words ``"None"`` / ``"nan"`` to the detector as if they were
    real content, tagging empty rows with a spurious language code.

    Any exception raised by ``detect`` also yields ``""``, so callers can treat
    the empty string as "language unknown".

    Args:
        text: Value to classify. Non-string values are converted with ``str``.

    Returns:
        str: The value returned by ``detect``, or ``""`` when the input is
        missing, blank, or detection fails.
    """
    # Scalar guard first: pd.isna on a list/array returns an array, not a bool.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    as_text = str(text)
    if not as_text.strip():
        # Nothing to classify; skip the detector call entirely.
        return ""

    try:
        return detect(as_text)
    except Exception:
        # Best-effort by design: an undetectable input is reported as unknown
        # instead of aborting a long batch run.
        return ""


def translate_text(text):
    """Translate ``text`` to English, falling back to the input on any error.

    A ``GoogleTranslator`` is created per call with automatic source-language
    detection and English as the target.

    Args:
        text: The value handed to ``GoogleTranslator.translate``.

    Returns:
        The translator's result, or ``text`` unchanged when constructing the
        translator or translating raises; the error is printed in that case.
    """
    translator_options = {"source": "auto", "target": "en"}
    try:
        translated = GoogleTranslator(**translator_options).translate(text)
    except Exception as err:
        # Best-effort: report the failure and keep the untranslated input.
        print(f"Translation error: {err}")
        translated = text
    return translated



In [3]:
# Load the pickled DataFrame whose `pdt_inclexcl_ENG_CONTENT` column is
# language-tagged in the cells below.
# Unpickling can execute arbitrary code: only read pickle files from a trusted source.
df = pd.read_pickle("../tmp/product_textual.pickle")

In [4]:
# Run detect_language over every `pdt_inclexcl_ENG_CONTENT` value (with a progress
# bar) and store the resulting codes in a sibling `_lang` column on the same frame.
df["pdt_inclexcl_ENG_CONTENT_lang"] = list(
    map(detect_language, tqdm(df["pdt_inclexcl_ENG_CONTENT"]))
)

100%|██████████| 21261/21261 [00:41<00:00, 517.56it/s] 


In [7]:
# Rows tagged "en" or "" (no language detected) are left out; every other row is
# written to a CSV in the working directory for inspection.
english_or_unknown = df["pdt_inclexcl_ENG_CONTENT_lang"].isin(["en", ""])
df[~english_or_unknown].to_csv("inclexcl_not_eng.csv")