In [None]:
!pip install fasttext gensim



In [5]:
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="clarin-pl/fastText-kgr10",
    filename="kgr10.plain.cbow.dim100.neg10.bin"
)
repo = "clarin-pl/word2vec-kgr10"
basename = "skipgram.v300.m8.ns.mwe.w2v.gensim"

model_path2 = hf_hub_download(repo_id=repo, filename=basename)
vectors_path = hf_hub_download(repo_id=repo, filename=basename + ".vectors.npy")

In [11]:
import fasttext

model = fasttext.load_model(model_path)

In [9]:
from gensim.models import Word2Vec, KeyedVectors
model2 = KeyedVectors.load(model_path2, mmap='r')
print("Model loaded:", model2)

Model loaded: KeyedVectors<vector_size=300, 2283377 keys>


In [12]:
word = "samoch√≥d"
vector = model.get_word_vector(word)

print(vector.shape)
print(vector[:10])

(100,)
[-1.7518617   2.1546729   1.6832509  -0.25992498 -0.12307668 -2.2615018
  1.4332038  -1.0710886  -1.8803575   1.0211588 ]


In [10]:
print(model2['samoch√≥d'][:10])

[-0.041509 -0.029741  0.035125  0.004014 -0.034046 -0.032715 -0.051992
 -0.01177   0.050827  0.078601]


In [18]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.manifold import TSNE
import plotly.express as px
from collections import defaultdict, Counter

df = pd.read_csv("./tweets.csv", sep=';')

assert {"tweet", "annotation"}.issubset(df.columns), "Dataset must have tweet and annotation columns"
df['annotation'] = df['annotation'].astype(float)

def get_embedding(text, model):
    words = text.lower().split()
    vecs = []
    for w in words:
        try:
            v = model[w] if hasattr(model, "wv") is False else model.wv[w]
            vecs.append(v)
        except KeyError:
            continue
    if not vecs:
        return np.zeros(model.vector_size if hasattr(model, "vector_size") else model.get_dimension())
    return np.mean(vecs, axis=0)

word_ann = defaultdict(list)
word_counts = Counter()
for tweet, ann in zip(df['tweet'].astype(str), df['annotation']):
    toks = tweet.lower().split()
    for t in toks:
        word_ann[t].append(ann)
        word_counts[t] += 1

vocab = list(word_ann.keys())
print("Unique tokens:", len(vocab))

def ft_vector(w):
    try:
        return model.get_word_vector(w)
    except Exception:
        # fallback
        return np.zeros(model.get_dimension() if hasattr(model, "get_dimension") else model.vector_size)

def w2v_vector(w):
    try:
        if hasattr(model2, "wv"):
            return model2.wv[w]
        else:
            return model2[w]
    except Exception:
        return None

words = []
ft_vecs = []
w2v_vecs = []
avg_ann = []
counts = []

for w in tqdm(vocab, desc="Collecting per-word vectors"):
    words.append(w)
    counts.append(word_counts[w])
    avg_ann.append(float(np.mean(word_ann[w])))
    ft_vecs.append(ft_vector(w))
    w2v_vecs.append(w2v_vector(w))

ft_vecs = np.vstack(ft_vecs)  # (V, D)


Unique tokens: 654


Collecting per-word vectors: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 654/654 [00:00<00:00, 33202.30it/s]


In [28]:
ft_missing = np.sum([np.allclose(v, 0) for v in ft_vecs])
w2v_missing = np.sum([v is None for v in w2v_vecs])

print(f"fastText: {ft_missing} / {len(ft_vecs)} words missing ({ft_missing/len(ft_vecs)*100:.2f}%)")
print(f"Word2Vec: {w2v_missing} / {len(w2v_vecs)} words missing ({w2v_missing/len(w2v_vecs)*100:.2f}%)")

fastText: 0 / 654 words missing (0.00%)
Word2Vec: 309 / 654 words missing (47.25%)


In [19]:
# prepare Word2Vec present subset
w2v_present_mask = np.array([v is not None for v in w2v_vecs])
if w2v_present_mask.any():
    w2v_present = np.vstack([v for v in w2v_vecs if v is not None])
else:
    w2v_present = np.empty((0, model2.vector_size if hasattr(model2, "vector_size") else model2.get_dimension()))

# 2 wizualizacje
print("Running t-SNE for fastText (words)...")
tsne_fast = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=1000)
tsne_fast_2d = tsne_fast.fit_transform(ft_vecs)

df_words = pd.DataFrame({
    "word": words,
    "avg_annotation": avg_ann,
    "count": counts,
    "tsne_fast_x": tsne_fast_2d[:,0],
    "tsne_fast_y": tsne_fast_2d[:,1],
    "w2v_present": w2v_present_mask
})

fig_fast = px.scatter(
    df_words,
    x="tsne_fast_x",
    y="tsne_fast_y",
    color="avg_annotation",
    hover_data=["word", "avg_annotation", "count"],
    title="t-SNE of WORDS (FastText) ‚Äî color=avg_annotation (0..1)"
)
fig_fast.update_traces(marker=dict(size=6))
fig_fast.show()

# Word2Vec t-SNE only for words present in model2
if w2v_present.shape[0] > 0:
    print("Running t-SNE for Word2Vec (words present in model2)...")
    tsne_w2v = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=1000)
    tsne_w2v_2d = tsne_w2v.fit_transform(w2v_present)

    w2v_indices = [i for i, present in enumerate(w2v_present_mask) if present]
    df_w2v = pd.DataFrame({
        "word": [words[i] for i in w2v_indices],
        "avg_annotation": [avg_ann[i] for i in w2v_indices],
        "count": [counts[i] for i in w2v_indices],
        "tsne_w2v_x": tsne_w2v_2d[:,0],
        "tsne_w2v_y": tsne_w2v_2d[:,1],
    })

    fig_w2v = px.scatter(
        df_w2v,
        x="tsne_w2v_x",
        y="tsne_w2v_y",
        color="avg_annotation",
        hover_data=["word", "avg_annotation", "count"],
        title="t-SNE of WORDS (Word2Vec) ‚Äî color=avg_annotation (0..1)"
    )
    fig_w2v.update_traces(marker=dict(size=6))
    fig_w2v.show()
else:
    print("No words from corpus were present in Word2Vec model2 ‚Äî skipping Word2Vec t-SNE plot.")

Running t-SNE for fastText (words)...



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



Running t-SNE for Word2Vec (words present in model2)...



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



In [27]:
import random

def top_similar(word, model_obj, k=10):
    try:
        # FastText (Facebook library)
        if hasattr(model_obj, "get_nearest_neighbors"):
            return [(w, float(score)) for score, w in model_obj.get_nearest_neighbors(word, k)]
        # Gensim KeyedVectors / Word2Vec
        elif hasattr(model_obj, "most_similar"):
            return model_obj.most_similar(word, topn=k)
        elif hasattr(model_obj, "wv"):
            return model_obj.wv.most_similar(word, topn=k)
        else:
            return []
    except KeyError:
        return []

neg_words = set(" ".join(df.loc[df["annotation"] == 1, "tweet"].astype(str)).lower().split())
annotated_words = list(neg_words)
random_indexes = random.sample(range(len(annotated_words)), 10)
annotated_words = [annotated_words[i] for i in random_indexes]
sample_words = annotated_words[:10]

print("\nüîç Comparing k-most similar words (FastText vs Word2Vec):")
for w in sample_words:
    sim1 = top_similar(w, model)   # FastText
    sim2 = top_similar(w, model2)  # Word2Vec
    print(f"\nWord: {w}")
    if sim1:
        print("FastText:", [s[0] for s in sim1])
    else:
        print("FastText: No similar words found")
    if sim2:
        print("Word2Vec:", [s[0] for s in sim2])
    else:
        print("Word2Vec: No similar words found")


üîç Comparing k-most similar words (FastText vs Word2Vec):

Word: ko≈Ñ
FastText: ['rumak', 'konik', 'kozunak', 'kunak', 'konioptak', 'wilk', 'kozio≈Ç', 'dzik', 'noro≈ºec', 'baw√≥≈Ç']
Word2Vec: ['wierzchowiec', 'rumak', 'klacz', 'gniadosz', 'wa≈Çach', 'je≈∫dziec', 'w√≥z', 'karosz', 'osiod≈Çaƒá', 'siod≈Ço']

Word: daje
FastText: ['Daje', 'dadaje', 'dajes', 'dajemo', 'dajƒÖ\x82√Ç', 'dadzi', 'dajet', 'dajeƒá', 'pdaje', 'dajel']
Word2Vec: ['odczuwam', 'szukam', 'kryje', 'ukochanej', 'kochamy', 'boli', 'lubi', 'przera≈ºa', 'dostrzegamy', 'dumy']

Word: mam
FastText: ['Qmam', '1.mam', 'pMam', '¬≥am', 'qmam', 'nmam', '\x84Mam', 'p.s.mam', "mamy'", 'ƒòMam']
Word2Vec: No similar words found

Word: 5
FastText: ['6', '7', '8', '4', '3', '10', '2', '9', '1', '30']
Word2Vec: ['4', '3', '6', '2', '8', '10', '7', '9', '12', '15']

Word: ty.
FastText: ['ty≈Å', 'ty√Ç', 'ty\xad', 'tyA', 'tyM', 'ty2', 'ty√Ö', 'tyS', 'ty1', 'tyv']
Word2Vec: No similar words found

Word: ≈ºe
FastText: ['i≈º', 'i≈ºz', '≈ºe

In [26]:
neg_words

{'5',
 '?',
 '@anonymized_account',
 'a',
 'ale',
 'banda',
 'bandy',
 'bany.',
 'bardziej',
 'by',
 'bƒôdzie',
 'ciekawe',
 'ciekawy',
 'co',
 'czarny',
 'czego?',
 'czy',
 'daje',
 'dlatego',
 'do',
 'dor√≥wnam',
 'dupie',
 'du≈ºƒÖ',
 'dyskutujƒô',
 'dzisiaj',
 'dziƒôki',
 'euromendy',
 'generalnie,',
 'gratulujƒô',
 'g≈Çowƒô.',
 'g≈Çupocie.',
 'hipokryt√≥w',
 'i',
 'idƒÖ',
 'im',
 'innymi',
 'inteligentny',
 'jak',
 'jego',
 'jest',
 'jestem',
 'kadencja',
 'karczewski',
 'kogo?,',
 'korzysta≈Ç',
 'ko≈Ñ',
 'krystynƒÖ',
 'kwadratu,',
 'ma',
 'malkontenci',
 'mam',
 'media?',
 'medycznego.',
 'mentalno≈õciowo',
 'mi',
 'miƒôdzy',
 'mot≈Çochem',
 'murzyn',
 'muszƒÖ',
 'na',
 'nad',
 'nie',
 'niechƒôƒá',
 'nigdy',
 'nikt',
 'ni≈º',
 'no',
 'objƒÖƒá',
 'obowiƒÖzek.',
 'obro≈Ñcy.',
 'obywatelstwa',
 'oglƒÖdaƒáüòÅüòÄüòÄ',
 'pani',
 'perfidnego',
 'pisdzielskie',
 'pisowskim',
 'podobnie',
 'polak',
 'polski',
 'polskiego',
 'powrotu',
 'powstanie.',
 'pozbawiƒá',
 'po≈Çowa',
 'propagand

In [23]:
top_similar("murzyn", model2), top_similar("murzyn", model)

([('Murzyn', 0.7257713675498962),
  ('ne#Karolina_CYTAT', 0.6520460247993469),
  ('fuckme', 0.649269700050354),
  ('anarchol', 0.6492606997489929),
  ('narodowiec68', 0.6337197422981262),
  ('sloniu', 0.6273672580718994),
  ('veleda', 0.6270846128463745),
  ('cobryn', 0.6238105297088623),
  ('nitrobolonus', 0.623781144618988),
  ('kerio', 0.6231412887573242)],
 [('murzynek', 0.8260194659233093),
  ('czarnuch', 0.7775088548660278),
  ('ufoludek', 0.751975417137146),
  ('dzikusek', 0.750576913356781),
  ('chojar', 0.7500784993171692),
  ('czarnuchow', 0.748458206653595),
  ('chur', 0.742967426776886),
  ('chamon', 0.7424283027648926),
  ('krasnoludek', 0.7305992841720581),
  ('baran', 0.7302419543266296)])