In [1]:
from gensim import corpora, models
import numpy as np
import spacy
# Load spaCy's small Polish pipeline; the "pl_core_news_sm" model package
# must already be installed in the kernel's environment.
nlp = spacy.load("pl_core_news_sm")

In [2]:
# Toy Polish corpus: the first three sentences are about vehicles and buying
# them, the last three are about the weather.
corp = [
    # NOTE(review): "napedem" looks like a typo for "napędem"; the recorded
    # lemma output keeps the token verbatim. Left as-is so that the outputs
    # stored in this notebook stay consistent with the code.
    "Samochód z napedem na cztery koła",
    "Jan kupił nowy samochód",
    "Adrian kupił nowy motor",
    "Jest ciepło i nie pada",
    "Już dawno nie było zimno",
    "Jutro ma padać i być zimno"
]

In [3]:
# Run every sentence through the spaCy pipeline in one batch. The "ner"
# component is disabled because only lemmas are used afterwards.
parsed_stream = nlp.pipe(corp, disable=["ner"])
doc_corp = list(parsed_stream)

In [4]:
# Show the parsed documents (one spaCy Doc per input sentence).
doc_corp

[Samochód z napedem na cztery koła,
 Jan kupił nowy samochód,
 Adrian kupił nowy motor,
 Jest ciepło i nie pada,
 Już dawno nie było zimno,
 Jutro ma padać i być zimno]

In [5]:
# Normalise the corpus: replace each parsed document with the list of its
# token lemmas (no stop-word or punctuation filtering is applied).
norm_corp = [
    [tok.lemma_ for tok in parsed_doc]
    for parsed_doc in doc_corp
]

In [6]:
# Inspect the lemmatised corpus (list of lemma lists, one per sentence).
norm_corp

[['samochód', 'z', 'napedem', 'na', 'cztery', 'kół'],
 ['Jany', 'kupić', 'nowy', 'samochód'],
 ['Adriana', 'kupić', 'nowy', 'motor'],
 ['być', 'ciepło', 'i', 'nie', 'padać'],
 ['już', 'dawno', 'nie', 'być', 'zimno'],
 ['jutro', 'mieć', 'padać', 'i', 'być', 'zimno']]

In [7]:
# Build the lemma -> integer id vocabulary directly from the lemmatised
# corpus; passing the documents to the constructor registers them the same
# way a separate add_documents() call would.
dict1 = corpora.Dictionary(norm_corp)

In [8]:
# Show the vocabulary as a token -> integer id mapping.
dict1.token2id

{'cztery': 0,
 'kół': 1,
 'na': 2,
 'napedem': 3,
 'samochód': 4,
 'z': 5,
 'Jany': 6,
 'kupić': 7,
 'nowy': 8,
 'Adriana': 9,
 'motor': 10,
 'być': 11,
 'ciepło': 12,
 'i': 13,
 'nie': 14,
 'padać': 15,
 'dawno': 16,
 'już': 17,
 'zimno': 18,
 'jutro': 19,
 'mieć': 20}

In [9]:
# Encode each lemmatised document as a sparse bag-of-words vector:
# a list of (token_id, count) pairs using the ids from dict1.
bow_corp = [dict1.doc2bow(lemmas) for lemmas in norm_corp]
bow_corp

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(4, 1), (6, 1), (7, 1), (8, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)],
 [(11, 1), (14, 1), (16, 1), (17, 1), (18, 1)],
 [(11, 1), (13, 1), (15, 1), (18, 1), (19, 1), (20, 1)]]

In [10]:
# Fit a TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(bow_corp)

# ... and apply it. The transformed corpus is evaluated lazily, so it is
# materialised with list() only for display.
tfidf_corp = tfidf[bow_corp]
list(tfidf_corp)

[[(0, 0.43129300109336677),
  (1, 0.43129300109336677),
  (2, 0.43129300109336677),
  (3, 0.43129300109336677),
  (4, 0.26444609287978765),
  (5, 0.43129300109336677)],
 [(4, 0.4203338818488622),
  (6, 0.6855350343414989),
  (7, 0.4203338818488622),
  (8, 0.4203338818488622)],
 [(7, 0.3696140760810379),
  (8, 0.3696140760810379),
  (9, 0.602814594019389),
  (10, 0.602814594019389)],
 [(11, 0.25633990727938405),
  (12, 0.6626290477555459),
  (13, 0.4062891404761619),
  (14, 0.4062891404761619),
  (15, 0.4062891404761619)],
 [(11, 0.22710694740248566),
  (14, 0.35995599528619177),
  (16, 0.5870629426886774),
  (17, 0.5870629426886774),
  (18, 0.35995599528619177)],
 [(11, 0.21368506455142705),
  (13, 0.3386828142781915),
  (15, 0.3386828142781915),
  (18, 0.3386828142781915),
  (19, 0.5523678788296186),
  (20, 0.5523678788296186)]]

In [11]:
 lsi = models.LsiModel(bow_corp, id2word=dict1, num_topics=2)

In [12]:
# Print the two topics of the current `lsi` model, each followed by a
# 50-dash separator line.
for index, topic in lsi.show_topics(num_topics=2):
    print("Index", index)
    print(topic, "-" * 50, sep="\n")

Index 0
-0.541*"być" + -0.389*"padać" + -0.389*"i" + -0.359*"zimno" + -0.334*"nie" + -0.207*"jutro" + -0.207*"mieć" + -0.181*"ciepło" + -0.152*"dawno" + -0.152*"już"
--------------------------------------------------
Index 1
-0.494*"samochód" + -0.368*"kupić" + -0.368*"nowy" + -0.280*"kół" + -0.280*"cztery" + -0.280*"na" + -0.280*"z" + -0.280*"napedem" + -0.213*"Jany" + -0.155*"motor"
--------------------------------------------------


In [13]:
# Refit LSI on the TF-IDF weighted corpus. This rebinds `lsi`, so the model
# trained on raw counts is no longer reachable from here on.
lsi = models.LsiModel(
    tfidf_corp,
    id2word=dict1,
    num_topics=2,
)

In [14]:
# Print the two topics of the current `lsi` model, each followed by a
# 50-dash separator line.
for index, topic in lsi.show_topics(num_topics=2):
    print("Index", index)
    print(topic, "-" * 50, sep="\n")

Index 0
0.379*"padać" + 0.379*"i" + 0.354*"nie" + 0.342*"ciepło" + 0.330*"być" + 0.313*"zimno" + 0.276*"mieć" + 0.276*"jutro" + 0.235*"już" + 0.235*"dawno"
--------------------------------------------------
Index 1
0.471*"nowy" + 0.471*"kupić" + 0.420*"Jany" + 0.348*"Adriana" + 0.348*"motor" + 0.312*"samochód" + 0.089*"napedem" + 0.089*"z" + 0.089*"cztery" + 0.089*"na"
--------------------------------------------------


In [15]:
# Project every TF-IDF document into the 2-topic LSI space and print the
# resulting sparse (topic_id, weight) vector.
# Iterating the transformed corpus directly replaces the range(len(...)) +
# indexing pattern: it yields the same vectors in the same order and also
# works for streamed corpora that do not support __getitem__.
for tfidf_doc in tfidf_corp:
    print(lsi[tfidf_doc])

[(1, 0.27467631341167215)]
[(1, 0.8154772433076705)]
[(1, 0.7678255382593432)]
[(0, 0.7635066840190263)]
[(0, 0.5906874800527531)]
[(0, 0.7386157639023291)]


In [16]:
# Term-topic matrix of the fitted model: one row per dictionary id, one
# column per topic (the non-negligible entries match the topic weights
# printed by show_topics).
lsi.projection.u

array([[ 1.22876939e-16,  8.90717392e-02],
       [-2.27796465e-17,  8.90717392e-02],
       [ 7.08853371e-17,  8.90717392e-02],
       [-3.41297217e-18,  8.90717392e-02],
       [ 3.58775324e-16,  3.12336718e-01],
       [ 9.11904718e-18,  8.90717392e-02],
       [ 3.96910521e-16,  4.20327508e-01],
       [ 6.42976912e-16,  4.71104428e-01],
       [ 6.13277924e-16,  4.71104428e-01],
       [ 6.81159772e-16,  3.48010721e-01],
       [ 7.51585300e-16,  3.48010721e-01],
       [ 3.30103680e-01, -5.27069902e-16],
       [ 3.42438860e-01, -7.05206727e-16],
       [ 3.79286674e-01, -5.54213801e-16],
       [ 3.53880706e-01, -7.00115802e-16],
       [ 3.79286674e-01, -5.49644028e-16],
       [ 2.34715712e-01, -3.60947679e-16],
       [ 2.34715712e-01, -3.58880524e-16],
       [ 3.13236528e-01, -3.21833871e-16],
       [ 2.76151062e-01, -1.41165768e-16],
       [ 2.76151062e-01, -1.87075627e-16]])

In [17]:
# Singular values of the decomposition, one per retained topic.
lsi.projection.s

array([1.21548649, 1.15325898])

In [18]:
# Push an unseen sentence through the same preprocessing chain as the
# training corpus: parse -> lemmatise -> bag-of-words -> TF-IDF.
parsed_new_doc = nlp("Wczoraj mocno padało")
new_doc = [tok.lemma_ for tok in parsed_new_doc]
# doc2bow is called without allow_update, so dict1 itself is not extended
# with any lemma it has not seen before.
new_bow = dict1.doc2bow(new_doc)
new_tfidf = tfidf[new_bow]

In [19]:
# Project the new document into the LSI topic space.
lsi[new_tfidf]

[(0, 0.37928667414787054)]

In [20]:
# Fold the new document into the existing model, then project it again.
# NOTE: add_documents updates `lsi` in place, so this cell is not
# idempotent: running it again adds the same document one more time.
lsi.add_documents([new_tfidf])
lsi[new_tfidf]

[(0, 0.689056561421721)]

In [21]:
# Re-project the original corpus with the updated model so the results can
# be compared with the projections printed before add_documents().
list(lsi[tfidf_corp])

[[(1, 0.2746763134116721)],
 [(1, 0.8154772433076705)],
 [(1, 0.7678255382593432)],
 [(0, 0.757262494024756)],
 [(0, 0.46264487550352157)],
 [(0, 0.7112663204744019)]]

In [22]:
# Persist the model under the relative path "lsi.model", which resolves
# against the kernel's current working directory.
lsi.save("lsi.model")

In [23]:
# Reload the saved model into a separate name; `lsi` itself is left untouched.
load_lsi = models.LsiModel.load("lsi.model")