https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
https://www.machinelearningplus.com/nlp/gensim-tutorial/

In [1]:
!pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/d7/b9/6c93685bed0026b6a1cce55ab173f6b617f6db0d1325d25489c2fd43e711/gensim-3.7.1-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K    100% |████████████████████████████████| 24.2MB 15kB/s eta 0:00:011   68% |█████████████████████▉          | 16.5MB 3.7MB/s eta 0:00:03:01
Collecting smart-open>=1.7.0 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/ff/c8/de7dcf34d4b5f2ae94fe1055e0d6418fb97a63c9dc3428edd264704983a2/smart_open-1.8.0.tar.gz (40kB)
[K    100% |████████████████████████████████| 40kB 8.1MB/s eta 0:00:01
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/nbuser/.cache/pip/wheels/f7/a6/ff/9ab5842c14e50e95a06a4675b0b4a689c9cab6064dac2b01d0
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.7.1 smart-open-1.8.0


In [2]:
import nltk
nltk.download('stopwords')  # run once
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/nbuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from gensim.models import LdaModel, LdaMulticore
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
from nltk.corpus import stopwords
import re
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
stop_words = stopwords.words('english')
stop_words = stop_words + ['com', 'edu', 'subject', 'lines', 'organization', 'would', 'article', 'could','also','many']


In [4]:
# Step 1: Import the dataset and get the text and real topic of each news article
dataset = api.load("text8")
data = [d for d in dataset]

2019-04-05 08:30:50,142 : INFO : Creating /home/nbuser/gensim-data




2019-04-05 08:31:07,817 : INFO : text8 downloaded


In [5]:
data[2]

['with',
 'the',
 'aegis',
 'of',
 'zeus',
 'when',
 'he',
 'goes',
 'to',
 'the',
 'battlefield',
 'the',
 'entire',
 'trojan',
 'army',
 'flees',
 'behind',
 'the',
 'walls',
 'of',
 'troy',
 'achilles',
 'wrath',
 'is',
 'terrible',
 'and',
 'he',
 'slays',
 'many',
 'trojan',
 'warriors',
 'and',
 'allies',
 'including',
 'priam',
 's',
 'son',
 'lycaon',
 'whom',
 'achilles',
 'had',
 'previously',
 'captured',
 'and',
 'sold',
 'into',
 'slavery',
 'but',
 'who',
 'had',
 'been',
 'returned',
 'to',
 'troy',
 'eventually',
 'hector',
 'comes',
 'out',
 'of',
 'the',
 'walls',
 'to',
 'defend',
 'the',
 'honour',
 'of',
 'troy',
 'he',
 'asked',
 'achilles',
 'to',
 'agree',
 'that',
 'the',
 'body',
 'of',
 'the',
 'loser',
 'would',
 'be',
 'returned',
 'for',
 'proper',
 'burial',
 'by',
 'the',
 'winner',
 'achilles',
 'rejected',
 'this',
 'arrangement',
 'saying',
 'though',
 'twenty',
 'ransoms',
 'and',
 'thy',
 'weight',
 'in',
 'gold',
 'were',
 'offered',
 'i',
 'would'

In [6]:
# Step 3: Create the Inputs of LDA model: Dictionary and Corpus
data_processed = []
data_processed = [['India','motherland','rich','heritage','history','rich','diverse','culture','landscapes'],
                    ['Celebrate','rich','Republic','day','january','independence','august']]
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(line) for line in data_processed]

2019-04-05 08:33:48,873 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-04-05 08:33:48,874 : INFO : built Dictionary(14 unique tokens: ['India', 'culture', 'diverse', 'heritage', 'history']...) from 2 documents (total 16 corpus positions)


In [7]:
data_processed[0]

['India',
 'motherland',
 'rich',
 'heritage',
 'history',
 'rich',
 'diverse',
 'culture',
 'landscapes']

In [8]:
data_processed[1]

['Celebrate', 'rich', 'Republic', 'day', 'january', 'independence', 'august']

In [9]:
dct.token2id

{'India': 0,
 'culture': 1,
 'diverse': 2,
 'heritage': 3,
 'history': 4,
 'landscapes': 5,
 'motherland': 6,
 'rich': 7,
 'Celebrate': 8,
 'Republic': 9,
 'august': 10,
 'day': 11,
 'independence': 12,
 'january': 13}

In [10]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2)],
 [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]]

In [11]:
# Step 4: Train the LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=2,
                         passes=10,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model
# lda_model.save('lda_model.model')
# See the topics
lda_model.print_topics(-1)

2019-04-05 08:33:49,492 : INFO : using asymmetric alpha [0.63060194, 0.36939806]
2019-04-05 08:33:49,504 : INFO : using symmetric eta at 0.5
2019-04-05 08:33:49,510 : INFO : using serial LDA version on this node
2019-04-05 08:33:49,516 : INFO : running online LDA training, 2 topics, 10 passes over the supplied corpus of 2 documents, updating every 1000 documents, evaluating every ~0 documents, iterating 100x with a convergence threshold of 0.001000
2019-04-05 08:33:49,649 : INFO : training LDA model using 1 processes
2019-04-05 08:33:51,648 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2/2, outstanding queue size 1
2019-04-05 08:33:52,837 : INFO : topic #0 (0.631): 0.083*"rich" + 0.077*"independence" + 0.076*"january" + 0.074*"diverse" + 0.073*"Celebrate" + 0.072*"heritage" + 0.071*"culture" + 0.071*"landscapes" + 0.069*"Republic" + 0.069*"history"
2019-04-05 08:33:52,849 : INFO : topic #1 (0.369): 0.079*"january" + 0.078*"august" + 0.077*"motherland" + 0.076*"cultu

[(0,
  '0.132*"rich" + 0.069*"independence" + 0.069*"january" + 0.068*"diverse" + 0.067*"Celebrate" + 0.067*"heritage" + 0.067*"culture" + 0.067*"landscapes" + 0.066*"Republic" + 0.066*"history"'),
 (1,
  '0.076*"january" + 0.075*"august" + 0.075*"motherland" + 0.074*"culture" + 0.074*"landscapes" + 0.072*"India" + 0.072*"heritage" + 0.071*"diverse" + 0.070*"Celebrate" + 0.070*"Republic"')]

In [13]:
documents = ['Honda Cars India is currently offering some mouth-watering discount offers on the Brio, Jazz, WR-V, BR-V, City, and CR-V. With discounts ranging up to INR 1.5 lakhs on certain products, we feel its a good time for prospective car buyers to purchase a Honda model. Honda Cars Discounts The discounts on Honda cars will be valid until 7 November 2018. The offers include a cash discount, free insurance, and exchange bonus. Other than this, the company is running a contest that will give lucky buyers a free trip to London/Paris. Honda Brio',
            'The entry-level model in the companys line-up is available with first-year insurance, worth INR 25,000, at a nominal price of Re 1. Thanks to this, the effective starting price of the small car comes down to roughly INR 5.00 lakh.* With an average monthly sales figure of 229 units **, the Brio is among the least popular models in its segment. We feel that the company needs to offer more substantial discounts to boost the demand for its most affordable model. Honda Jazz',
            'The Maruti Baleno and the Hyundai Elite i20 rival from Honda Cars is available with an exchange bonus of INR 20,000. Additionally, the first-year insurance is being offered at a 50% discount. Furthermore, theres a corporate discount of INR 15000. This takes the total benefits to approximately INR 50000 bringing the effective starting price of the B2-segment hatchback to INR 7.75 lakh. With an average monthly sales of 1511 units, the Jazz is comprehensively outsold by the Maruti Baleno 18264 units. The latest discount offers should help the company boost the demand of its premium hatchback. Honda WR-V',
            'With average monthly sales of 3,052 units**, the Jazz-based crossover is twice as popular as its hatchback sibling. Currently, the WR-V is on sale with benefits worth more than INR 50,000. Buyers can benefit from an exchange bonus worth INR 20,000 on selling their old car to Honda Auto Terrace. Furthermore, they get free first-year insurance worth more than INR 30,000. This brings down the effecting starting price to INR 8.16 lakh*. Honda City',
             'With an average monthly sales of 3,531 units, the Honda City has been among the most successful models from the company. The C2-segment sedan is currently available with discounts worth INR 65,000. This brings the effective starting price to INR 9.07 lakh*. The discount offers include free first-year insurance worth almost INR 35,000 and a cash discount of INR 30,000. Honda BR-V',
             'Honda Cars India will launch the fifth generation CR-V on 9 October. Ahead of the launch of the new model, the company is offering a cash discount of INR 1.5 lakh on the existing version. The current generation Honda CR-V has a monthly sales average of just 19 units**. The next-gen model, thanks to an extra row of seats and a diesel engine option, should sell in higher numbers than the current iteration.',
             'Belgium hold on to top spot in the latest FIFA world football rankings released Thursday while England are on the rise, snatching fourth place from World Cup finalists Croatia. The top three positions are unchanged with World Cup winners France in second spot ahead of Brazil, in third. England move up one place on the strength of convincing Euro 2020 qualifier wins against the Czech Republic and Montenegro last month while Croatia, who reached the 2018 World Cup final before losing to France, retreat one place to fifth. Germany, rebuilding following their humiliating exit from the 2018 World Cup after the group phase, are also moving up. They rise three places to 13th on the back of their victory over the Netherlands in the Euro qualifiers last month. Austria slid 11 places, the biggest drop in the rankings, after Euro qualifier defeats by Poland and Israel in March.',
             'Record-breaking Germany face Azerbaijan on Sunday with head coach Joachim Loew insisting their simple goal is to "ruthlessly" secure World Cup qualification as quickly as possible. The defending champions are currently five points clear in Group C and a fifth straight win would take them a step closer to Russia 2018. After Wednesdays friendly home win over England, when Lukas Podolski ended his international career with the winning goal, Germany aim to be business-like in Baku. Even their plane on the flight from Duesseldorf was called Siegen To Win.',
             'Against England, Germany did not concede a goal for the seventh match running to set a new national record. The Germans have not leaked a goal since their 2-0 semi-final defeat to France at Euro 2016 last July. "Our goal is to secure a ticket for the World Cup as soon as possible," said Loew. "I am absolutely convinced that we will continue our winning run." Last October, Loew said he had two aims for the rest of the season: "that the team remains stable and we ruthlessly go through the qualifying phase". Both goals have so far been fulfilled with Germany winning all four qualifying games so far, scoring 16 goals and conceding none. Having fielded an experimental side against England, Loew will have first-choice starters Sami Khedira, Julian Draxler and Mario Gomez available after minor knocks.',
             'Germanys Nico Schulz sealed a thrilling 3-2 win over the Netherlands in a see-saw Euro 2020 qualifier as Joachim Loews new-look line-up survived a severe test on Sunday. Elsewhere, World Cup finalists Croatia stumbled to a 2-1 defeat in Hungary, and Eden Hazard celebrated a century of caps with the opening goal in Belgiums 2-0 win over Cyprus. A revamped Germany raced into a 2-0 lead in Amsterdam with thrilling strikes from Leroy Sane and Serge Gnabry but the hosts struck back with a second half header from Matthijs De Ligt before Memphis Depay pounced for an equaliser on 63 minutes. The four-time world champions were put under immense pressure from a Netherlands side seeking a winner but the visitors, who were booed by their own fans in a 1-1 friendly draw against Serbia last Wednesday, snatched victory in the 90th minute through 25-year-old Schulz. The Hoffenheim midfielder, who made his international in August, tapped home a loose ball after a tireless Sane had taken the defence with him on a run into the box.'
            ]
tokenized_list = [simple_preprocess(doc) for doc in documents]

In [14]:
tokenized_list

[['honda',
  'cars',
  'india',
  'is',
  'currently',
  'offering',
  'some',
  'mouth',
  'watering',
  'discount',
  'offers',
  'on',
  'the',
  'brio',
  'jazz',
  'wr',
  'br',
  'city',
  'and',
  'cr',
  'with',
  'discounts',
  'ranging',
  'up',
  'to',
  'inr',
  'lakhs',
  'on',
  'certain',
  'products',
  'we',
  'feel',
  'its',
  'good',
  'time',
  'for',
  'prospective',
  'car',
  'buyers',
  'to',
  'purchase',
  'honda',
  'model',
  'honda',
  'cars',
  'discounts',
  'the',
  'discounts',
  'on',
  'honda',
  'cars',
  'will',
  'be',
  'valid',
  'until',
  'november',
  'the',
  'offers',
  'include',
  'cash',
  'discount',
  'free',
  'insurance',
  'and',
  'exchange',
  'bonus',
  'other',
  'than',
  'this',
  'the',
  'company',
  'is',
  'running',
  'contest',
  'that',
  'will',
  'give',
  'lucky',
  'buyers',
  'free',
  'trip',
  'to',
  'london',
  'paris',
  'honda',
  'brio'],
 ['the',
  'entry',
  'level',
  'model',
  'in',
  'the',
  'companys

In [15]:
texts = [[text for text in doc.split()] for doc in documents]

In [17]:
texts

[['Honda',
  'Cars',
  'India',
  'is',
  'currently',
  'offering',
  'some',
  'mouth-watering',
  'discount',
  'offers',
  'on',
  'the',
  'Brio,',
  'Jazz,',
  'WR-V,',
  'BR-V,',
  'City,',
  'and',
  'CR-V.',
  'With',
  'discounts',
  'ranging',
  'up',
  'to',
  'INR',
  '1.5',
  'lakhs',
  'on',
  'certain',
  'products,',
  'we',
  'feel',
  'its',
  'a',
  'good',
  'time',
  'for',
  'prospective',
  'car',
  'buyers',
  'to',
  'purchase',
  'a',
  'Honda',
  'model.',
  'Honda',
  'Cars',
  'Discounts',
  'The',
  'discounts',
  'on',
  'Honda',
  'cars',
  'will',
  'be',
  'valid',
  'until',
  '7',
  'November',
  '2018.',
  'The',
  'offers',
  'include',
  'a',
  'cash',
  'discount,',
  'free',
  'insurance,',
  'and',
  'exchange',
  'bonus.',
  'Other',
  'than',
  'this,',
  'the',
  'company',
  'is',
  'running',
  'a',
  'contest',
  'that',
  'will',
  'give',
  'lucky',
  'buyers',
  'a',
  'free',
  'trip',
  'to',
  'London/Paris.',
  'Honda',
  'Brio'],

In [18]:
my_dictionary = corpora.Dictionary(texts)

2019-04-05 09:03:27,234 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-04-05 09:03:27,240 : INFO : built Dictionary(532 unique tokens: ['1.5', '2018.', '7', 'BR-V,', 'Brio']...) from 10 documents (total 1032 corpus positions)


In [24]:
dct = corpora.Dictionary(texts)
corpus = [dct.doc2bow(line) for line in texts]

2019-04-05 09:05:46,508 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-04-05 09:05:46,511 : INFO : built Dictionary(532 unique tokens: ['1.5', '2018.', '7', 'BR-V,', 'Brio']...) from 10 documents (total 1032 corpus positions)


In [25]:
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 5),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 1),
  (20, 5),
  (21, 2),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 2),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 2),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 2),
  (51, 3),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 2),
  (61, 1),
  (62, 1),
  (63, 3),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 2)],
 [(4, 1),
  (10, 1),
  (11, 2),
  (17, 1),
  (19, 1),
  (20, 1),
  (25, 1),
  (29, 1),
  (34, 1),
  (36, 1),
  (37, 1),
  (42, 1),
  (43, 2),
  (44, 2),
  (47, 1),
  (59, 1),
  (60, 7),
  (61, 1),
  (63, 4),
  (70, 1),
  (71, 1),
  (72, 1)

In [19]:
my_dictionary.token2id

{'1.5': 0,
 '2018.': 1,
 '7': 2,
 'BR-V,': 3,
 'Brio': 4,
 'Brio,': 5,
 'CR-V.': 6,
 'Cars': 7,
 'City,': 8,
 'Discounts': 9,
 'Honda': 10,
 'INR': 11,
 'India': 12,
 'Jazz,': 13,
 'London/Paris.': 14,
 'November': 15,
 'Other': 16,
 'The': 17,
 'WR-V,': 18,
 'With': 19,
 'a': 20,
 'and': 21,
 'be': 22,
 'bonus.': 23,
 'buyers': 24,
 'car': 25,
 'cars': 26,
 'cash': 27,
 'certain': 28,
 'company': 29,
 'contest': 30,
 'currently': 31,
 'discount': 32,
 'discount,': 33,
 'discounts': 34,
 'exchange': 35,
 'feel': 36,
 'for': 37,
 'free': 38,
 'give': 39,
 'good': 40,
 'include': 41,
 'insurance,': 42,
 'is': 43,
 'its': 44,
 'lakhs': 45,
 'lucky': 46,
 'model.': 47,
 'mouth-watering': 48,
 'offering': 49,
 'offers': 50,
 'on': 51,
 'products,': 52,
 'prospective': 53,
 'purchase': 54,
 'ranging': 55,
 'running': 56,
 'some': 57,
 'than': 58,
 'that': 59,
 'the': 60,
 'this,': 61,
 'time': 62,
 'to': 63,
 'trip': 64,
 'until': 65,
 'up': 66,
 'valid': 67,
 'we': 68,
 'will': 69,
 '**,': 

In [26]:
# Step 4: Train the LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=2,
                         passes=10,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model
# lda_model.save('lda_model.model')
# See the topics
# lda_model.print_topics(-1)

2019-04-05 09:07:55,931 : INFO : using asymmetric alpha [0.63060194, 0.36939806]
2019-04-05 09:07:55,933 : INFO : using symmetric eta at 0.5
2019-04-05 09:07:55,934 : INFO : using serial LDA version on this node
2019-04-05 09:07:55,938 : INFO : running online LDA training, 2 topics, 10 passes over the supplied corpus of 10 documents, updating every 1000 documents, evaluating every ~0 documents, iterating 100x with a convergence threshold of 0.001000
2019-04-05 09:07:55,940 : INFO : training LDA model using 1 processes
2019-04-05 09:07:56,339 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #10/10, outstanding queue size 1
2019-04-05 09:07:56,861 : INFO : topic #0 (0.631): 0.007*"the" + 0.005*"a" + 0.003*"to" + 0.003*"in" + 0.003*"with" + 0.003*"The" + 0.003*"of" + 0.003*"Honda" + 0.003*"on" + 0.003*"INR"
2019-04-05 09:07:56,864 : INFO : topic #1 (0.369): 0.006*"the" + 0.004*"to" + 0.004*"of" + 0.003*"in" + 0.003*"a" + 0.003*"on" + 0.003*"INR" + 0.003*"and" + 0.003*"is" 

In [28]:
# See the topics
lda_model.print_topics(-1)

2019-04-05 09:08:41,237 : INFO : topic #0 (0.631): 0.026*"the" + 0.023*"a" + 0.011*"to" + 0.009*"The" + 0.009*"and" + 0.008*"in" + 0.008*"with" + 0.008*"of" + 0.008*"Honda" + 0.008*"INR"
2019-04-05 09:08:41,243 : INFO : topic #1 (0.369): 0.030*"the" + 0.014*"of" + 0.013*"to" + 0.011*"in" + 0.010*"on" + 0.008*"INR" + 0.006*"is" + 0.006*"Honda" + 0.005*"World" + 0.005*"Cup"


[(0,
  '0.026*"the" + 0.023*"a" + 0.011*"to" + 0.009*"The" + 0.009*"and" + 0.008*"in" + 0.008*"with" + 0.008*"of" + 0.008*"Honda" + 0.008*"INR"'),
 (1,
  '0.030*"the" + 0.014*"of" + 0.013*"to" + 0.011*"in" + 0.010*"on" + 0.008*"INR" + 0.006*"is" + 0.006*"Honda" + 0.005*"World" + 0.005*"Cup"')]

In [29]:
texts[0][:10]

['Honda',
 'Cars',
 'India',
 'is',
 'currently',
 'offering',
 'some',
 'mouth-watering',
 'discount',
 'offers']

In [31]:
!pip install pattern

Collecting pattern
[?25l  Downloading https://files.pythonhosted.org/packages/1e/07/b0e61b6c818ed4b6145fe01d1c341223aa6cfbc3928538ad1f2b890924a3/Pattern-3.6.0.tar.gz (22.2MB)
[K    100% |████████████████████████████████| 22.3MB 20kB/s eta 0:00:011   18% |██████                          | 4.1MB 24.2MB/s eta 0:00:01    85% |███████████████████████████▎    | 19.0MB 2.9MB/s eta 0:00:02
Collecting backports.csv (from pattern)
  Downloading https://files.pythonhosted.org/packages/8e/26/a6bd68f13e0f38fbb643d6e497fc3462be83a0b6c4d43425c78bb51a7291/backports.csv-1.0.7-py2.py3-none-any.whl
Collecting mysqlclient (from pattern)
[?25l  Downloading https://files.pythonhosted.org/packages/f4/f1/3bb6f64ca7a429729413e6556b7ba5976df06019a5245a43d36032f1061e/mysqlclient-1.4.2.post1.tar.gz (85kB)
[K    100% |████████████████████████████████| 92kB 6.6MB/s eta 0:00:01
Collecting pdfminer.six (from pattern)
[?25l  Downloading https://files.pythonhosted.org/packages/8a/fd/6e8746e6965d1a7ea8e97253e3d79e6

In [34]:
# Step 2: Prepare Data (Remove stopwords and lemmatize)
data_processed = []

for i, doc in enumerate(texts[:100]):
    doc_out = []
    for wd in doc:
        if wd not in stop_words:  # remove stopwords
            lemmatized_word = lemmatize(wd, allowed_tags=re.compile('(NN|JJ|RB)'))  # lemmatize
            if lemmatized_word:
                doc_out = doc_out + [lemmatized_word[0].split(b'/')[0].decode('utf-8')]
        else:
            continue
    data_processed.append(doc_out)

# Print a small sample    
print(texts[0][:15]) 
print(data_processed[0][:15]) 

['Honda', 'Cars', 'India', 'is', 'currently', 'offering', 'some', 'mouth-watering', 'discount', 'offers', 'on', 'the', 'Brio,', 'Jazz,', 'WR-V,']
['honda', 'car', 'india', 'currently', 'mouth', 'discount', 'offer', 'brio', 'jazz', 'br', 'city', 'cr', 'discount', 'inr', 'lakhs']


In [35]:
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(line) for line in data_processed]

2019-04-05 09:36:43,554 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-04-05 09:36:43,564 : INFO : built Dictionary(261 unique tokens: ['bonus', 'br', 'brio', 'buyer', 'car']...) from 10 documents (total 502 corpus positions)


In [36]:
2*5

10

In [37]:
# Step 4: Train the LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=2,
                         passes=10,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model
# lda_model.save('lda_model.model')
# See the topics
lda_model.print_topics(-1)

2019-04-05 09:36:58,498 : INFO : using asymmetric alpha [0.63060194, 0.36939806]
2019-04-05 09:36:58,499 : INFO : using symmetric eta at 0.5
2019-04-05 09:36:58,501 : INFO : using serial LDA version on this node
2019-04-05 09:36:58,504 : INFO : running online LDA training, 2 topics, 10 passes over the supplied corpus of 10 documents, updating every 1000 documents, evaluating every ~0 documents, iterating 100x with a convergence threshold of 0.001000
2019-04-05 09:36:58,506 : INFO : training LDA model using 1 processes
2019-04-05 09:37:00,504 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #10/10, outstanding queue size 1
2019-04-05 09:37:01,720 : INFO : topic #0 (0.631): 0.007*"goal" + 0.007*"world" + 0.006*"cup" + 0.005*"discount" + 0.005*"france" + 0.005*"germany" + 0.005*"euro" + 0.005*"qualifier" + 0.005*"place" + 0.005*"last"
2019-04-05 09:37:01,726 : INFO : topic #1 (0.369): 0.009*"inr" + 0.007*"honda" + 0.006*"discount" + 0.005*"model" + 0.005*"sale" + 0.005*"mo

[(0,
  '0.018*"world" + 0.016*"goal" + 0.015*"cup" + 0.013*"germany" + 0.011*"euro" + 0.011*"place" + 0.011*"last" + 0.011*"england" + 0.009*"qualifier" + 0.009*"loew"'),
 (1,
  '0.031*"inr" + 0.027*"honda" + 0.025*"discount" + 0.016*"car" + 0.015*"model" + 0.013*"sale" + 0.013*"worth" + 0.013*"unit" + 0.011*"monthly" + 0.011*"insurance"')]

In [40]:
doc_lda = lda_model[corpus]

In [44]:
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

2019-04-05 09:46:37,414 : INFO : topic #0 (0.631): 0.018*"world" + 0.016*"goal" + 0.015*"cup" + 0.013*"germany" + 0.011*"euro" + 0.011*"place" + 0.011*"last" + 0.011*"england" + 0.009*"qualifier" + 0.009*"loew"
2019-04-05 09:46:37,422 : INFO : topic #1 (0.369): 0.031*"inr" + 0.027*"honda" + 0.025*"discount" + 0.016*"car" + 0.015*"model" + 0.013*"sale" + 0.013*"worth" + 0.013*"unit" + 0.011*"monthly" + 0.011*"insurance"


[(0,
  '0.018*"world" + 0.016*"goal" + 0.015*"cup" + 0.013*"germany" + 0.011*"euro" '
  '+ 0.011*"place" + 0.011*"last" + 0.011*"england" + 0.009*"qualifier" + '
  '0.009*"loew"'),
 (1,
  '0.031*"inr" + 0.027*"honda" + 0.025*"discount" + 0.016*"car" + '
  '0.015*"model" + 0.013*"sale" + 0.013*"worth" + 0.013*"unit" + '
  '0.011*"monthly" + 0.011*"insurance"')]


In [49]:
# Human readable format of corpus (term-frequency)
[[(dct[id], freq) for id, freq in cp] for cp in corpus[:5]]

[[('bonus', 1),
  ('br', 1),
  ('brio', 2),
  ('buyer', 2),
  ('car', 4),
  ('cash', 1),
  ('certain', 1),
  ('city', 1),
  ('company', 1),
  ('contest', 1),
  ('cr', 1),
  ('currently', 1),
  ('discount', 5),
  ('exchange', 1),
  ('feel', 1),
  ('free', 2),
  ('good', 1),
  ('honda', 5),
  ('india', 1),
  ('inr', 1),
  ('insurance', 1),
  ('jazz', 1),
  ('lakhs', 1),
  ('london', 1),
  ('lucky', 1),
  ('model', 1),
  ('mouth', 1),
  ('november', 1),
  ('offer', 2),
  ('other', 1),
  ('product', 1),
  ('prospective', 1),
  ('purchase', 1),
  ('time', 1),
  ('trip', 1),
  ('valid', 1)],
 [('brio', 1),
  ('car', 1),
  ('company', 1),
  ('discount', 1),
  ('feel', 1),
  ('honda', 1),
  ('inr', 2),
  ('insurance', 1),
  ('jazz', 1),
  ('model', 3),
  ('affordable', 1),
  ('available', 1),
  ('average', 1),
  ('boost', 1),
  ('companys', 1),
  ('demand', 1),
  ('effective', 1),
  ('entry', 1),
  ('figure', 1),
  ('first', 1),
  ('lakh', 1),
  ('least', 1),
  ('line', 1),
  ('monthly', 1),
 

In [53]:
dct.id2token

{0: 'bonus',
 1: 'br',
 2: 'brio',
 3: 'buyer',
 4: 'car',
 5: 'cash',
 6: 'certain',
 7: 'city',
 8: 'company',
 9: 'contest',
 10: 'cr',
 11: 'currently',
 12: 'discount',
 13: 'exchange',
 14: 'feel',
 15: 'free',
 16: 'good',
 17: 'honda',
 18: 'india',
 19: 'inr',
 20: 'insurance',
 21: 'jazz',
 22: 'lakhs',
 23: 'london',
 24: 'lucky',
 25: 'model',
 26: 'mouth',
 27: 'november',
 28: 'offer',
 29: 'other',
 30: 'product',
 31: 'prospective',
 32: 'purchase',
 33: 'time',
 34: 'trip',
 35: 'valid',
 36: 'affordable',
 37: 'available',
 38: 'average',
 39: 'boost',
 40: 'companys',
 41: 'demand',
 42: 'effective',
 43: 'entry',
 44: 'figure',
 45: 'first',
 46: 'lakh',
 47: 'least',
 48: 'line',
 49: 'monthly',
 50: 'need',
 51: 'nominal',
 52: 'popular',
 53: 'price',
 54: 're',
 55: 'roughly',
 56: 'sale',
 57: 'segment',
 58: 'small',
 59: 'substantial',
 60: 'thank',
 61: 'unit',
 62: 'worth',
 63: 'additionally',
 64: 'approximately',
 65: 'baleno',
 66: 'benefit',
 67: 'comp

In [54]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 2),
 (3, 2),
 (4, 4),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 5),
 (13, 1),
 (14, 1),
 (15, 2),
 (16, 1),
 (17, 5),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 2),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1)]

In [52]:
len(corpus)

10

In [51]:
for c in lda_model[corpus[1:10]]:
    print("Document Topics      : ", c[0])      # [(Topics, Perc Contrib)]
    print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
    print("Phi Values (word id) : ", c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])   # [(Word, [Topics])]
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
    print("------------------------------------------------------\n")

Document Topics      :  [(0, 0.01672776), (1, 0.98327225)]
Word id, Topics      :  [(2, [1]), (4, [1]), (8, [1])]
Phi Values (word id) :  [(2, [(1, 0.9984582)]), (4, [(1, 0.9993173)])]
Word, Topics         :  [('brio', [1]), ('car', [1])]
Phi Values (word)    :  [('brio', [(1, 0.9984582)]), ('car', [(1, 0.9993173)])]
------------------------------------------------------

Document Topics      :  [(0, 0.014220186), (1, 0.9857798)]
Word id, Topics      :  [(0, [1]), (4, [1]), (8, [1])]
Phi Values (word id) :  [(0, [(1, 0.9989654)]), (4, [(1, 0.9994255)])]
Word, Topics         :  [('bonus', [1]), ('car', [1])]
Phi Values (word)    :  [('bonus', [(1, 0.9989654)]), ('car', [(1, 0.9994255)])]
------------------------------------------------------

Document Topics      :  [(0, 0.017522905), (1, 0.9824771)]
Word id, Topics      :  [(0, [1]), (3, [1]), (4, [1])]
Phi Values (word id) :  [(0, [(1, 0.9987466)]), (3, [(1, 0.9985628)])]
Word, Topics         :  [('bonus', [1]), ('buyer', [1])]
Phi Va

In [59]:
# data_processed[0]
new_data_processed = [['germany','look','strong','coach','world','cup','campaign','favorites','beat','england']]
# new_data_processed[0]
new_corpus = [dct.doc2bow(line) for line in new_data_processed]

In [61]:
for c in lda_model[new_corpus[0:10]]:
    print("Document Topics      : ", c[0])      # [(Topics, Perc Contrib)]
    print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
    print("Phi Values (word id) : ", c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])   # [(Word, [Topics])]
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
    print("------------------------------------------------------\n")

Document Topics      :  [(0, 0.93701017), (1, 0.06298983)]
Word id, Topics      :  [(113, [0]), (117, [0]), (126, [0])]
Phi Values (word id) :  [(113, [(0, 0.99905527)]), (117, [(0, 0.99880415)])]
Word, Topics         :  [('cup', [0]), ('england', [0])]
Phi Values (word)    :  [('cup', [(0, 0.99905527)]), ('england', [(0, 0.99880415)])]
------------------------------------------------------

