In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim import corpora

In [2]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})
df['label'] = df['label'].map(lambda x: newsgroups.target_names[x])
df


Unnamed: 0,text,label
0,\n\nI am sure some bashers of Pens fans are pr...,rec.sport.hockey
1,My brother is in the market for a high-perform...,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,comp.sys.mac.hardware
...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,sci.med
18842,\nNot in isolated ground recepticles (usually ...,sci.electronics
18843,I just installed a DX2-66 CPU in a clone mothe...,comp.sys.ibm.pc.hardware
18844,\nWouldn't this require a hyper-sphere. In 3-...,comp.graphics


In [3]:
def preprocessing(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[^A-Za-z ]+', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word)>=3]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    processed_text = ' '.join(tokens)
    return processed_text

In [4]:
df['processed_text'] = df['text'].apply(lambda x: preprocessing(x))

In [5]:
df = df.drop('label',axis=1)
df

Unnamed: 0,text,processed_text
0,\n\nI am sure some bashers of Pens fans are pr...,sure bashers pen fan pretty confused lackof ki...
1,My brother is in the market for a high-perform...,brother market highperformance video card supp...
2,\n\n\n\n\tFinally you said what you dream abou...,finally said dream mediterranean newthe area g...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,thinkits scsi card dma transfer disksthe scsi ...
4,1) I have an old Jasmine drive which I cann...,old jasmine drive use new system understanding...
...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,nyedacnsvaxuwecedu david nyedn neurologydn con...
18842,\nNot in isolated ground recepticles (usually ...,isolated ground recepticles usually unusual co...
18843,I just installed a DX2-66 CPU in a clone mothe...,installed cpu clone motherboard tried mounting...
18844,\nWouldn't this require a hyper-sphere. In 3-...,wouldnt require hypersphere space point specif...


In [6]:
from gensim.models import Phrases

df['tokens'] = df['processed_text'].str.split(' ')
tokens = df['tokens'].tolist()
bigram = Phrases(tokens)
trigram = Phrases(bigram[tokens], min_count=1)
tokens = list(trigram[bigram[tokens]])
tokens

[['sure',
  'bashers',
  'pen_fan',
  'pretty',
  'confused',
  'lackof',
  'kind',
  'post',
  'recent',
  'pen',
  'massacre',
  'devil',
  'actuallyi',
  'bit_puzzled',
  'bit',
  'relieved',
  'however',
  'going',
  'put_endto',
  'nonpittsburghers',
  'relief',
  'bit',
  'praise',
  'pen',
  'man',
  'theyare',
  'killing',
  'devil',
  'worse',
  'thought',
  'jagr',
  'showed',
  'whyhe',
  'much_better',
  'regular_season_stats',
  'also',
  'lotfo',
  'fun_watch',
  'playoff',
  'bowman',
  'let',
  'jagr',
  'lot',
  'offun',
  'next_couple',
  'game',
  'since',
  'pen',
  'going_beat',
  'pulp',
  'jersey',
  'anyway',
  'disappointed',
  'see',
  'islander',
  'lose',
  'finalregular',
  'season',
  'game_pen',
  'rule'],
 ['brother',
  'market',
  'highperformance_video_card',
  'supportsvesa',
  'local_bus',
  'ram',
  'anyone',
  'suggestionsideas',
  'diamond_stealth_pro',
  'local_bus',
  'orchid',
  'farenheit',
  'ati_graphic_ultra_pro',
  'highperformance',
  'vl

In [7]:
dict_LDA = corpora.Dictionary(tokens)
dict_LDA.filter_extremes(no_below=3)
corpus = [dict_LDA.doc2bow(token) for token in tokens]


In [8]:
from gensim import models

lda = models.LdaModel(corpus, num_topics=20,
                      id2word=dict_LDA,
                      passes=4, alpha=[0.01] * 20,
                      eta=[0.01] * len(dict_LDA.keys()))


In [9]:
print(lda.print_topics(num_topics=20, num_words=20))

[(0, '0.055*"god" + 0.017*"jesus" + 0.016*"church" + 0.015*"one" + 0.011*"sin" + 0.011*"man" + 0.011*"lord" + 0.010*"christian" + 0.010*"say" + 0.010*"christ" + 0.009*"father" + 0.008*"life" + 0.008*"come" + 0.008*"said" + 0.008*"came" + 0.008*"also" + 0.007*"prayer" + 0.006*"would" + 0.006*"love" + 0.006*"bible"'), (1, '0.014*"get" + 0.012*"driver" + 0.009*"problem" + 0.009*"got" + 0.009*"like" + 0.008*"one" + 0.008*"really" + 0.007*"psalm" + 0.007*"think" + 0.007*"something" + 0.007*"home" + 0.007*"say" + 0.006*"know" + 0.006*"come" + 0.006*"going" + 0.005*"mouse" + 0.005*"well" + 0.005*"dont" + 0.005*"told" + 0.005*"little"'), (2, '0.022*"information" + 0.017*"list" + 0.013*"email" + 0.012*"book" + 0.011*"available" + 0.009*"mail" + 0.009*"address" + 0.009*"sale" + 0.009*"new" + 0.008*"send" + 0.008*"may" + 0.008*"group" + 0.007*"interested" + 0.007*"service" + 0.007*"offer" + 0.007*"conference" + 0.007*"name" + 0.007*"copy" + 0.007*"paper" + 0.006*"computer"'), (3, '0.014*"well" + 

In [10]:
# represent each document as a vector of topic weights and fill Nan with 0
doc_lda = lda[corpus]
doc_lda = [[topic[1] for topic in doc] for doc in doc_lda]
doc_lda = pd.DataFrame(doc_lda)
doc_lda.columns = ['topic_'+str(i) for i in range(20)]
doc_lda = doc_lda.fillna(0)
doc_lda = doc_lda[0:100]
dummy_data = doc_lda.to_csv('dummy_data.csv', index=False)
