In [None]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd

newsgroups_train = fetch_20newsgroups(subset='train',data_home = '/content/gdrive/My Drive/20NewsGroup',remove=('headers','footers','quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',data_home='/content/gdrive/My Drive/20NewsGroup',remove=('headers','footers','quotes'))

df_train = pd.DataFrame({'content':np.array(newsgroups_train.data),'target':np.array(newsgroups_train.target_names)[newsgroups_train.target]})
df_test = pd.DataFrame({'content':np.array(newsgroups_test.data),'target':np.array(newsgroups_test.target_names)[newsgroups_test.target]})

df_test.to_csv('/content/gdrive/My Drive/20NewsGroup/test.csv',index=False)
df_train.to_csv('/content/gdrive/My Drive/20NewsGroup/train.csv',index=False)

In [3]:
import re
import gensim
import pandas as pd
import numpy as np
DATA_PATH = '/content/gdrive/My Drive/20NewsGroup/'
from google.colab import drive
drive.mount('/content/gdrive') 

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
df_train = pd.read_csv(DATA_PATH+ 'train.csv')
df_test = pd.read_csv(DATA_PATH +  'test.csv')

In [None]:
print(df_train.nunique(),df_test.nunique())

content    10993
target        20
dtype: int64 content    7310
target       20
dtype: int64


In [None]:
df = pd.concat([df_train,df_test],ignore_index=True)
df.dropna(inplace=True)
print(df.nunique())

content    18286
target        20
dtype: int64


In [None]:
import re
import gensim
from gensim.parsing.preprocessing import STOPWORDS
import spacy
model = spacy.load('en')

def preprocess(text,blacklisted_pos=['PRON']):
  text = re.sub('\S*@\S*\s?', '', text) #remove emails
  text = re.sub('\s+', ' ', text) #remove new line chars
  text = re.sub("\'", "", text) #remove single quotes
  tokens = gensim.utils.simple_preprocess(text, deacc=True) #tokenise , ignores tokens with len<2 and >15
  tokens = [token.lower() for token in tokens] #lowercase
  spacy_doc = model(" ".join(tokens)) #lemmatization1
  tokens =[token.lemma_ for token in spacy_doc if token.pos_ not in blacklisted_pos and token.lemma_.islower()] #lemmatization2
  tokens =[token for token in tokens if token not in STOPWORDS] #remove stopwords
  return tokens

df['preprocessed']=df['content'].apply(preprocess)

In [None]:
df.head()

Unnamed: 0,content,target,preprocessed
0,I was wondering if anyone out there could enli...,rec.autos,"[wonder, enlighten, car, day, door, sport, car..."
1,A fair number of brave souls who upgraded thei...,comp.sys.mac.hardware,"[fair, number, brave, soul, upgrade, si, clock..."
2,"well folks, my mac plus finally gave up the gh...",comp.sys.mac.hardware,"[folk, mac, plus, finally, ghost, weekend, sta..."
3,\nDo you have Weitek's address/phone number? ...,comp.graphics,"[weitek, address, phone, number, like, informa..."
4,"From article <C5owCB.n3p@world.std.com>, by to...",sci.space,"[article, tom, baker, understanding, expect, e..."


In [None]:
df.to_csv(DATA_PATH+'preprocessed_data.csv',index=False)

In [4]:
df=pd.read_csv(DATA_PATH+'preprocessed_data.csv')
df['preprocessed'] = df['preprocessed'].apply(lambda cell: list(map(lambda x: x.replace("'",""),cell[1:-1].split(','))))


In [30]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [181]:
import itertools
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

def get_topic_model(data,params):
  corpus,id2word=data
  lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word,num_topics=20,**params)
  return lda_model

def get_coherence_score(data,model):
  text,id2word=data
  coherence_model =CoherenceModel(model=model, texts=text, dictionary=id2word, coherence='c_v',topn=4)
  return coherence_model.get_coherence()
 
param_grid = {
    'chunksize': [700,900,1200],
    'update_every':[0,1], #batch learning==0
    'alpha':['auto'],
    'eta':['auto'],
    'decay':[0.5,0.6,0.7,0.8],
    'offset':[1.0],
    'eval_every':[10],
    'iterations':[50],
    'random_state':[100],
    'per_word_topics':[False],
    'keep_n':[15000,20000,25000],
    'no_above':[0.5,0.6,0.7]
}

grid_result = {
    'param_values' : [],
    'coherence' : [],
    'perplexity':[],
}

def construct_params(keys,values):
  dic={}
  for k,v in zip(keys,values):
    dic[k]=v
  return dic

best_coherent_lda_model=None
best_perplexity_lda_model=None
best_coherence = 0
best_perplexity = 0

In [None]:
for params in itertools.product(*param_grid.values()):
  id2word=Dictionary(df['preprocessed'])
  #filter irrelevant words
  id2word.filter_extremes(no_below=3,no_above=params[-1],keep_n=params[-2])
  print('Built dictionary:: keep_n',params[-2],'no_above',params[-1])
  corpus = df['preprocessed'].map(id2word.doc2bow)
  print('Built corpus')

  grid_result['param_values'].append(params)
  params_dic=construct_params(list(param_grid.keys())[:-2],params[:-2])
  data = (corpus,id2word)
  data_coherence = (df['preprocessed'],id2word)

  print(params_dic)
  lda_model=get_topic_model(data,params_dic)
  current_perplexity = lda_model.log_perplexity(corpus)
  current_coherence = get_coherence_score(data_coherence,lda_model)

  grid_result['perplexity'].append(current_perplexity)
  grid_result['coherence'] .append( current_coherence)

  if current_coherence > best_coherence:
    best_coherence=current_coherence
    best_coherent_lda_model=lda_model

  if current_perplexity < best_perplexity:
    best_perplexity = current_perplexity
    best_perplexity_lda_model = lda_model

  print('Coherence',current_coherence)

In [None]:
import pickle
with open(DATA_PATH+'top5_metrics_tune2.pickle','wb') as F:
  pickle.dump(grid_result,F)

In [9]:
import pickle
with open(DATA_PATH + 'top5_metrics_tune2.pickle','rb') as F:
  grid_result = pickle.load(F)

In [117]:
df_exp = { } 
for key in list(param_grid.keys()):
  df_exp[key]=[]
df_exp['perplexity']=[]
df_exp['coherence']=[]
for params,coherence,perplexity in zip(grid_result['param_values'],grid_result['coherence'],grid_result['perplexity']):
  for key,val in zip(list(param_grid.keys()),params):
    df_exp[key].append(val)
  df_exp['coherence'].append(coherence)
  df_exp['perplexity'].append(perplexity)
  

df_exp = pd.DataFrame(df_exp)
df_exp.sort_values('coherence',ascending=False).head()

Unnamed: 0,chunksize,update_every,alpha,eta,decay,offset,eval_every,iterations,random_state,per_word_topics,keep_n,no_above,perplexity,coherence
57,700,1,auto,auto,0.7,1.0,10,50,100,False,20000,0.5,-7.858002,0.713669
58,700,1,auto,auto,0.7,1.0,10,50,100,False,20000,0.6,-7.858002,0.713669
59,700,1,auto,auto,0.7,1.0,10,50,100,False,20000,0.7,-7.858002,0.713669
49,700,1,auto,auto,0.6,1.0,10,50,100,False,20000,0.6,-7.863675,0.705598
48,700,1,auto,auto,0.6,1.0,10,50,100,False,20000,0.5,-7.863675,0.705598


In [151]:
corpus = None
id2word = None
for id,row in df_exp.sort_values('coherence',ascending=False).iterrows():
  dc = {}
  for key,val in zip(list(param_grid.keys())[:-2],row[:-4]):
    dc[key]=val
  dc['per_word_topics']=True
  #Additional params
  dc['passes']=5

  print(dc)
  id2word=Dictionary(df['preprocessed'])

  # # filter irrelevant tokens
  # bad_tokens = ['s','know','think','like']
  # bad_ids = [id2word.token2id[tok] for tok in bad_tokens ]
  # id2word.filter_tokens(bad_ids=bad_ids)
  
  #filter irrelevant words
  print(row)
  id2word.filter_extremes(no_below=3,no_above=row[-3],keep_n=row[-4])
  print('Built dictionary:: keep_n',row[-4],'no_above',row[-3])
  corpus = df['preprocessed'].map(id2word.doc2bow)
  print('Built corpus')
  lda_model = get_topic_model((corpus,id2word),dc)
  break

{'chunksize': 700, 'update_every': 1, 'alpha': 'auto', 'eta': 'auto', 'decay': 0.7, 'offset': 1.0, 'eval_every': 10, 'iterations': 50, 'random_state': 100, 'per_word_topics': True, 'passes': 5}
chunksize               700
update_every              1
alpha                  auto
eta                    auto
decay                   0.7
offset                    1
eval_every               10
iterations               50
random_state            100
per_word_topics       False
keep_n                20000
no_above                0.5
perplexity           -7.858
coherence          0.713669
Name: 57, dtype: object
Built dictionary:: keep_n 20000 no_above 0.5
Built corpus


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad 

In [157]:
def format_topics_sentences(ldamodel,texts,id2word):
    # Init output
    sent_topics_df = pd.DataFrame()
    
    # Get main topic in each document
    for text in texts:
      text_bow = id2word.doc2bow(text)
      tp = ldamodel.get_document_topics(bow = text_bow)
      tp = sorted(tp, key=lambda x: x[1], reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
      for j, (topic_num, prop_topic) in enumerate(tp):
        if j == 0:  # => dominant topic
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
        else:
            break
    sent_topics_df.columns = ['Dominant_Topic', 'Dominant topic score', 'Topic_Keywords']

    # Add original text to the end of the output
    sent_topics_df = pd.concat([sent_topics_df, texts], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model,texts = df['preprocessed'],id2word = id2word)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1.0,0.3829,"car, bike, engine, drive, ride, new, cl...","[wonder, enlighten, car, day, door, sport..."
1,1,4.0,0.5322,"use, window, drive, run, work, card, do...","[fair, number, brave, soul, upgrade, si, ..."
2,2,0.0,0.4185,"game, year, team, good, play, think, wi...","[folk, mac, plus, finally, ghost, weekend..."
3,3,19.0,0.5129,"government, key, use, chip, encryption, ...","[weitek, address, phone, number, like, in..."
4,4,4.0,0.3166,"use, window, drive, run, work, card, do...","[article, tom, baker, understanding, expec..."
5,5,10.0,0.8234,"think, people, know, question, point, me...","[course, term, rigidly, define, doubt, us..."
6,6,16.0,0.2556,"book, list, post, send, mail, copy, art...","[people, respond, request, info, treatment..."
7,7,4.0,0.8298,"use, window, drive, run, work, card, do...","[know, scsi, scsi, scsi, controler, chip,..."
8,8,11.0,0.6341,"file, program, use, ftp, edu, version, ...","[win, download, icon, bmp, figure, change..."
9,9,4.0,0.3419,"use, window, drive, run, work, card, do...","[board, year, work, diskdoubler, autodoubl..."


In [158]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords.groupby('Dominant_Topic').agg('count')
topic_counts.reset_index(level=0,inplace=True)
topic_counts = topic_counts[['Dominant_Topic','Topic_Keywords']].rename(columns= {'Dominant_Topic':'Topic','Topic_Keywords':'Doc count'})
def get_keyword(topic):
  topic = int(topic)
  wp = lda_model.show_topic(topic)
  topic_keywords = ", ".join([word for word, prop in wp])
  return topic_keywords
topic_counts['Keywords']=topic_counts['Topic'].apply(get_keyword)
topic_counts.sort_values('Doc count',ascending=False)

Unnamed: 0,Topic,Doc count,Keywords
4,4.0,3485,"use, window, drive, run, work, card, do..."
10,10.0,2373,"think, people, know, question, point, me..."
15,15.0,2241,"like, know, time, gun, think, day, peop..."
0,0.0,1642,"game, year, team, good, play, think, wi..."
12,12.0,1056,"use, key, power, bit, line, number, hig..."
9,9.0,1018,"god, jesus, church, christian, bible, be..."
13,13.0,866,"israel, war, jews, government, armenian, ..."
11,11.0,840,"file, program, use, ftp, edu, version, ..."
1,1.0,784,"car, bike, engine, drive, ride, new, cl..."
16,16.0,777,"book, list, post, send, mail, copy, art..."


In [134]:
for topic in df['target'].unique():
  print(topic)

rec.autos
comp.sys.mac.hardware
comp.graphics
sci.space
talk.politics.guns
sci.med
comp.sys.ibm.pc.hardware
comp.os.ms-windows.misc
rec.motorcycles
talk.religion.misc
misc.forsale
alt.atheism
sci.electronics
comp.windows.x
rec.sport.hockey
rec.sport.baseball
soc.religion.christian
talk.politics.mideast
talk.politics.misc
sci.crypt


In [148]:
for topic in lda_model.top_topics(texts=df['preprocessed']):
  for prob,word in topic[0]:
    print(word,'->',np.round(prob,4))
  print('---------------------------------------------')

 like -> 0.0244
 know -> 0.0241
 think -> 0.0169
 time -> 0.0149
 thing -> 0.014
 good -> 0.0133
 want -> 0.0127
 s -> 0.0117
 try -> 0.0105
 look -> 0.01
 problem -> 0.0098
 come -> 0.0092
 hear -> 0.0083
 start -> 0.0078
 work -> 0.0072
 way -> 0.0067
 right -> 0.0066
 lot -> 0.0066
 tell -> 0.0066
 sure -> 0.0065
---------------------------------------------
 child -> 0.016
 come -> 0.0146
 man -> 0.0139
 know -> 0.0118
 people -> 0.0096
 tell -> 0.0095
 woman -> 0.0083
 life -> 0.0072
 live -> 0.0063
 die -> 0.0063
 time -> 0.0063
 leave -> 0.0058
 s -> 0.0058
 ask -> 0.0057
 kill -> 0.0057
 day -> 0.0057
 start -> 0.0055
 death -> 0.005
 think -> 0.0049
 look -> 0.0045
---------------------------------------------
 people -> 0.0167
 think -> 0.0147
 believe -> 0.0115
 point -> 0.0096
 question -> 0.0091
 mean -> 0.0083
 know -> 0.0079
 way -> 0.0079
 thing -> 0.007
 claim -> 0.0069
 fact -> 0.0065
 like -> 0.0061
 evidence -> 0.0061
 right -> 0.0059
 reason -> 0.0059
 true -> 0.00

In [182]:
get_coherence_score((df['preprocessed'],id2word),lda_model)

0.7415194714068376

In [180]:
for topic in df['target'].unique():
  print('Topic:',topic)
  dom_top = df_dominant_topic.iloc[df[df['target']==topic].index][['Dominant_Topic','Text']].groupby('Dominant_Topic').agg('count').sort_values('Text',ascending=False)
  dom_top.reset_index(level=0,inplace=True)
  dom_top = dom_top.rename(columns= {'Dominant_Topic':'Topic ID','Text':'Count'})
  dom_top = dom_top.astype({'Topic ID':'int64'})
  dom_top['kw'] = dom_top['Topic ID'].apply(get_keyword)
  print(dom_top.head(3))
  print('========================================================')

Topic: rec.autos
   Topic ID  Count                                                 kw
0         1    359   car,  bike,  engine,  drive,  ride,  new,  cl...
1        15    187   like,  know,  time,  gun,  think,  day,  peop...
2        12     94   use,  key,  power,  bit,  line,  number,  hig...
Topic: comp.sys.mac.hardware
   Topic ID  Count                                                 kw
0         4    711   use,  window,  drive,  run,  work,  card,  do...
1        12     48   use,  key,  power,  bit,  line,  number,  hig...
2        15     40   like,  know,  time,  gun,  think,  day,  peop...
Topic: comp.graphics
   Topic ID  Count                                                 kw
0         4    298   use,  window,  drive,  run,  work,  card,  do...
1        11    215   file,  program,  use,  ftp,  edu,  version,  ...
2        14    126   image,  color,  jpeg,  software,  graphic,  p...
Topic: sci.space
   Topic ID  Count                                                 kw
0     

In [165]:
df[df['target']=='sci.crypt'].iloc[1]['content']

"I sent a response to the White House at\n\n\t0005895485@MCIMAIL.COM (White House)\n\nand received a nice, automatic reply from MICMAIL noting, in passing, that\nif I had included a SNail address, I would get a reply in due course.\n\nFor those who care, my reply was:\n\n\t1.\tyes, let's protect the voice network\n\n\t2.\tprivately-developed crypto has always been available and\n\t\talways will be -- so let's think about how to do law\n\t\tenforcement given that fact not about how to hope to\n\t\tlegislate against it\n\n\t3.\tmy needs for crypto as a system designer are not met by the\n\t\tClipper Chip.  I want freely to export uses of algorithms\n\t\t(like DES & RSA) which are already freely available in the\n\t\tdestination country\n"

In [193]:
import re
import gensim
from gensim.parsing.preprocessing import STOPWORDS
import spacy
model = spacy.load('en')
doc = model('christians windows christ christian dogs utilitarian utility indians christs mexicans')
for token in doc:
  print(token.lemma_)

christians
window
christ
christian
dog
utilitarian
utility
indians
christ
mexicans
