In [480]:
#load libraries
import pandas as pd
import numpy as np
import re
from tqdm import tqdm_notebook as tqdm
from collections import Counter
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import scipy.sparse as ss
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.datasets import fetch_20newsgroups

In [481]:
#Load and Format Data


#Categories to load from 20NewsGroups Data Set 
categories = ['comp.graphics',
             'comp.sys.ibm.pc.hardware',
             'misc.forsale',
             'rec.motorcycles',
             'rec.sport.baseball',
             'sci.crypt',
             'sci.electronics',
             'sci.med',
             'sci.space',
             'soc.religion.christian',
             'talk.politics.guns',
             'talk.politics.mideast']

#Load Data
twenty_train = fetch_20newsgroups(subset='train',
                                  remove=('headers', 'footers', 'quotes'),
                                  categories = categories,
                                  shuffle=True, 
                                  random_state=42)

#Convert corpus into pandas data frame
corpus = pd.DataFrame(twenty_train.data,columns = ['text'])
corpus.head()

Unnamed: 0,text
0,"Hello All,\n\nI have a PC Transporter for sale..."
1,\nI've been saying that for at least 2 years n...
2,"\nNevertheless, DWI is F*ckin serious. Hope y..."
3,\n\nthe last arab country was syria. but not a...
4,\n ##flame thrower on## \n Well I don't want m...


In [482]:
#Initialize Preprocessing lists 
#Initialize N-Grams List
ngrams_list = ['united states','new york','law enforcement','los angeles','hard disk',
               'power play','mailing list','serialnumber','health care','window manager',
               'email address','white house','gun control','san francisco','san jose',
               'gordon banks','washington dc','st louis','years old','public key',
               'source code','vocal cord','second amendment','video card','jesus christ',
               'power supply','human rights','last night','young people','public domain',
               'medical newsletter','world war','floppy disk','image processing',
               'senior administration','volumn number','administration official',
               'holy spirit','space shuttle','hockey league','bear arms','electronic mail',
               'third person','space station','federal government',
               'armenian government','san diego','us government','tampa bay','vice president',
               'new testament','high speed','soviet union','private sector']

#Additional corpus specific stopwords
additional_stopwords = ['one','would','people','like','get','dont','know','also',
                        'use','u','make','say','year','could','x','may','good','well',
                        'im','even','new','see','way','thing','right','two','first',
                        'much','many','want','need','go','used','said','question','anyone',
                        'take','come','something','bit','since','using','going','back',
                        'look','really','still','must','might','help','b','got','last',
                        'please','ive','give','sure','cant','without','set','never',
                        'better','another','didnt','doesnt','c','someone','etc','thats',
                        'put','try','least','however','anything','every','second','do',
                       'a','b','c','d','e','f','g','h','i','j','k','l','m','n'
                        ,'o','p','q','r','s','t','u','v','w','x','y','z','let','lets']

#Initialize Word Lemmatizer
lemmatizer = WordNetLemmatizer() 

#Load general NLTK Stopwords
stopwords_list = list(stopwords.words('english'))

#Create single list of Stopwords
stopwords_list.extend(additional_stopwords)

In [483]:
def convert_ngrams(ngrams):
    '''
    Input: List of pairs of n-grams
    Output: Dictionary of ngram pairs to single token (EX: {stop word: stop_word})
    
    '''
    ngram_dict = dict({})
    for i in ngrams:
        ngram_dict.update({i:re.sub(' ','_',i)})
        
    return(ngram_dict)

#Create dictionary of n-grams
ngrams_dict = convert_ngrams(ngrams_list)

In [484]:
def preprocess(line,stopwords_list = stopwords_list,ngrams_dict = ngrams_dict):
    '''
    Input: Single document from corpus
    Output: cleaned document 
    
    Steps:
        1. Removes numbers, new lines, and tabs. Brings to lowercase
        2. Removes punctuation
        3. Removes extra whitespace
        4. Introduce n-grams
        5. Remove stopwords and lemmatize
        6. Strip whitespace
    '''
    #remove all numbers, tabs, and new lines
    line = re.sub('\d+|\n|\t',' ',line.lower())
    #remove all punctuation
    line = re.sub('([!"#$%&\'()*\+,-./:;<=>\?\@\[\\]\^_`{|}~])', ' ', line)
    #remove extra white space
    line = re.sub('\s{2,}', ' ', line)
    #introduce n-grams
    for word_pair in ngrams_dict:
        line = re.sub(word_pair,ngrams_dict[word_pair],line)
   #remove stopwords and lemmatize
    line = ' '.join([lemmatizer.lemmatize(i) for i in line.split(' ') if i not in stopwords_list])
    line = line.strip()
    return(line)

In [485]:
#Apply Preprocessing
corpus['clean_text'] = corpus['text'].apply(preprocess)

In [486]:
#Secondary preprocessing step
'''
Input: Cleaned Corpus
Output: Cleaned Corpus with only words that occur more than 5 times

Purpose: removes low frequency words. Minimal information loss for major performace boost
'''

#Create list of all words in corpus
words = list()
for i in tqdm(corpus['clean_text']):
    words.extend(i.split(' '))

#Get frequency counts of all unique words
counted_words = Counter(words).most_common()[::-1]

#Iterate and create vocab of high frequency words (More than 5 occurances)
vocab = list()
for word,count in tqdm(counted_words):
    if count > 5:
        vocab.append(word)

def low_frequency_word_removal(doc, vocab = vocab):
    '''
    Input: Document and vocabulary
    Output: Document with only words in the vocabulary
    '''
    return(' '.join([i for i in doc.split(' ') if i in vocab]))

#Apply low frequency word removal
corpus['final_text'] = corpus['clean_text'].apply(low_frequency_word_removal)

HBox(children=(IntProgress(value=0, max=7036), HTML(value='')))

HBox(children=(IntProgress(value=0, max=45665), HTML(value='')))

In [487]:
#initialize vectorizer, no n-grams (We already did this earlier)
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

#Vectorize corpus of cleaned data
X = vectorizer.fit_transform(corpus['final_text'] ).toarray()

#Gets the vocabulary of the corpus
vocab = np.array(vectorizer.get_feature_names())

#Convert to Spacy Matrix for input into topic model 
X = ss.csr_matrix(X)

In [498]:
#Initialize anchor words
#create a dictionary of category to topic words

anchor_topic_dict = dict({
                            'comp.graphics':['image','file','graphic','jpeg','format','color','system'
                                             ,'software','data','version','gif','program'],
    
                            'comp.sys.ibm.pc.hardware':['drive','scsi','card','system','mb','controller',
                                                        'problem','bus','ide','pc','work','driver','disk'],
    
                            'misc.forsale':['sale','offer','price','shipping','game','condition','drive'
                                            ,'sell','manual','mail'],
    
                            'rec.motorcycles':['bike','dod','motorcycle','ride','time','helmet','rider',
                                               'road','cop','court','driver','ticket','lawyer','speeding','vehicle'],
    
                            'rec.sport.baseball':['game','team','run','player','hit','baseball','time','win',
                                                  'league','season','cub','play','pitching','pitcher'],
    
                            'sci.crypt':['key','encryption','db','chip','system','privacy','security',
                                         'information','message','algorithm','number','data','file'], 
    
                            'sci.electronics':['wire','circuit','ground','power','work','current',
                                             'line','wiring','voltage','amp','outlet','time','chip',
                                             'number','data','system','radar','mhz','work','number',
                                               'time','information'],
    
                            'sci.med':['patient','time','disease','food','health','doctor','problem',
                                       'study','medical','pain','cancer','research','information'], 
    
                            'sci.space':['space','nasa','launch','satellite','system','time','orbit',
                                         'mission','earth','data','program','lunar','moon','rocket'], 
    
                            'soc.religion.christian':['christian','jesus','church','think',
                                                      'time','believe','faith','bible','christ','life',
                                                      'truth','belief','law','scripture','father',
                                                      'hell','son','love','paul','catholic','christianity'], 
    
                            'talk.politics.guns':['government','criminal','handgun','bill','fire','police',
                                                  'defence','case','militia','gun_control'], 
    
                            'talk.politics.mideast':['armenian','israel','turkish','jew','israeli',
                                                     'time','arab','turkey','greek','turk','state','muslim',
                                                     'armenia','woman','government','killed','jewish',
                                                     'village','azerbaijani','russian','soldier','palestinian',
                                                     'country']
                           })


In [499]:
#Create list of list of topics + words, also create mapping of topic to topic number
anchor_topic_numbers = dict({})
num_to_topic = dict({})
anchor_topics = list()
for i,topic in enumerate(anchor_topic_dict):
    anchor_topic_numbers.update({topic:i})
    num_to_topic.update({i:topic})
    anchor_topics.append(anchor_topic_dict[topic])

In [500]:
#Run Topic Model
#Number of topics
n_Tot_clusters = 12

#COREX is highly dependent on initialization, repeat 4 times and average results 
for i in tqdm(range(4)):
    #Initialize topic model 
    topic_model = ct.Corex(n_hidden=n_Tot_clusters)  
    #Fit topic model 
    topic_model.fit(X, words=vocab, anchors=anchor_topics, anchor_strength=3)
    #Store results
    if i == 0:
        max_topic_temp = topic_model.p_y_given_x
    else:
        max_topic_temp += topic_model.p_y_given_x
#Average Results
max_topic_temp = max_topic_temp/4

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

In [501]:
#get topic assignment 
max_topic = np.argmax(max_topic_temp,axis = 1)

#If the model is completely, uncertain, we assign to noise
noise_index = np.where(np.max(max_topic_temp,axis = 1) == 0.0)

#Add topic assignments to documents
corpus['topic_guess'] = max_topic
corpus['topic_guess'].loc[noise_index] = 13

In [None]:
def get_topic_number(assignment,anchor_topic_numbers = anchor_topic_numbers):
    '''
    Input: Topic Assignment
    Output: Topic Number
    '''
    return(anchor_topic_numbers[assignment])
#Get topic numbers from topic assignments
assignment = list()
for i in range(corpus.shape[0]):
    assignment.append(twenty_train.target_names[twenty_train.target[i]])
#Add true topic and topic number to dataset 
corpus['assignment'] = assignment
corpus['topic_number'] = corpus['assignment'].apply(get_topic_number)

In [511]:
#Display the topic words
#number of topic words to see
num_top_words=4
#gathering topics from corex topic model extraction 
topics = topic_model.get_topics()

#Iterate through the topics to print the words
topic_words_number = list()
for topic_n,topic in enumerate(topics):
    #Get topic words and word parameters
    words,mis = zip(*topic)
    #sort by parameter strength
    index = np.argsort(mis)
    #get the index of top words, only if they are as 
    index = [i for i,j in zip(index,mis)]
    #create string of topic words
    topic_str = num_to_topic[topic_n]+':   '+', '.join(temp[:num_top_words])
    print(topic_str)

comp.graphics:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
comp.sys.ibm.pc.hardware:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
misc.forsale:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
rec.motorcycles:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
rec.sport.baseball:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
sci.crypt:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
sci.electronics:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
sci.med:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
sci.space:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
soc.religion.christian:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
talk.politics.guns:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011
talk.politics.mideast:   armenian:0.016, israel:0.015, israeli:0.012, arab:0.011


In [512]:
from sklearn.metrics import classification_report
target_names = [i for i in anchor_topic_dict]
target_names.append('noise')
print(classification_report(corpus['topic_number'].values, 
                            corpus['topic_guess'].values,
                            target_names = target_names))

                          precision    recall  f1-score   support

           comp.graphics       0.71      0.66      0.68       584
comp.sys.ibm.pc.hardware       0.64      0.78      0.70       590
            misc.forsale       0.64      0.67      0.65       585
         rec.motorcycles       0.43      0.90      0.58       598
      rec.sport.baseball       0.91      0.82      0.86       597
               sci.crypt       0.88      0.72      0.79       595
         sci.electronics       0.84      0.46      0.59       591
                 sci.med       0.84      0.70      0.76       594
               sci.space       0.89      0.70      0.78       593
  soc.religion.christian       0.85      0.90      0.87       599
      talk.politics.guns       0.80      0.71      0.75       546
   talk.politics.mideast       0.92      0.79      0.85       564

             avg / total       0.78      0.73      0.74      7036



                          precision    recall  f1-score   support

           comp.graphics       0.71      0.66      0.68       584
comp.sys.ibm.pc.hardware       0.64      0.78      0.70       590
            misc.forsale       0.64      0.67      0.65       585
         rec.motorcycles       0.43      0.90      0.58       598
      rec.sport.baseball       0.91      0.82      0.86       597
               sci.crypt       0.88      0.72      0.79       595
         sci.electronics       0.84      0.46      0.59       591
                 sci.med       0.84      0.70      0.76       594
               sci.space       0.89      0.70      0.78       593
  soc.religion.christian       0.85      0.90      0.87       599
      talk.politics.guns       0.80      0.71      0.75       546
   talk.politics.mideast       0.92      0.79      0.85       564

             avg / total       0.78      0.73      0.74      7036



comp.graphics:   file:0.012, format:0.007, image:0.006, gif:0.005, graphic:0.004
comp.sys.ibm.pc.hardware:   drive:0.017, scsi:0.013, mb:0.011, card:0.011, controller:0.010
misc.forsale:   sale:0.008, offer:0.008, shipping:0.008, condition:0.004, manual:0.004
rec.motorcycles:   bike:0.021, motorcycle:0.006, helmet:0.006, ride:0.006, dod:0.005
rec.sport.baseball:   team:0.009, game:0.009, player:0.007, pitching:0.006, pitcher:0.005
sci.crypt:   key:0.023, encryption:0.013, chip:0.009, privacy:0.004, security:0.004
sci.electronics:   amp:0.003, dsl:0.009, chastity:0.009, jxp:0.009, shameful:0.009
sci.med:   food:0.006, patient:0.006, disease:0.006, doctor:0.005, pain:0.003
sci.space:   space:0.009, nasa:0.008, orbit:0.007, launch:0.005, moon:0.005
soc.religion.christian:   christian:0.012, jesus:0.012, church:0.009, bible:0.008, christ:0.007
talk.politics.guns:   criminal:0.004, militia:0.004, handgun:0.003, gun:0.009, gun_control:0.002
talk.politics.mideast:   armenian:0.016, israel:0.0

In [508]:
temp = list()
count = 0
topic_test = 'rec.motorcycles'
for text,topic,guess in zip(corpus['final_text'],corpus['assignment'],corpus['topic_guess'] ):
    if topic ==  topic_test and topic_test != num_to_topic[guess]:
        temp.extend(text.split(' '))
        count += 1
        print(text)
        print()

Counter(temp).most_common()

think subject generates contradictory advice traffic law enforcement everybody opinion dead certain yet information extremely difficult cop traffic school instructor vehicle code tell part story judge choose interpret law wide variety way public large seldom hear advice disagree experienced believe suggest copy vehicle code study sit day court happens read fight ticket miss little section end say chance lousy basically screwed guessed pretty system california carefully prepared court bringing witness revealing serious hole officer story maximum fine plus assessment message clear judge appreciate john public trying case advice find ticket traffic school serious matter lawyer lawyer present exact case difference sentence

thank everyone took time respond post fighting ticket wrote successfully fought case court others lost due cop outright lying judge circumstance surrounding ticket fellow lost judge appear mood several suggested obtain book called fight ticket general theme prepared res

[('ticket', 17),
 ('court', 14),
 ('cop', 12),
 ('lawyer', 12),
 ('driver', 11),
 ('insurance', 9),
 ('judge', 8),
 ('boot', 8),
 ('sale', 7),
 ('car', 7),
 ('fight', 6),
 ('speeding', 6),
 ('pay', 6),
 ('traffic', 5),
 ('law', 5),
 ('vehicle', 5),
 ('witness', 5),
 ('show', 5),
 ('piece', 5),
 ('code', 4),
 ('story', 4),
 ('day', 4),
 ('say', 4),
 ('case', 4),
 ('guilty', 4),
 ('u', 4),
 ('edition', 4),
 ('edu', 4),
 ('asking', 4),
 ('advice', 3),
 ('little', 3),
 ('end', 3),
 ('chance', 3),
 ('system', 3),
 ('officer', 3),
 ('called', 3),
 ('around', 3),
 ('list', 3),
 ('radar', 3),
 ('price', 3),
 ('sold', 3),
 ('shape', 3),
 ('later', 3),
 ('look', 3),
 ('trouble', 3),
 ('limit', 3),
 ('contact', 3),
 ('city', 3),
 ('cover', 3),
 ('friend', 3),
 ('drive', 3),
 ('cost', 3),
 ('fault', 3),
 ('crappy', 3),
 ('based', 3),
 ('condition', 3),
 ('maybe', 3),
 ('looked', 3),
 ('citizen', 3),
 ('jonathan', 3),
 ('posted', 3),
 ('crime', 3),
 ('think', 2),
 ('opinion', 2),
 ('extremely', 2),

In [506]:
count

59