In [None]:
# Prerequisites
# 1. Install spacy, gensim, nltk and scikit-learn
# 2. python -m spacy download en_core_web_sm
# 3. python -m nltk.downloader stopwords

# Import

In [42]:
import re
import pandas as pd
import numpy as np
import spacy
from pprint import pprint
import matplotlib.pyplot as plt
import joblib 
from gensim.utils import simple_preprocess
from gensim.models import Phrases, phrases

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [3]:
from nltk.corpus import stopwords

# NLTK's English stop-word list plus five additional tokens to filter out.
extra_stop_words = ['from', 'subject', 're', 'edu', 'use']
stop_words = stopwords.words('english') + extra_stop_words

# Load Data

In [4]:
# Read the newsgroups table and list the distinct values of `target_names`.
df = pd.read_csv('newsgroups.csv')
newsgroup_labels = df['target_names'].unique()
print(newsgroup_labels)
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0.1,Unnamed: 0,content,target,target_names
0,0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


# Preprocess

In [6]:
# Work on a plain Python list of the message strings.
data = df.content.values.tolist()

# Raw-string patterns: '\S' and '\s' are not valid string escapes, so the
# non-raw literals emit an "invalid escape sequence" warning on modern Python.
# Compiling once also avoids re-resolving the pattern for every document.
email_pattern = re.compile(r'\S*@\S*\s?')   # any token containing "@", plus one trailing whitespace char
whitespace_pattern = re.compile(r'\s+')     # runs of whitespace, newlines included

# Remove e-mail addresses
data = [email_pattern.sub('', sent) for sent in data]

# Collapse newlines and repeated whitespace into single spaces
data = [whitespace_pattern.sub(' ', sent) for sent in data]

# Remove distracting single quotes
data = [sent.replace("'", "") for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [20]:
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Every item is coerced to ``str`` before tokenizing. ``deacc=True`` asks
    ``simple_preprocess`` to strip accent marks (per the gensim docs);
    punctuation is dropped by the tokenizer itself.

    Yields:
        One list of tokens per input document, in input order.
    """
    yield from (simple_preprocess(str(doc), deacc=True) for doc in sentences)

# Materialise the token generator so documents can be indexed and reused.
data_words = [tokens for tokens in sent_to_words(data)]

print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [27]:
# Train the phrase (bigram) detector on the tokenized documents.
# A higher threshold yields fewer phrases.
bigram = Phrases(data_words, threshold=100, min_count=5)

# Wrap the trained model in a Phraser, the faster way to rewrite a token list with bigrams.
bigram_mod = phrases.Phraser(bigram)

# Show the phrase model applied to the first document
first_doc_bigrams = bigram_mod[data_words[0]]
print(first_doc_bigrams)

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting', 'host', 'rac_wam', 'umd_edu', 'organization', 'university', 'of', 'maryland_college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [28]:
# Define functions for stopwords, bigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()`` and
            ``simple_preprocess`` before filtering.

    Returns:
        A list with one list of surviving tokens per document.
    """
    # Hash-based lookup: scanning the stop-word list for every token is
    # linear in the list length. Built per call so later edits to
    # ``stop_words`` are still honoured.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to every document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in input order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Args:
        texts: Iterable of token lists. Each list is joined with single spaces
            and parsed by spaCy.
        allowed_postags: Coarse POS tags (``token.pos_``) to keep. The default
            list is only read, never mutated.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    # Load the spaCy 'en_core_web_sm' pipeline without the parser and NER
    # components, which this function does not use.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Stream the documents through nlp.pipe so spaCy can batch them instead of
    # running the pipeline once per document.
    parsed_docs = nlp.pipe(" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in parsed_docs
    ]

In [29]:
# Stage 1: drop stop words from the tokenized documents
data_words_nostops = remove_stopwords(data_words)

# Stage 2: merge detected phrases into single bigram tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Stage 3: lemmatize, keeping only nouns, adjectives, verbs and adverbs
content_pos_tags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=content_pos_tags)

print(data_lemmatized[:1])

[['thing', 'car', 'nntp_poste', 'host', 'rac_wam', 'park', 'line', 'wonder', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'door', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


# Vectorize

In [35]:
# Re-join each lemma list into one space-separated string per document.
corpus = list(map(' '.join, data_lemmatized))
corpus[:1]

['thing car nntp_poste host rac_wam park line wonder enlighten car see day door sport car look late early call door really small addition front_bumper separate rest body know tellme model name engine spec year production car make history info funky look car mail thank bring neighborhood lerxst']

In [37]:
# TF-IDF features over the lemmatized corpus.
# min_df=2 / max_df=0.95 are the vectorizer's document-frequency cut-offs.
tfidf_vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
)
tfidf = tfidf_vectorizer.fit_transform(corpus)

# Topic Modelling

In [39]:
# n_components: number of topics to extract.
# n_top_words: default number of terms listed per topic.
n_components, n_top_words = 20, 20

In [40]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its highest-weighted terms.

    Args:
        model: Fitted model exposing ``components_``, iterated one row of term
            weights per topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its term.
        n_top_words: Number of highest-weighted terms plotted per topic.
        title: Figure-level title.

    The grid is five panels wide and grows by rows to fit every topic. The
    earlier fixed 2x5 grid raised IndexError for any model with more than ten
    topics. For ten topics the layout and figure size are unchanged.
    """
    n_topics = len(model.components_)
    n_cols = 5
    n_rows = max(1, -(-n_topics // n_cols))  # ceiling division, at least one row

    # squeeze=False always yields a 2-D array of axes, so flatten() is safe
    # even for a single-row grid.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()  # put the heaviest term at the top
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    # Hide the unused panels of a partially filled last row.
    for ax in axes[n_topics:]:
        ax.set_visible(False)

    # Set once, rather than on every loop iteration.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

In [44]:
def get_model_topics(model, vectorizer, n_top_words=n_top_words):
    """Tabulate the highest-weighted terms of every topic.

    Args:
        model: Fitted model exposing ``components_``, one row of term weights
            per topic.
        vectorizer: Fitted vectorizer whose feature names label the columns of
            ``components_``.
        n_top_words: Number of terms to list per topic. The default is the
            module-level ``n_top_words`` as it was when this function was
            defined.

    Returns:
        A DataFrame with one column per topic index and ``n_top_words`` rows,
        terms ordered from highest to lowest weight.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement, but keep working on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    word_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        word_dict[topic_idx] = [feature_names[i] for i in top_features_ind]

    return pd.DataFrame(word_dict)

In [52]:
def get_inference(model, vectorizer, text, threshold=0, topics=list(range(n_components))):
    """Score one document against the fitted topic model.

    Args:
        model: Fitted topic model exposing ``transform``.
        vectorizer: Fitted vectorizer used to turn ``text`` into features.
        text: The document to score.
        threshold: A topic is kept as a label only if its score is strictly
            greater than this value.
        topics: Label for each topic position. The default list is only read.

    Returns:
        ``(best_topic, score, labels)``, where ``score`` is the raw output of
        ``model.transform`` and ``labels`` is the set of topics whose score
        exceeds ``threshold``. When no topic exceeds it, the result is
        ``('None', -1, set())``.
    """
    doc_vector = vectorizer.transform([text])
    score = model.transform(doc_vector)

    labels = {
        topics[idx]
        for idx, weight in enumerate(score[0])
        if weight > threshold
    }
    if len(labels) == 0:
        return 'None', -1, set()

    best_position = np.argmax(score)
    return topics[best_position], score, labels

## NMF

In [49]:
# Factorise the TF-IDF matrix into `n_components` topics.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 in favour
# of `alpha_W` / `alpha_H`, which are scaled differently. Confirm the installed
# scikit-learn version still accepts it before upgrading.
nmf_params = dict(
    n_components=n_components,
    init='nndsvd',
    alpha=.1,
    l1_ratio=.5,
    random_state=42,
)
nmf = NMF(**nmf_params).fit(tfidf)

In [50]:
# Top 10 terms per NMF topic. Left as the cell's last expression so the notebook
# renders the DataFrame as a table, instead of print()'s wrapped plain text.
get_model_topics(nmf, tfidf_vectorizer, n_top_words=10)

        0            1       2        3        4                5          6   \
0      say       window    team    drive      key             scsi        car   
1   people           do  player     disk      bit              ide     engine   
2    write          run    play     boot     chip           device        buy   
3    think      program    year   jumper  encrypt       controller     dealer   
4  article  application     win     tape   secret              bit      price   
5       go      problem  season  problem   public        interface      model   
6     make         font  hockey   switch   escrow             fast       mile   
7     know          use    good   format      use              bus  insurance   
8      see       screen     fan      ide  message  scsi_controller     saturn   
9      get    ms_window     nhl   floppy   system             chip      speed   

        7            8             9          10         11              12  \
0     card          gun      

In [91]:
# Sanity check: score the first processed document and show its strongest topic score.
first_doc = corpus[0]
topic, score, _ = get_inference(nmf, tfidf_vectorizer, first_doc)
print(topic, score)
np.max(score)

6 [[2.06746787e-03 0.00000000e+00 1.40370995e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 1.50892832e-01 0.00000000e+00
  0.00000000e+00 1.66724937e-02 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]]


0.15089283214659036

In [92]:
# Best topic and its score for every document in the corpus.
# NOTE(review): this runs one model.transform call per document. A single batched
# nmf.transform(tfidf) would be far faster, but may differ slightly in the low
# digits, so it is not swapped in here.
topics, nmf_scores = [], []
for doc_text in corpus:
    best_topic, doc_score, _ = get_inference(nmf, tfidf_vectorizer, doc_text)
    topics.append(best_topic)
    nmf_scores.append(np.max(doc_score))

In [93]:
# Collect the raw text, processed text, assigned topic and score side by side.
original_texts = df.content.values.tolist()
df_data = [original_texts, corpus, topics, nmf_scores]  # not read anywhere in the visible notebook
df_nmf = pd.DataFrame({
    'original_text': df.content.values.tolist(),
    'processed_text': corpus,
    'nmf_topic': topics,
    'topic_score': nmf_scores,
})
df_nmf

Unnamed: 0,original_text,processed_text,nmf_topic,topic_score
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,thing car nntp_poste host rac_wam park line wo...,6,0.150893
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,final call summary final call si clock report ...,9,0.022004
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,question organization purdue_university engine...,9,0.030069
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,division line distribution_world nntp_poste ho...,9,0.032953
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,question organization smithsonian_astrophysica...,9,0.016379
...,...,...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,jim_zisfein migraine scan distribution_world o...,0,0.018571
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,screen death line organization old problem scr...,3,0.029737
11311,From: westes@netcom.com (Will Estes)\nSubject:...,este mount cpu cooler case organization versio...,9,0.007528
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,steven collin sphere point organization centra...,13,0.041650


In [94]:
# Overwrite rather than append: with mode='a' every re-run of this cell added
# another full copy of the table (header row included) to the same file.
# NOTE(review): the index is still written as an unnamed first column, which
# read_csv loads back as "Unnamed: 0". Pass index=False if downstream readers
# do not need it.
df_nmf.to_csv('nmf_result.csv')

## LSA

## LDA