In [None]:
#Charles Swedensky
#CSC570 Data Science Essentials
#Module 1 Week 4: Latent Semantic Analysis
#26 Jan 2017

In [48]:
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
#restrict the download to the single newsgroup under study
categories = ['talk.politics.guns']
#subset='all' asks for every available post in that group; the fixed
#random_state keeps the shuffled document order the same on every run
dataset = fetch_20newsgroups(
    categories=categories,
    subset='all',
    shuffle=True,
    random_state=42,
)
#raw post texts (headers included), one string per document
gunCorpus = dataset.data

In [49]:
#get stopwords
#downloads NLTK's 'stopwords' package into the local nltk_data directory if it
#is missing; the English list is read from it later via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Charles\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [50]:
#sanity check on corpus size -- presumably one filename per downloaded post
dataset.filenames.shape

(910L,)

In [51]:
#just checking the contents to see if I've got the right corpus
#displays the first raw post, headers and quoted replies included
gunCorpus[0]

u'From: pat@rwing.UUCP (Pat Myrto)\nSubject: Re: FBI Director\'s Statement on Waco Standoff\nDistribution: misc.legal,tx.general,tx.politics.talk.politics.guns,alt.law-enforcement\nOrganization: Totally Unorganized\nLines: 36\n\n\nIn article <C5w0C9.2D0@intellection.com> emcguire@intellection.com (Ed McGuire) writes:\n>In <1993Apr21.182458.12735@aio.jsc.nasa.gov> news&aio.jsc.nasa.gov (USENET) News (brenda kenworthy) writes:\n>\n>>And another thing that puzzles\n>>me--why are they finding dead bodies inside who had bullet holes already in \n>>them???  Don\'t you think it\'s possible that Koresh shot the TRAITORS rather \n>>than letting them out???\n>\n>Possible.  I wouldn\'t put it past him.  It is also possible that they\n>were hit by rounds exploding in the extreme heat.  Remember that kept\n>the cops away for hours.  I have only heard that bodies were found\n>shot, not any coroner\'s cause of death.\n\nSo far, the medical examiner (according to the news) has found NO EVIDENCE\nof gu

In [52]:
#the sample post is clearly on-topic, so move on to the stopwords:
#NLTK's English list merged with corpus-specific noise tokens
#(URL/e-mail address fragments plus a few other terms chosen to be ignored)
stopset = set(stopwords.words('english')) | {
    'com', 'edu', 'www', 'http', 'https',
    'sw', 'uiuc', 'nntp', 'cs', 'cdt',
    'wpi', 'dave', 'david', 'org', 'batf',
}

In [53]:
#ngram_range -- check for important phrases up to three-words long
#stop_words is passed as a list rather than a set: scikit-learn releases that
#validate constructor parameters only accept 'english', a list, or None here,
#while older releases handle a list exactly like the set they received before
vectorizer = TfidfVectorizer(stop_words=list(stopset), use_idf=True, ngram_range=(1, 3))
#learn the vocabulary from the posts and build the sparse TF-IDF matrix
#(one row per document, one column per 1- to 3-gram term)
X = vectorizer.fit_transform(gunCorpus)

In [54]:
#weighted values for the bag of words in the corpus, minus stopwords
#each printed entry is (document index, term index) followed by its TF-IDF weight
print(X[0])

  (0, 88877)	0.0336220696118
  (0, 207537)	0.0336220696118
  (0, 205288)	0.0336220696118
  (0, 107994)	0.0336220696118
  (0, 221868)	0.0336220696118
  (0, 112135)	0.0336220696118
  (0, 212826)	0.0336220696118
  (0, 219856)	0.0336220696118
  (0, 234022)	0.0336220696118
  (0, 158105)	0.0336220696118
  (0, 185629)	0.0336220696118
  (0, 161810)	0.0336220696118
  (0, 225134)	0.0336220696118
  (0, 218908)	0.0336220696118
  (0, 80754)	0.0336220696118
  (0, 73251)	0.0336220696118
  (0, 228124)	0.0336220696118
  (0, 189448)	0.0336220696118
  (0, 144293)	0.0336220696118
  (0, 158093)	0.0336220696118
  (0, 5665)	0.0336220696118
  (0, 220389)	0.0336220696118
  (0, 166796)	0.0336220696118
  (0, 234400)	0.0336220696118
  (0, 225112)	0.0336220696118
  :	:
  (0, 128400)	0.00632237600005
  (0, 222361)	0.0322143660337
  (0, 216814)	0.0263503780885
  (0, 154527)	0.00654127187709
  (0, 74184)	0.0231611193264
  (0, 123297)	0.0160983912251
  (0, 20424)	0.0303995204763
  (0, 99531)	0.0146191839217
  (0, 2095

In [55]:
#documents x terms
#i.e. (number of posts, number of distinct 1- to 3-gram features kept by the vectorizer)
X.shape

(910, 241301)

In [56]:
#truncated singular value decomposition
#decompose matrix X
#random_state is pinned because the default 'randomized' solver is stochastic:
#with random_state=None the concepts can differ from one run to the next
#(42 matches the seed already used when loading the dataset)
#NOTE: n_components=910 equals the number of documents, so very little is
#actually truncated and, together with n_iter=100, the fit is slow; only the
#first 25 concepts are printed further down
lsa = TruncatedSVD(n_components=910, n_iter=100, random_state=42)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=910, n_iter=100,
       random_state=None, tol=0.0)

In [57]:
#that took awhile
#list the calculated importance for the first concept
#(one weight per term; presumably in the same order as the vectorizer's feature names)
lsa.components_[0]

array([  5.54065116e-03,   5.02554557e-04,   5.02554557e-04, ...,
         7.72209123e-06,   7.72209123e-06,   7.72209123e-06])

In [60]:
#correlate the calculated weights to their actual English words
#get_feature_names() no longer exists in current scikit-learn releases, which
#provide get_feature_names_out() instead; use whichever this install offers
#(list() keeps `terms` a plain list in both cases)
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()
#just print the first 25 concepts -- 910 is too many
#(slicing stops after 25 instead of walking every component only to skip it)
for i, comp in enumerate(lsa.components_[:25]):
    #pair each term with its weight in this concept, then keep the 10 heaviest
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")

Concept 0:
stratus
gun
would
people
fbi
writes
article
never
fire
one
 
Concept 1:
stratus
rocket stratus
irvine
rocket
cso
electric
stove
stratus tavares
tavares
brent
 
Concept 2:
handheld
never
roby
jmd
udel
never never
jim
fbi
chopin
chopin udel
 
Concept 3:
irvine
cso
indiana
brent
brent irvine
irvine uxh
irvine uxh cso
uxh
uxh cso
electric
 
Concept 4:
fire
oldham
feustel
blast
proper equipment
compound
roby
equipment
waco
fbi
 
Concept 5:
roby
udel
betz
chopin
chopin udel
gozer
idbsu
roby chopin
roby chopin udel
fbi
 
Concept 6:
feustel
feustel netcom
netcom
dseg
ti
pyron
government
netcom feustel
dseg ti
skndiv
 
Concept 7:
dseg
ti
pyron
dseg ti
skndiv
skndiv dseg
skndiv dseg ti
pyron skndiv
pyron skndiv dseg
dillon pyron
 
Concept 8:
indiana
ucs
ucs indiana
silver ucs
silver ucs indiana
funny
silver
psych indiana
psych
nate
 
Concept 9:
kratz
uic
uicvm
andy
jason
gang
uicvm uic
auto
stanford
semi
 
Concept 10:
manes
000
rate
gun
linknet
magpie
magpie linknet
uk
deaths
handgun


### Some familiar patterns emerged from the concept groups. The most impressive one to me is the "gun buy back" three-word phrase. Some of the other relationships the TF-IDF tied together were "gun safety" and "paranoid freaks". There are also some flashes of insight in words like "militia", "Waco", and "government". 

### I tried my best to remove as many URL-related stopwords as possible, but I think they still had some impact. Some seemingly meaningless numbers appeared in some of the concept groups, as well as what seem to be the names of universities, which might be indicative of in-work citations made by the authors of these writings.

### Practical uses of this might include categorizing disorganized text data, or serving as a tool to keep a data mining algorithm "interested" in the right subject matter.