In [5]:
# Import statements
import numpy as np
import pandas as pd
import os
import sys
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prachal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/prachal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/prachal/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [10]:
# Load the data into pandas frame
data_frame = pd.DataFrame()
for i in range(1,4,1):
    try:
        
        path = '/Users/prachal/Desktop/Study/SJSU/Sem-2/255/Group project/data/articles'+str(i)+'.csv'
        if os.path.exists(path):
            chunk_list = []
        reader_obj = pd.read_csv(path,chunksize=10000) 
        for chunk in reader_obj:
            chunk_list.append(chunk)    
        data_frame = pd.concat([data_frame,pd.concat(chunk_list).drop(['Unnamed: 0'],axis=1)],ignore_index = True)
    except:
        # handle the file not found error
        print(sys.exc_info())
    
print(data_frame.columns)
print(data_frame.shape)

Index(['id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url',
       'content'],
      dtype='object')
(142570, 9)


In [11]:
data_frame['content'][0]

'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been 

## Display a simple text

In [13]:
sample_content = data_frame['content'][10]
print(sample_content)

With Donald J. Trump about to take control of the White House, it would seem a dark time for the renewable energy industry. After all, Mr. Trump has mocked the science of global warming as a Chinese hoax, threatened to kill a global deal on climate change and promised to restore the coal industry to its former glory. So consider what happened in the middle of December, after investors had had a month to absorb the implications of Mr. Trump’s victory. The federal government opened bidding on a tract of the ocean floor off New York State as a potential site for a huge wind farm. Up, up and away soared the offers  —   interest from the bidders was so fevered that the auction went through 33 rounds and spilled over to a second day. In the end, the winning bidder offered the federal Treasury $42 million, more than twice what the government got in August for oil leases  —   oil leases  —   in the Gulf of Mexico. Who won the bid? None other than Statoil, the Norwegian oil company, which is in

# Tokenization
### Tokenizations is the process of separating each and every small letter of the sentence.

# Removal of Stop Words: 
### In this process we are also eliminating the stop words in order to extract only words 


In [14]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
def tokenization(content):
    tokenizer = RegexpTokenizer(r'\w+')
    token_list = tokenizer.tokenize(content.lower())
    return token_list
token_list = tokenization(sample_content)
# print(token_list)
print(len(token_list))

1184


In [15]:
import nltk
words = set(nltk.corpus.words.words())

def remove_non_english(content):
    return " ".join(w for w in nltk.wordpunct_tokenize(content) 
            if w.lower() in words or not w.isalpha())

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prachal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
def stop_words_filter(token_list):
    stopword_set = set(stopwords.words('english'))    
    filtered_tokens = []
    for token in token_list:
        if token not in stopword_set and token.isnumeric() == False:
            filtered_tokens.append(token)
    return filtered_tokens

filtered_tokens = stop_words_filter(token_list)
# print(filtered_tokens)

### As you can see we were able to remove a lot of unnecessary words from the tokens

### Stemming and lemmatization
#### For grammatical reasons, documents are going to use different forms of a word, such as organize, organizes, and organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy, democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these words to return documents that contain another word in the set.

#### The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. For instance:

#### &emsp;  &emsp; am, are, is $\Rightarrow$ be
#### &emsp;  &emsp; car, cars, car's, cars' $\Rightarrow$ car
#### The result of this mapping of text will be something like:
#### &emsp;  &emsp; the boy's cars are different colors $\Rightarrow$
#### &emsp;  &emsp; the boy car be differ color
#### However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma . If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw depending on whether the use of the token was as a verb or a noun. The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma. Linguistic processing for stemming or lemmatization is often done by an additional plug-in component to the indexing process, and a number of such components exist, both commercial and open-source.

#### For more information refer: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

### NOTE: In the below method if you uncomment the two print lines you will be able to see what this function is doing. This will help to improve the performance of count-based clustering techniques. Also it will reduce the size of sparse matrix.

#### Example there are many examples which are being lemmatized like 
##### eg 1 . rounds => round 
##### eg 2 . leases => lease
##### eg 3 . jobs => job
##### eg 4 . appointees => appointee


In [18]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/prachal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer
def lemmatize_tokens(filtered_tokens):
    lemmatized = []
    lemmatizer = WordNetLemmatizer()
    for token in filtered_tokens:
        lemmatized.append(lemmatizer.lemmatize(token))
        #print("token: " + token)
        #print("Lemmantized "+lemmatizer.lemmatize(token))
    lemmatized_string = ' '.join(lemmatized)
    return lemmatized_string
lemmatized_string = lemmatize_tokens(filtered_tokens)

In [20]:
# Create a data pipeline to process the dataset 
from nltk.tokenize import word_tokenize
def data_preprocessing(data_frame):

    res = pd.DataFrame()
    '''

    # Step 1: Tokenization
    tmp_data_frame = data_frame['content'][0:1000].apply(lambda row:tokenization(row))

    # Step 2: Remove stop words
    tmp_data_frame = tmp_data_frame.apply(lambda row: stop_words_filter(row))

    # Step 3: Make a string
    tmp_data_frame = tmp_data_frame.apply(lambda row: ' '.join(row))

    res = pd.concat([res,tmp_data_frame])

    '''
    
    total_records = len(data_frame)  #50000
    start = 0 
    interval = ((total_records - start) // 50 ) #1000
    # The processing in chunks will reduce the memory load
    for i in range(start,total_records,interval):
        
#         print(i)
        # Step 1: Tokenization
        tmp_data_frame = data_frame['content'][i:i+interval].apply(lambda row:tokenization(row))
        
        # Step 2: Remove stop words
        tmp_data_frame = tmp_data_frame.apply(lambda row: stop_words_filter(row))
        
        # Step 3: Make a string
        tmp_data_frame = tmp_data_frame.apply(lambda row: ' '.join(row))
        
        #Step 4: Reomve non-english words
        tmp_data_frame = tmp_data_frame.apply(lambda row: remove_non_english(row))
            
        res = pd.concat([res,tmp_data_frame])
        
    res.columns = ['content']
    return res

df = data_preprocessing(data_frame)
print(df.shape)

(142570, 1)


In [21]:
len(df['content'][0].split())

388

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
def create_sparse_mat(corpus):
    vectorizer = CountVectorizer()
    x = vectorizer.fit_transform(corpus)
    word_features = vectorizer.get_feature_names()
#     print(word_features)
    return x

sparse_mat = create_sparse_mat(df['content'])
# print(sparse_mat.shape)    


### Save the sparse matrix to file 

In [23]:
import scipy.sparse
scipy.sparse.save_npz('sparse_mat.npz', sparse_mat)

In [24]:
print(sparse_mat.shape)
len(df['content'][1].split())

(142570, 58626)


1794

In [39]:
df['content'][0]

'congressional new fear comes health care lawsuit administration might win incoming trump administration could choose longer defend executive branch suit administration authority spend health insurance house big victory sudden loss could conceivably cause health care program implode leaving millions people without access health insurance prepared replacement could lead chaos insurance market spur political backlash gain full control government stave outcome could find awkward position huge temporarily prop health care law conservative demanding end law another twist j trump administration worried executive branch could choose fight republican allies house central dispute eager avoid ugly political hill trump transition team gaming handle lawsuit election put limbo least late united court district circuit yet ready divulge strategy given pending litigation administration congress would inappropriate comment said j spokesman trump transition effort upon taking office trump administration

# Non english words are removed from the dataset

In [25]:
list(df.columns)

['content']

In [26]:
lda_frame = data_frame.drop(['content'],axis=1)

In [27]:
lda_frame['content'] = df


In [28]:
lda_frame.shape

(142570, 9)

In [29]:
list(lda_frame.columns)

['id',
 'title',
 'publication',
 'author',
 'date',
 'year',
 'month',
 'url',
 'content']

In [30]:
print(lda_frame['title'][0])
print(lda_frame['content'][0])

House Republicans Fret About Winning Their Health Care Suit - The New York Times
congressional new fear comes health care lawsuit administration might win incoming trump administration could choose longer defend executive branch suit administration authority spend health insurance house big victory sudden loss could conceivably cause health care program implode leaving millions people without access health insurance prepared replacement could lead chaos insurance market spur political backlash gain full control government stave outcome could find awkward position huge temporarily prop health care law conservative demanding end law another twist j trump administration worried executive branch could choose fight republican allies house central dispute eager avoid ugly political hill trump transition team gaming handle lawsuit election put limbo least late united court district circuit yet ready divulge strategy given pending litigation administration congress would inappropriate comment 

In [31]:
lda_frame.to_csv('reduced_words.csv')

In [32]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/prachal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
from nltk.tokenize import word_tokenize

In [34]:
content = lda_frame['content']

In [37]:
content[0]

'congressional new fear comes health care lawsuit administration might win incoming trump administration could choose longer defend executive branch suit administration authority spend health insurance house big victory sudden loss could conceivably cause health care program implode leaving millions people without access health insurance prepared replacement could lead chaos insurance market spur political backlash gain full control government stave outcome could find awkward position huge temporarily prop health care law conservative demanding end law another twist j trump administration worried executive branch could choose fight republican allies house central dispute eager avoid ugly political hill trump transition team gaming handle lawsuit election put limbo least late united court district circuit yet ready divulge strategy given pending litigation administration congress would inappropriate comment said j spokesman trump transition effort upon taking office trump administration

In [38]:
tokens = [word_tokenize(i) for i in content]

In [39]:
len(tokens[0])

388

In [40]:
from langdetect import detect
from tqdm import tqdm_notebook
tqdm_notebook().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel


In [41]:
from gensim.models import Phrases




In [42]:
bigram_model = Phrases(tokens)
print(len(bigram_model[tokens]))
#trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(bigram_model[tokens])


142570


In [43]:
from gensim import corpora

In [44]:
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

In [66]:
len(corpus[0])

224

In [45]:
from gensim import models
import numpy as np

In [46]:
np.random.seed(123456)
num_topics = 20 # Same as number of Clusters 
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

CPU times: user 36min 30s, sys: 1min 50s, total: 38min 20s
Wall time: 26min 11s


In [47]:
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print(str(i)+": "+ topic)
    print()

0: 0.018*"company" + 0.015*"data" + 0.015*"apple" + 0.013*"technology" + 0.012*"news" + 0.012*"new" + 0.010*"twitter" + 0.009*"post" + 0.009*"information" + 0.009*"video" + 0.008*"use" + 0.008*"used" + 0.008*"social_media" + 0.007*"media" + 0.006*"network" + 0.006*"site" + 0.006*"digital" + 0.006*"phone" + 0.006*"service" + 0.005*"tech"

1: 0.011*"president" + 0.011*"russia" + 0.010*"former" + 0.010*"investigation" + 0.009*"u" + 0.008*"white_house" + 0.008*"report" + 0.008*"administration" + 0.008*"intelligence" + 0.008*"information" + 0.008*"government" + 0.007*"committee" + 0.007*"public" + 0.007*"director" + 0.006*"office" + 0.006*"told" + 0.006*"post" + 0.006*"national_security" + 0.005*"official" + 0.005*"whether"

2: 0.055*"state" + 0.023*"school" + 0.020*"university" + 0.018*"public" + 0.017*"city" + 0.015*"new_york" + 0.013*"group" + 0.011*"education" + 0.010*"foundation" + 0.010*"national" + 0.010*"local" + 0.009*"million" + 0.009*"year" + 0.009*"college" + 0.008*"governor" + 

In [48]:
print(lda_frame.content.loc[0][:200])

congressional new fear comes health care lawsuit administration might win incoming trump administration could choose longer defend executive branch suit administration authority spend health insurance


In [49]:
lda_model[corpus[0]]


[(1, 0.07336922),
 (8, 0.01582897),
 (9, 0.35642895),
 (17, 0.1664257),
 (18, 0.0350849),
 (19, 0.3524413)]

In [50]:
topics = [lda_model[corpus[i]] for i in range(len(lda_frame))]

In [51]:
document = '''china soccer vast football school south china seem barely enough backdrop school seem young swarm onto nearly every day kicking passing hope soccer glory riches soccer career grow
wang kai gangly studied boarding school three said morning session supervision coach want said next become national project china country fan president xi bent transforming country great soccer 
power china whose ranked poor middling recent international competition effort already unleashed surge spending support game around world last two main league plucked foreign south worth much million 
year highest pay soccer player world club real million year declined agent said last week shaking landscape pro soccer conte manager fabled team spending spree last month danger world drive match china 
economic ascent success soccer field become emblematic xi ambition transform china great confident power biggest hope soccer become among world best last two government kind concentrated effort soccer 
previously devoted winning individual sports like diving gymnastics clean reorganize professional soccer build new generation soccer soccer aim establish flow top eventually capable winning men world cup
team former glory effort spend lavishly well paying millions foreign team spent millions tap coaching marketing current spending massive said professor sports enterprise university spending big also soccer
xi national also could falter muddle rushed distorted enforcement especially local level resistance worried taking precious time away well fear foreign money attention fostering talent fixing soccer turns bit 
like fixing economy desire quick flashy success risk people daily main newspaper communist party last month bubble reckless spending professional soccer could burst badly damage sport many feverish going 
young newspaper said one biggest said resident shanghai wild east football sport china great level like people thing china passionate soccer would competitive national instead lackluster national men team
recently 83rd ahead remote outcrop unlikely win spot world cup team pride soccer past world cup slipped 13th latest national team joke said come stadium watch favorite team clobber listless opponent province 
think need get right question spending money attitude domestic professional game corruption brazen even china since grew national scandal worst cheating still said blatantly xi soccer passion since childhood
abroad included soccer famously enthusiastic seemingly rusty go kicking ball old school learned kick fan game according former teacher look healthy xi told young soccer school laid basis sports young private
piled professional soccer xi backing game apparently eager curry favor government main pro trading season last year super league spent million away promising foreign player spending premier league nearly million 
according player transfer data company likely go even higher xi focus long game next generation plan strong emphasis soccer leap number soccer across country grow end plan million million regularly play soccer every
school paying quite bit attention soccer said athletic director r xi old school school unthinkable yet deep cultural resistance even discourage time sports said much homework face stiff competition academic china individual
sports demand intense discipline early age country done well fostering group sports like teamwork improvisation count much personal virtuosity privately run school world biggest soccer boarding school formula intense training
combined solid education could show way young soccer built multiply said principal school guess seven eight half national squad come school drawn pay year send oversee training spend day also play promising get get school said 
even come game later south often lack solid grounding teamwork tactics said coach getting better year year said hopefully approach expensive widely copied facing shortage space like soccer gymnastics stand tossing ball around may
impress visiting scant preparation free flow game said widely soccer commentator soccer rushing instant success said interview previous build game 1980s 1990s problem everyone thinking still deeply set traditional everyone soccer
getting competition training instead encouraging focus fun broad participation approach break monotony classroom eventually bring future approach trying way recent afternoon smog often elementary school rushed onto shouting squealing
delight morning soccer smog said principal lin midday notified back went crazy relief'''
tokens = word_tokenize(document)
topics = lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20)
pd.DataFrame([(el[0], round(el[1],2), topics[el[0]][1]) for el in lda_model[dictionary_LDA.doc2bow(tokens)]], columns=['topic #', 'weight', 'words in topic'])

Unnamed: 0,topic #,weight,words in topic
0,2,0.08,"0.055*""state"" + 0.023*""school"" + 0.020*""univer..."
1,3,0.03,"0.081*""percent"" + 0.016*""year"" + 0.016*""u"" + 0..."
2,4,0.01,"0.017*""city"" + 0.008*""area"" + 0.007*""home"" + 0..."
3,6,0.04,"0.014*""like"" + 0.013*""know"" + 0.011*""get"" + 0...."
4,7,0.04,"0.010*""white"" + 0.009*""black"" + 0.009*""politic..."
5,8,0.05,"0.022*""company"" + 0.018*""million"" + 0.014*""bus..."
6,11,0.11,"0.013*""like"" + 0.011*""even"" + 0.009*""much"" + 0..."
7,12,0.42,"0.020*""game"" + 0.019*""team"" + 0.015*""season"" +..."
8,14,0.03,"0.011*""show"" + 0.007*""like"" + 0.007*""new"" + 0...."
9,15,0.14,"0.022*""u"" + 0.021*""united"" + 0.020*""china"" + 0..."


In [52]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

# End of LDA