In [3]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import pandas as pd
import os
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
import gensim
import spacy
import nltk
from nltk.corpus import stopwords
from spacy import displacy


In [4]:
def load_data(path = r"D:\NLP\dataset"):
    document_files = os.listdir(path)
    doc = [[""]] * len(document_files)
    i = 0
    for document in document_files:
        with open(path + "/" + document, "r") as f:
            doc[i] = f.read()
        i += 1
    return doc

In [7]:
def deal_data(doc):
    news_df = pd.DataFrame({'document': doc})
    # removing everything except alphabets`
    news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z#]", " ")
    # removing short words
    news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    # make all text lowercase
    news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
    # tokenization
    tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
    # remove stop-words
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
    detokenized_doc = []
    for i in range(len(news_df)):
        t = ' '.join(tokenized_doc[i])
        detokenized_doc.append(t)
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # compile sample documents into a list
    # list for tokenized documents in loop
    texts = []
    for i in detokenized_doc:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts

In [8]:
class KeyWord():
    def __init__(self,method,num_keywords):
        self.method=method
        self.num_keywords=num_keywords
    def LSA(self,texts,num_keywords):
        detokenized_doc = []
        for i in range(len(texts)):
            t = ' '.join(texts[i])
            detokenized_doc.append(t)
        news_df=pd.DataFrame()
        news_df['clean_doc'] = detokenized_doc
        vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, smooth_idf=True)
        X = vectorizer.fit_transform(news_df['clean_doc'])
        from sklearn.decomposition import TruncatedSVD
        # SVD represent documents and terms in vectors
        svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
        svd_model.fit(X)
        terms = vectorizer.get_feature_names()
        for i, comp in enumerate(svd_model.components_):
            terms_comp = zip(terms, comp)
            sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:num_keywords]
            print("text " + str(i) + "'s keyword: ")
            for t in sorted_terms:
                print(t[0])
                print(" ")
    def LDA(self,texts,num_keywords):
        dictionary = corpora.Dictionary(texts)

        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in texts]
        # generate LDA model
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=30)
        print(ldamodel.print_topics(num_topics=5, num_words=num_keywords))
    def spacy1(self,data):
        a=len(data)
        nlp = spacy.load("en_core_web_sm")
        for i in range(a):
            result=self.extract_keywords(nlp,data[i])
            print("text"+str(i)+"'s keyword:")
            print(result)
    def extract_keywords(self,nlp, sequence, special_tags: list = None):
        result = []
        # edit this list of POS tags according to your needs.
        pos_tag = ['PROPN', 'NOUN', 'ADJ']
        doc = nlp(sequence.lower())
        if special_tags:
            tags = [tag.lower() for tag in special_tags]
            for token in doc:
                if token.text in tags:
                    result.append(token.text)
        for chunk in doc.noun_chunks:
            final_chunk = ""
            for token in chunk:
                if (token.pos_ in pos_tag):
                    final_chunk = final_chunk + token.text + " "
            if final_chunk:
                result.append(final_chunk.strip())
        for token in doc:
            if (token.text in nlp.Defaults.stop_words or token.text in punctuation):
                continue
            if (token.pos_ in pos_tag):
                result.append(token.text)
        return list(set(result))
    def spacy2(self,doc):
        for ent in doc.ents:
            print(ent.text, ent.label_)
        displacy.render(doc, style='ent', jupyter=True)
    def fit(self):
        if self.method=='LSA':
            doc=load_data()
            data=deal_data(doc)
            self.LSA(data,num_keywords=self.num_keywords)
        if self.method=='LDA':
            doc = load_data()
            data = deal_data(doc)
            self.LDA(data,num_keywords=self.num_keywords)
        if self.method=='spacy1':
            data=load_data()
            self.spacy1(data)
        if self.method=='spacy2':
            nlp = spacy.load("en_core_web_sm")
            data=load_data()
            a = len(data)
            for i in range(a):
                doc = nlp(data[i].lower())
                print("text" + str(i) + "'s keyword:")
                self.spacy2(doc)
        

In [9]:
if __name__ == '__main__':
    model=KeyWord(method='LSA',num_keywords=20)
    model.fit()

text 0's keyword: 
women
 
matern
 
month
 
said
 
hewitt
 
plan
 
paid
 
sexism
 
elect
 
cost
 
extend
 
work
 
leav
 
week
 
career
 
job
 
mother
 
brown
 
labour
 
rise
 
text 1's keyword: 
blackpool
 
mail
 
inform
 
parti
 
manchest
 
ball
 
confer
 
labour
 
brown
 
rule
 
chancellor
 
delet
 
freedom
 
elect
 
host
 
hotel
 
said
 
spend
 
stabil
 
come
 
text 2's keyword: 
mail
 
inform
 
delet
 
freedom
 
thoma
 
beith
 
commission
 
disclosur
 
law
 
public
 
record
 
said
 
cabinet
 
clear
 
condemn
 
destroy
 
destruct
 
forc
 
guidanc
 
offic
 
text 3's keyword: 
ball
 
brown
 
elect
 
spend
 
stabil
 
chancellor
 
rise
 
britain
 
digbi
 
rate
 
spree
 
think
 
budget
 
alli
 
away
 
cash
 
econom
 
economi
 
expect
 
jone
 
text 4's keyword: 
matern
 
week
 
month
 
plan
 
democrat
 
liber
 
mother
 
alreadi
 
said
 
allow
 
children
 
desper
 
entitl
 
father
 
firm
 
misdirect
 
older
 
parent
 
extend
 
right
 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Angelo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
model=KeyWord(method='LDA',num_keywords=20)
model.fit()

[(0, '0.002*"said" + 0.002*"women" + 0.002*"elect" + 0.002*"labour" + 0.002*"month" + 0.002*"mail" + 0.002*"hewitt" + 0.002*"work" + 0.002*"paid" + 0.002*"plan" + 0.002*"brown" + 0.002*"parti" + 0.002*"inform" + 0.002*"sexism" + 0.002*"told" + 0.002*"govern" + 0.002*"matern" + 0.002*"extend" + 0.002*"rule" + 0.002*"cost"'), (1, '0.002*"said" + 0.002*"mail" + 0.002*"inform" + 0.002*"freedom" + 0.002*"thoma" + 0.002*"time" + 0.002*"record" + 0.002*"commission" + 0.002*"rule" + 0.002*"public" + 0.002*"delet" + 0.002*"beith" + 0.002*"disclosur" + 0.002*"cabinet" + 0.002*"govern" + 0.002*"prevent" + 0.002*"come" + 0.002*"import" + 0.002*"condemn" + 0.002*"destruct"'), (2, '0.028*"said" + 0.018*"elect" + 0.017*"month" + 0.017*"matern" + 0.015*"brown" + 0.011*"labour" + 0.011*"plan" + 0.011*"ball" + 0.011*"rise" + 0.009*"chancellor" + 0.008*"mother" + 0.008*"right" + 0.008*"week" + 0.008*"stabil" + 0.008*"budget" + 0.008*"spend" + 0.008*"extend" + 0.008*"extra" + 0.008*"cost" + 0.008*"hewitt"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Angelo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
model=KeyWord(method='spacy1',num_keywords=20)
model.fit()

text0's keyword:
['gidley', 'liberal', 'money', 'government statutory pay', 'liberal democrat spokeswoman', 'small', 'brown', 'length', 'average', 'salary costs', 'shadow', 'budget', 'period', 'children', 'advertising costs', 'end', 'working', 'first months', 'plans', 'months', 'recruitment costs', 'gordon brown', 'labour', 'gordon', 'majority', 'shadow secretary', 'many women', 'parliament', 'flexible', 'democrat', 'earnings', 'women voters', 'gmtv', 'plan', 'employers', 'bid', 'democrats', 'baby', 'family', 'new proposals', 'parents', 'many small firms', 'director general', 'strain', 'other proposals', 'proposals', '%', 'maternity pay plan', 'maternity', 'weeks', 'pre - budget review', 'firm commitment', 'maternity pay', 'general', 'gmtv sunday programme', 'details', 'firms', 'average earnings', 'mothers', 'firm', 'aim', 'recruitment', 'labour plans', 'crippled', 'desperate', 'frost', 'party', 'old', 'first weeks', 'fathers', 'tony blair', 'patricia', 'government', 'extra', 'full mon

In [12]:
model=KeyWord(method='spacy2',num_keywords=20)
model.fit()

text0's keyword:
labour ORG
patricia hewitt PERSON
nine months by 2007 DATE
gmtv PERSON
sunday DATE
democrats NORP
13 weeks DATE
up to 26 weeks DATE
nine months by 2007 DATE
the full 12 months DATE
12 months DATE
six CARDINAL
the six months DATE
nine months DATE
gordon brown PERSON
december DATE
tony blair PERSON
conservatives NORP
democrat NORP
sandra gidley PERSON
democrats NORP
the first six months DATE
david frost PERSON
british NORP
commerce ORG
monday DATE
90% PERCENT
the first six weeks DATE
a week DATE
six months old DATE


text1's keyword:
watchdog ORG
delete e-mails PERSON
more than three months old DATE
richard thomas PERSON
1 january DATE
thomas PERSON
tories NORP
michael howard PERSON
tony blair PERSON
monday DATE
lib dem constitutional affairs committee ORG
alan beith PERSON
millions CARDINAL
hutton ORG
thomas PERSON
the new act of parliament LAW
thomas PERSON
england GPE
wales GPE
northern ireland GPE
next year DATE
scotland GPE
about 100,000 CARDINAL


text2's keyword:
six months DATE
patricia hewitt PERSON
the equal opportunities commission ORG
eoc ORG
up to six months' DATE
six CARDINAL
the first six months DATE
the second six months DATE
19% PERCENT
bbc radio 4's ORG
today DATE
about six CARDINAL
20 CARDINAL
only 10% PERCENT
about 80p CARDINAL
60p DATE
the department for trade and industry ORG
hundreds CARDINAL
last year DATE
less than 10% PERCENT
100 CARDINAL
just over half CARDINAL


text3's keyword:
manchester

 PERSON
the labour party ORG
2006 DATE
national executive committee ORG
first ORDINAL
1917 DATE
annual DATE
february spring DATE
years DATE
bournemouth PERSON
brighton GPE
annual DATE
1998 DATE
2002 DATE
the following year bournemouth DATE
two year DATE
blackpool hotel association ORG
the labour party ORG


text4's keyword:
brown PERSON
gordon brown's PERSON
16 march DATE
treasury ORG
balls PERSON
digby jones PERSON
cbi ORG
balls PERSON
treasury ORG
brown PERSON
the labour party's ORG
january DATE
the bank of england's ORG
monetary policy committee ORG
britain GPE
bbc radio 4's ORG
today DATE
balls PERSON
brown PERSON
2001 DATE
british NORP
britain GPE
balls PERSON
brown PERSON
gordon brown PERSON
third ORDINAL
britain GPE
balls PERSON
today DATE
tony blair PERSON
5 CARDINAL
