In [1]:
### Running LDA analysis on summarized (condensed) text generated from articles.

In [2]:
import re

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

np.random.seed(1729)

In [3]:
# Hard-coded input corpus: a list of 20 text fragments. Later cells treat each
# list element as one document (one token list / bag-of-words per element).
# Several fragments contain embedded "\n" characters.
ORIGINAL_TEXT = ['It can take up to 15 seconds of laser fire to bring down a UAV or destroy its camera',
 'This was not a sudden development but has been going on for most of the last decade',
 'In 2010 the navy successfully tested this new laser weapon, which is actually six solid-state lasers acting in unison, to destroy a small UAV',
 'In 2013 another test was run, under more realistic conditions',
 '\nIsrael claims a breakthrough in the development of lasers that can be used to intercept mortar shells, UAVs and rockets',
 'Most objects fired at Israel end up landing in unoccupied areas and the few objects that are dangerous are intercepted by missiles',
 'Fire control systems for quickly, accurately and repeatedly aiming a laser have already been developed',
 'Navy system already installed on one warship for several years and about to be installed on several more',
 'This was crucial because knocking down UAVs is not something that the navy needs help with',
 'It never worked, at least not in a practical sense',
 'Even if ALT worked flawlessly it did not have enough energy to hit a launching missile from a safe (from enemy fire) distance',
 'This has proved very effective.\n\nLaser Dome is described as using a solid-state electric laser at an effective range of 5,000 meters',
 'Laser Dome combines multiple laser beams to obtain a useful amount of laser power at longer ranges',
 'In 2018 LaWAS was moved to a large amphibious ship for continued testing and two more LaWAS are being built, for delivery and installation on two more ships in 2020',
 'Army CLWS (Compact Laser Weapon System) which is currently only capable of handling UAVs',
 'This is the tech that Laser Dome claims to have improved enough to destroy UAVs with one shot and at longer ranges.\n\nAnother example is a U.S',
 'The manufacturer convinced the navy that it was just a matter of tweaking the technology to get the needed effectiveness',
 'But the ability to do enough damage to disable boats or missiles that are over two kilometers distant meant the LaWS was worth mounting on a warship.\n\nLaWS may yet prove incapable of working under combat conditions, but so far this new development has kept passing tests',
 'But in heaver sand storms performance was much reduced',
 'In other words, LaWAS is still a work in progress.\n\nSuch was not the case with an earlier research effort using chemical lasers']

In [4]:
def pre_process_text(ORIGINAL_TEXT, stop_words=None):
    """Normalize raw text fragments ahead of tokenization.

    Each fragment is lower-cased, every non-letter character is replaced by a
    space, and stop words are dropped. The surviving words are re-joined with
    single spaces.

    Parameters
    ----------
    ORIGINAL_TEXT : iterable of str
        Raw text fragments, one per document.
    stop_words : iterable of str, optional
        Lower-case words to remove. When omitted, NLTK's English stop-word
        list is used, extended with "-", "like", "said", "forward" and "time".

    Returns
    -------
    list of str
        One cleaned string per input fragment, in input order.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        stop_words.update(["-", "like", "said", "forward", "time"])
    else:
        stop_words = set(stop_words)

    polished = []
    for fragment in ORIGINAL_TEXT:
        # Lower-case and strip punctuation/digits *before* filtering, so that
        # capitalised or punctuation-attached stop words ("It", "time,") are
        # compared against the lower-case stop-word set in their bare form.
        letters_only = re.sub('[^a-zA-Z]', ' ', fragment.lower())
        kept_words = [word for word in letters_only.split() if word not in stop_words]
        polished.append(' '.join(kept_words))
    return polished

In [5]:
# Clean every fragment of ORIGINAL_TEXT; the result is a list of strings in the same order.
POLISHED_TEXT = pre_process_text(ORIGINAL_TEXT)

In [6]:
# Display the cleaned fragments.
# NOTE(review): the output saved below this cell does not correspond to the
# strings in ORIGINAL_TEXT — presumably stale output from an earlier run;
# re-run the notebook to refresh it.
POLISHED_TEXT

['It probably name earned semi human status forgot water last year s degree summer heat still managed thrive',
 ' It s pretty cool textile says Amanda Johnston who together expo s founder Nina Marenzi curated vast array fabrics offer',
 'For Future Fabrics Expo London week cactus leather one array new ethical man made fibres display',
 'Veganism exploded that s transcending materials seeing developed too Read more Could You Give Up Buying Clothes In Cactus pineapple apple leather show Future Fabrics Expo Paul Cochrane Image may contain Plant Human Person Fruit Food Pineapple Established Future Fabrics Expo Selfridges sustainable textiles edited selection best offerings market today',
 'The event one stop shop environmentally conscious designers fashion industry one polluting planet it s relevant ever As result I m happy report I went London s Victoria House attend three day event buzzing',
 'The expo organised The Sustainable Angle not for profit organisation initiates supports sustain

In [7]:
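# Stemming is preferred over lemmatization here for performance and speed. NOTE: preprocess() below never calls lemmatize_stemming(), so tokens currently reach the models neither stemmed nor lemmatized.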

In [8]:
# Shared English Snowball stemmer; read as a global by lemmatize_stemming().
stemmer = SnowballStemmer('english')

In [9]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma with the global ``stemmer``."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and keep only the informative tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    when it is longer than three characters and is not in gensim's
    ``STOPWORDS``. No stemming or lemmatization is applied here.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    ]

In [10]:
# One token list per cleaned fragment, in the same order as POLISHED_TEXT.
lemmantized = [preprocess(fragment) for fragment in POLISHED_TEXT]

In [11]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenized documents.
dictionary = gensim.corpora.Dictionary(lemmantized)
# NOTE(review): filter_tokens() is called with neither bad_ids nor good_ids, so it
# presumably removes nothing — confirm against the gensim docs whether
# filter_extremes(...) was intended here.
dictionary.filter_tokens()

In [12]:
def generate_frequencies(sentence):
    """
    Count the words of one tokenized sentence and normalize the counts.

    Parameters
    ----------
    sentence : list of str
        Tokens of a single sentence/document; may be empty.

    Returns
    -------
    df : pandas.DataFrame
        One row per distinct word, in order of first appearance, with columns
        'Word', 'Occurance' (raw count) and 'Frequency' (count divided by the
        largest count in the sentence).
    frequency : dict
        Maps word -> raw count.
    occurance : dict
        Maps word -> count / largest count, so the most common word scores 1.0.
    """
    frequency = {}
    for word in sentence:
        frequency[word] = frequency.get(word, 0) + 1

    # max() raises ValueError on an empty sequence; with no words there is
    # nothing to normalize, so any non-zero default is safe here.
    max_word_frequency = max(frequency.values(), default=1)
    occurance = {word: count / max_word_frequency for word, count in frequency.items()}

    # Build the frame column-wise so each column keeps its natural dtype
    # (a list-of-rows followed by .T would coerce everything to object).
    df = pd.DataFrame({
        'Word': list(frequency.keys()),
        'Occurance': list(frequency.values()),
        'Frequency': list(occurance.values()),
    })
    return df, frequency, occurance

In [13]:
def generate_tf_idf(paragraph):
    """Compute TF-IDF weights for every word of every tokenized document.

    Term frequency is the word's count divided by the largest count in its
    document (the same normalization ``generate_frequencies`` applies).
    Inverse document frequency is ``log10(N / df)``, where ``N`` is the number
    of documents in ``paragraph`` and ``df`` is the number of those documents
    that contain the word.

    Parameters
    ----------
    paragraph : iterable of list of str
        Tokenized documents, one token list per document.

    Returns
    -------
    list of list of float
        One inner list per document, holding one weight per distinct word in
        order of first appearance. An empty document yields an empty list.
    """
    documents = [list(tokens) for tokens in paragraph]
    n_documents = len(documents)

    # Document frequency: in how many documents does each word appear at least once?
    document_frequency = {}
    for tokens in documents:
        for word in set(tokens):
            document_frequency[word] = document_frequency.get(word, 0) + 1

    tf_idf = []
    for tokens in documents:
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
        # default=1 keeps an empty document from raising in max().
        max_count = max(counts.values(), default=1)
        tf_idf.append([
            (count / max_count) * np.log10(n_documents / document_frequency[word])
            for word, count in counts.items()
        ])
    return tf_idf

In [14]:
### TF-IDF generation

In [15]:
# Hand-rolled TF-IDF weights, one list per tokenized document.
# NOTE(review): no other visible cell reads `tf_idf`; the modelling cells use
# gensim's TfidfModel instead — confirm whether this result is still needed.
tf_idf = generate_tf_idf(lemmantized)

In [16]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# NOTE(review): common_dictionary is built from gensim's `common_texts` fixture and
# is not referenced by any other visible cell — looks like leftover example code;
# confirm before removing.
common_dictionary = Dictionary(common_texts)
# Convert each tokenized document into its bag-of-words form using `dictionary`.
lemmy_BOW = [dictionary.doc2bow(text) for text in lemmantized]

In [17]:
from gensim import corpora, models

# Fit gensim's TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(lemmy_BOW)

In [18]:
# Apply the fitted TF-IDF model to the bag-of-words corpus.
corpus_tfidf = tfidf[lemmy_BOW]

In [19]:
# LDA with num_topics=3 trained on the TF-IDF-weighted corpus (passes=2, workers=3).
# NOTE(review): no random_state is passed; reproducibility presumably relies on the
# global np.random.seed set in the import cell — confirm, especially with workers > 1.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=3)

In [20]:
# Print every topic of the TF-IDF-based model as "index / weighted word list".
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.008*"week" + 0.008*"display" + 0.008*"ethical" + 0.008*"plant" + 0.008*"future" + 0.007*"chic" + 0.007*"verdant" + 0.007*"likely" + 0.007*"covered" + 0.007*"green"
Topic: 1 
Word: 0.009*"friend" + 0.009*"wearing" + 0.009*"sustainable" + 0.007*"event" + 0.007*"consider" + 0.007*"fabric" + 0.007*"fashion" + 0.007*"cactus" + 0.007*"seeing" + 0.006*"designers"
Topic: 2 
Word: 0.012*"plants" + 0.012*"shelf" + 0.012*"leather" + 0.010*"house" + 0.010*"consider" + 0.009*"marenzi" + 0.009*"says" + 0.009*"grape" + 0.008*"research" + 0.008*"spend"


In [21]:
# LDA with num_topics=5 trained directly on the bag-of-words corpus (passes=3, workers=3).
# NOTE(review): minimum_probability=0.2 presumably only filters low-probability topics
# when querying per-document topic distributions — confirm against the gensim docs.
lda_model = gensim.models.LdaMulticore(lemmy_BOW, minimum_probability=0.2, num_topics=5, id2word=dictionary, passes=3, workers=3)

In [22]:
# Print every topic of the bag-of-words model as "index / weighted word list".
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.023*"fabrics" + 0.023*"array" + 0.023*"expo" + 0.023*"cactus" + 0.022*"founder" + 0.012*"offer" + 0.012*"says" + 0.012*"johnston" + 0.012*"marenzi" + 0.012*"curated"
Topic: 1 
Words: 0.032*"leather" + 0.022*"fabric" + 0.012*"little" + 0.012*"grape" + 0.012*"spend" + 0.012*"blurbs" + 0.012*"huge" + 0.012*"research" + 0.012*"marenzi" + 0.012*"says"
Topic: 2 
Words: 0.016*"event" + 0.016*"fashion" + 0.016*"sleeve" + 0.009*"designers" + 0.009*"house" + 0.009*"london" + 0.009*"report" + 0.009*"happy" + 0.009*"stop" + 0.009*"relevant"
Topic: 3 
Words: 0.021*"human" + 0.021*"cactus" + 0.021*"expo" + 0.021*"pineapple" + 0.021*"fabrics" + 0.021*"future" + 0.012*"year" + 0.012*"heat" + 0.012*"thrive" + 0.012*"degree"
Topic: 4 
Words: 0.039*"sustainable" + 0.028*"fashion" + 0.027*"plant" + 0.015*"fabric" + 0.015*"expo" + 0.015*"verdant" + 0.015*"nestled" + 0.015*"green" + 0.015*"covered" + 0.015*"prickles"
