In [1]:
### Running LDA analysis on summarized (condensed) text generated from articles.

In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(1729)
import nltk
import pandas as pd
from nltk.corpus import stopwords

In [3]:
ORIGINAL_TEXT = ['It can take up to 15 seconds of laser fire to bring down a UAV or destroy its camera',
 'This was not a sudden development but has been going on for most of the last decade',
 'In 2010 the navy successfully tested this new laser weapon, which is actually six solid-state lasers acting in unison, to destroy a small UAV',
 'In 2013 another test was run, under more realistic conditions',
 '\nIsrael claims a breakthrough in the development of lasers that can be used to intercept mortar shells, UAVs and rockets',
 'Most objects fired at Israel end up landing in unoccupied areas and the few objects that are dangerous are intercepted by missiles',
 'Fire control systems for quickly, accurately and repeatedly aiming a laser have already been developed',
 'Navy system already installed on one warship for several years and about to be installed on several more',
 'This was crucial because knocking down UAVs is not something that the navy needs help with',
 'It never worked, at least not in a practical sense',
 'Even if ALT worked flawlessly it did not have enough energy to hit a launching missile from a safe (from enemy fire) distance',
 'This has proved very effective.\n\nLaser Dome is described as using a solid-state electric laser at an effective range of 5,000 meters',
 'Laser Dome combines multiple laser beams to obtain a useful amount of laser power at longer ranges',
 'In 2018 LaWAS was moved to a large amphibious ship for continued testing and two more LaWAS are being built, for delivery and installation on two more ships in 2020',
 'Army CLWS (Compact Laser Weapon System) which is currently only capable of handling UAVs',
 'This is the tech that Laser Dome claims to have improved enough to destroy UAVs with one shot and at longer ranges.\n\nAnother example is a U.S',
 'The manufacturer convinced the navy that it was just a matter of tweaking the technology to get the needed effectiveness',
 'But the ability to do enough damage to disable boats or missiles that are over two kilometers distant meant the LaWS was worth mounting on a warship.\n\nLaWS may yet prove incapable of working under combat conditions, but so far this new development has kept passing tests',
 'But in heaver sand storms performance was much reduced',
 'In other words, LaWAS is still a work in progress.\n\nSuch was not the case with an earlier research effort using chemical lasers']

In [4]:
def pre_process_text(ORIGINAL_TEXT):
    """Polishes text"""
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")
    STOPWORDS.add("like")
    STOPWORDS.add("said")
    STOPWORDS.add("forward")
    STOPWORDS.add("time")
    frp = []
    for i, c in enumerate(ORIGINAL_TEXT):
        reg = c.lower()
        reg = ' '.join(reg)
        reg = ' '.join([word for word in c.split() if word not in STOPWORDS])
        reg = re.sub('[^a-zA-Z]', ' ', reg)
        reg = re.sub(r'\s+', ' ', reg)
        frp.append(reg)
    return frp

In [5]:
POLISHED_TEXT = pre_process_text(ORIGINAL_TEXT)

In [6]:
POLISHED_TEXT

['It take seconds laser fire bring UAV destroy camera',
 'This sudden development going last decade',
 'In navy successfully tested new laser weapon actually six solid state lasers acting unison destroy small UAV',
 'In another test run realistic conditions',
 'Israel claims breakthrough development lasers used intercept mortar shells UAVs rockets',
 'Most objects fired Israel end landing unoccupied areas objects dangerous intercepted missiles',
 'Fire control systems quickly accurately repeatedly aiming laser already developed',
 'Navy system already installed one warship several years installed several',
 'This crucial knocking UAVs something navy needs help',
 'It never worked least practical sense',
 'Even ALT worked flawlessly enough energy hit launching missile safe from enemy fire distance',
 'This proved effective Laser Dome described using solid state electric laser effective range meters',
 'Laser Dome combines multiple laser beams obtain useful amount laser power longer rang

In [7]:
#Incorporating stemming instead of lemmatization because of performance and speed.

In [8]:
stemmer = SnowballStemmer('english')

In [9]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

In [10]:
lemmantized = list(pd.Series(POLISHED_TEXT).map(preprocess))

In [11]:
dictionary = gensim.corpora.Dictionary(lemmantized)
dictionary.filter_tokens()

In [12]:
def generate_frequencies(sentence):
    """
    Generates Frequencies and Occurances of words from a sentence:
    1. Parameters : sentence (lemmantized version of the sentence, in list type).
    2. Returns : 
        A. TDM (pandas Series object)
        B. DataFrame (pandas TDM representation)
    """
    occurance = {}
    frequency = {}
    for i, word in enumerate(sentence):
        if word not in frequency.keys():
            frequency[word] = 1
        else:
            frequency[word] += 1
    max_word_frequency = max(frequency.values())
    for word in frequency.keys():
        occurance[word] = frequency[word] / max_word_frequency
    df = pd.DataFrame(data=[list(frequency.keys()), list(frequency.values()), list(occurance.values())]).T
    df.columns = ['Word', 'Occurance', 'Frequency']
    return df, frequency, occurance

In [13]:
def generate_tf_idf(paragraph):
    tf_idf = []
    for i, lem in enumerate(paragraph):
        pd_df, frequency_words, occurance_words = generate_frequencies(lem)
        temp = []
        for occur, freq in zip(frequency_words.values(), occurance_words.values()):
            temp.append(freq * np.log10(len(ORIGINAL_TEXT)/occur))
        tf_idf.append(temp)
    return tf_idf

In [14]:
### TD-IDF generation

In [15]:
tf_idf = generate_tf_idf(lemmantized)

In [16]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
lemmy_BOW = [dictionary.doc2bow(text) for text in lemmantized]

In [17]:
from gensim import corpora, models

tfidf = models.TfidfModel(lemmy_BOW)

In [18]:
corpus_tfidf = tfidf[lemmy_BOW]

In [19]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=3)

In [20]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.015*"destroy" + 0.014*"seconds" + 0.014*"bring" + 0.014*"camera" + 0.013*"lawas" + 0.013*"navy" + 0.013*"help" + 0.013*"needs" + 0.013*"knocking" + 0.013*"uavs"
Topic: 1 
Word: 0.017*"laser" + 0.015*"practical" + 0.015*"sense" + 0.015*"effective" + 0.015*"objects" + 0.013*"worked" + 0.012*"dome" + 0.011*"repeatedly" + 0.011*"developed" + 0.011*"quickly"
Topic: 2 
Word: 0.016*"installed" + 0.015*"development" + 0.014*"conditions" + 0.013*"realistic" + 0.013*"test" + 0.012*"weapon" + 0.012*"sudden" + 0.012*"decade" + 0.012*"going" + 0.012*"warship"


In [21]:
lda_model = gensim.models.LdaMulticore(lemmy_BOW, minimum_probability=0.2, num_topics=5, id2word=dictionary, passes=3, workers=3)

In [22]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.058*"laser" + 0.030*"installed" + 0.030*"navy" + 0.017*"worked" + 0.017*"longer" + 0.017*"flawlessly" + 0.017*"convinced" + 0.017*"launching" + 0.017*"ranges" + 0.017*"matter"
Topic: 1 
Words: 0.040*"lasers" + 0.022*"navy" + 0.022*"destroy" + 0.022*"solid" + 0.022*"state" + 0.022*"acting" + 0.022*"actually" + 0.022*"small" + 0.022*"tested" + 0.022*"successfully"
Topic: 2 
Words: 0.053*"laser" + 0.029*"uavs" + 0.029*"destroy" + 0.029*"capable" + 0.029*"weapon" + 0.029*"army" + 0.029*"handling" + 0.029*"clws" + 0.029*"compact" + 0.029*"currently"
Topic: 3 
Words: 0.041*"uavs" + 0.041*"israel" + 0.040*"objects" + 0.023*"crucial" + 0.023*"knocking" + 0.023*"needs" + 0.023*"help" + 0.023*"navy" + 0.023*"breakthrough" + 0.023*"rockets"
Topic: 4 
Words: 0.035*"laser" + 0.024*"dome" + 0.024*"laws" + 0.024*"lawas" + 0.024*"effective" + 0.024*"conditions" + 0.014*"missiles" + 0.013*"claims" + 0.013*"development" + 0.013*"uavs"
