In [1]:
import streamlit as st
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import gensim
from gensim.models import Word2Vec
import numpy as np

In [2]:
content = """
Natural language processing (NLP) is a subfield of computer science and artificial intelligence (AI) that uses machine learning to enable computers to understand and communicate with human language.NLP enables computers and digital devices to recognize, understand and generate text and speech by combining computational linguistics—the rule-based modeling of human language—together with statistical modeling, machine learning (ML) and deep learning.NLP research has enabled the era of generative AI, from the communication skills of large language models (LLMs) to the ability of image generation models to understand requests. NLP is already part of everyday life for many, powering search engines, prompting chatbots for customer service with spoken commands, voice-operated GPS systems and digital assistants on smartphones.NLP also plays a growing role in enterprise solutions that help streamline and automate business operations, increase employee productivity and simplify mission-critical business processes.
"""

In [3]:
sentences = nltk.sent_tokenize(content)

In [4]:
words = nltk.word_tokenize(content)

In [5]:
corpus = []
stemming = PorterStemmer()
lemmatize = WordNetLemmatizer()

for sentence in sentences:
    words = word_tokenize(sentence)  
    filtered_words = []
    for word in words:
        word = re.sub(r"[,.()]", " ", word)
        word = re.sub("[^a-zA-Z]", "", word)
        word = lemmatize.lemmatize(word)
        word = word.lower()
        if word and word not in stopwords.words('english'):
            filtered_words.append(word)
    # Add the filtered words of the sentence to the corpus
    if filtered_words:
        corpus.append(filtered_words)


In [6]:
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)

2024-08-13 22:17:07.727 INFO    gensim.models.word2vec: collecting all words and their counts
2024-08-13 22:17:07.728 INFO    gensim.models.word2vec: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-08-13 22:17:07.728 INFO    gensim.models.word2vec: collected 85 word types from a corpus of 99 raw words and 2 sentences
2024-08-13 22:17:07.729 INFO    gensim.models.word2vec: Creating a fresh vocabulary
2024-08-13 22:17:07.729 INFO    gensim.utils: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 85 unique words (100.00% of original 85, drops 0)', 'datetime': '2024-08-13T22:17:07.729670', 'gensim': '4.3.3', 'python': '3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2024-08-13 22:17:07.730 INFO    gensim.utils: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 99 word corpus (100.00% of original 99, drops 0)', 'datetime': '2024-08-13T22:1

In [7]:
model.wv.index_to_key

['understand',
 'computer',
 'digital',
 'ai',
 'modeling',
 'human',
 'learning',
 'machine',
 'model',
 'business',
 'language',
 'nlp',
 'enabled',
 'computational',
 'linguisticsthe',
 'rulebased',
 'languagetogether',
 'statistical',
 'ml',
 'ha',
 'deep',
 'era',
 'speech',
 'learningnlp',
 'research',
 'combining',
 'process',
 'text',
 'generate',
 'recognize',
 'device',
 'communication',
 'enables',
 'languagenlp',
 'communicate',
 'enable',
 'us',
 'intelligence',
 'artificial',
 'science',
 'subfield',
 'processing',
 'generative',
 'large',
 'skill',
 'voiceoperated',
 'system',
 'assistant',
 'smartphonesnlp',
 'also',
 'play',
 'growing',
 'role',
 'enterprise',
 'solution',
 'help',
 'streamline',
 'automate',
 'operation',
 'increase',
 'employee',
 'productivity',
 'simplify',
 'gps',
 'command',
 'missioncritical',
 'spoken',
 'llms',
 'ability',
 'image',
 'generation',
 'request',
 'already',
 'part',
 'everyday',
 'life',
 'many',
 'powering',
 'search',
 'engine'

In [8]:
model.corpus_count

2

In [9]:
sv = []
for sentence in corpus:
    wv = [model.wv[word] for word in sentence if word in model.wv]
    
    if wv:  
        sentence_vector = np.mean(wv, axis=0)  
    else:
        sentence_vector = np.zeros(model.vector_size) 
    
    sv.append(sentence_vector)
sv   
    

[array([-2.8959188e-05,  4.7350611e-04,  6.4723019e-04,  1.9013547e-04,
        -2.7775078e-04, -1.3800929e-03,  1.0569778e-03,  1.4916268e-03,
        -1.0229605e-03, -1.4249383e-03,  6.7169784e-04, -1.5043070e-03,
        -3.3956437e-04,  6.4421369e-04,  4.6630049e-04, -2.8897010e-04,
         1.4717258e-03,  2.3834567e-04, -5.5088819e-04, -2.3022473e-03,
         2.0914518e-04,  6.6685065e-04,  1.4642999e-03, -5.3536781e-04,
        -2.0381941e-04,  8.9065533e-04, -5.0788769e-04,  4.2513173e-04,
        -7.1755279e-04,  6.0918741e-04,  1.1077836e-03, -9.1778173e-04,
         1.0007798e-03, -2.8632958e-03, -6.9332232e-06,  9.3185349e-04,
         1.1620580e-03, -5.5402680e-04, -4.6247084e-04, -7.7671767e-04,
         1.8634855e-04, -1.4470353e-03, -7.3379301e-04,  2.0319106e-04,
         1.1749469e-03, -9.4917422e-04, -8.1605773e-04, -5.8202480e-04,
         5.6263048e-04,  1.9837706e-04,  3.5234602e-04, -5.5839756e-04,
        -3.2755287e-04,  9.7095457e-05, -5.1894702e-04,  1.49667

In [10]:
dv = np.mean(sv, axis=0)
dv

array([-2.4410376e-04,  8.6047186e-04, -2.2189581e-04,  1.9386223e-04,
        7.3233055e-04, -1.6012976e-03,  5.7874556e-04,  2.1274395e-03,
       -9.5612125e-04, -1.0521092e-03, -2.8578864e-04, -1.2988085e-03,
       -6.7722271e-05,  6.6066656e-04,  1.0958965e-03, -4.8451358e-04,
       -1.0269688e-04, -4.3760118e-04, -3.2028338e-04, -1.8759780e-03,
        6.3030352e-04,  3.8383741e-04,  7.4054743e-04, -4.9895589e-04,
       -3.4983957e-04, -3.3313088e-04, -8.9975924e-04, -1.6674247e-04,
       -5.1248760e-04,  2.4475864e-04,  5.5249600e-04, -3.5172229e-04,
        5.7013339e-04, -1.2517020e-03, -3.8835386e-04,  1.1906822e-03,
        4.9909839e-04, -9.7502241e-05, -8.0650003e-05, -1.2115105e-03,
        9.1612696e-05, -2.3830263e-04, -2.5329282e-04, -3.9160637e-05,
        5.2928360e-04, -6.0719409e-05, -7.5653201e-04, -1.8650478e-04,
        8.7952078e-04,  5.7746249e-04,  2.7452191e-04, -7.0903281e-04,
        3.9570063e-05, -2.5271781e-04,  2.0994764e-04, -3.0971269e-04,
      

In [11]:
sentence_score = []
for i, sv in enumerate(sv):
    score = np.dot(sv, dv)  
    rs = " ".join(corpus[i])
    sentence_score.append((rs, score))

sentence_score

[('natural language processing nlp subfield computer science artificial intelligence ai us machine learning enable computer understand communicate human languagenlp enables computer digital device recognize understand generate text speech combining computational linguisticsthe rulebased modeling human languagetogether statistical modeling machine learning ml deep learningnlp research ha enabled era generative ai communication skill large language model llms ability image generation model understand request',
  4.919895e-05),
 ('nlp already part everyday life many powering search engine prompting chatbots customer service spoken command voiceoperated gps system digital assistant smartphonesnlp also play growing role enterprise solution help streamline automate business operation increase employee productivity simplify missioncritical business process',
  5.9663125e-05)]

In [12]:
sentence_score.sort(key=lambda x: x[1], reverse=True)

In [13]:
summary_length = 1

In [14]:
summary = " ".join([sentence for sentence, score in sentence_score[:summary_length]])

In [15]:
summary

'nlp already part everyday life many powering search engine prompting chatbots customer service spoken command voiceoperated gps system digital assistant smartphonesnlp also play growing role enterprise solution help streamline automate business operation increase employee productivity simplify missioncritical business process'