In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
import numpy as np

In [2]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes (or fails), instead of leaving it open
# for the lifetime of the kernel.
with open('Language_data.txt','r',errors='ignore') as f:
    input_text=f.read()


In [3]:
# Split the raw text into sentences
print("\nSentence tokenizer:")
sentences = sent_tokenize(input_text)
print(sentences)

# Split the raw text into word tokens; `df` is the token list that the
# lemmatizer, stemmer and chunker cells iterate over
print("\nWord tokenizer:")
df=word_tokenize(input_text)
print(df)

# Tokenize the same text with WordPunctTokenizer for comparison
print("\nWord punct tokenizer:")
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(input_text))


Sentence tokenizer:
['Attracting and retaining the mindshare of your customer base is a challenge that most enterprises are constantly struggling with.', 'To improve your brand recall, you need to constantly generate quality content that is relevant and engaging and properly appropriated for circulation in a variety of outlets.', 'Here comes Generative AI, which offers new capabilities to augment content creation.', 'Using generative AI, Enterprises can create a variety of content like images, videos, and written material and decrease turnaround time.']

Word tokenizer:
['Attracting', 'and', 'retaining', 'the', 'mindshare', 'of', 'your', 'customer', 'base', 'is', 'a', 'challenge', 'that', 'most', 'enterprises', 'are', 'constantly', 'struggling', 'with', '.', 'To', 'improve', 'your', 'brand', 'recall', ',', 'you', 'need', 'to', 'constantly', 'generate', 'quality', 'content', 'that', 'is', 'relevant', 'and', 'engaging', 'and', 'properly', 'appropriated', 'for', 'circulation', 'in', 'a',

In [4]:
lemmatizer = WordNetLemmatizer()

# Column headings for the two part-of-speech settings being compared
lemmatizer_names = ['NOUN LEMMATIZER', 'VERB LEMMATIZER']

# One right-aligned 24-character column for the input word plus one per lemmatizer
formatted_text = '{:>24}' * (len(lemmatizer_names) + 1)
header_row = formatted_text.format('INPUT WORD', *lemmatizer_names)
print('\n', header_row, '\n', '='*75)

# Print one row per token: the token, its noun lemma, then its verb lemma.
# NOTE: tokens are passed through as-is (no lowercasing); in the recorded
# output 'Attracting' comes back unchanged while 'retaining' becomes 'retain'.
for word in df:
    noun_lemma = lemmatizer.lemmatize(word, pos='n')
    verb_lemma = lemmatizer.lemmatize(word, pos='v')
    print(formatted_text.format(word, noun_lemma, verb_lemma))


               INPUT WORD         NOUN LEMMATIZER         VERB LEMMATIZER 
              Attracting              Attracting              Attracting
                     and                     and                     and
               retaining               retaining                  retain
                     the                     the                     the
               mindshare               mindshare               mindshare
                      of                      of                      of
                    your                    your                    your
                customer                customer                customer
                    base                    base                    base
                      is                      is                      be
                       a                       a                       a
               challenge               challenge               challenge
                    that                    that

In [8]:
# Instantiate the three stemmers being compared
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Column headings, one per stemmer
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']

# One right-aligned 16-character column for the input word plus one per stemmer
formatted_text = '{:>16}' * (len(stemmer_names) + 1)
header_row = formatted_text.format('INPUT WORD', *stemmer_names)
print('\n', header_row, '\n', '='*68)

# Print one row per token: the token followed by its Porter, Lancaster and
# Snowball stems, in that order
for word in df:
    stems = [stem(word) for stem in (porter.stem, lancaster.stem, snowball.stem)]
    print(formatted_text.format(word, *stems))


       INPUT WORD          PORTER       LANCASTER        SNOWBALL 
      Attracting         attract         attract         attract
             and             and             and             and
       retaining          retain          retain          retain
             the             the             the             the
       mindshare        mindshar          mindsh        mindshar
              of              of              of              of
            your            your              yo            your
        customer          custom          custom          custom
            base            base             bas            base
              is              is              is              is
               a               a               a               a
       challenge        challeng        challeng        challeng
            that            that            that            that
            most            most            most            most
     enterprises      

In [6]:
def chunker(input_data, N):
    """Split a sequence of tokens into space-joined chunks of at most N tokens.

    Args:
        input_data: Iterable of string tokens to group (for example the list
            produced by ``word_tokenize``).
        N: Maximum number of tokens per chunk; must be a positive integer.

    Returns:
        A list of strings. Each chunk is N tokens joined by single spaces,
        except possibly the last, which holds the remaining tokens. Empty
        input yields an empty list.

    Raises:
        ValueError: If N is less than 1.
    """
    if N < 1:
        raise ValueError('N must be a positive integer')

    # Chunk the argument that was passed in, not a notebook-level global, so
    # the function works on any token list. Materialising it also lets
    # generators and other one-shot iterables be sliced.
    words = list(input_data)

    # Stepping by N never yields an empty slice, so no blank trailing chunk is
    # produced when the token count is an exact multiple of N.
    return [' '.join(words[start:start + N]) for start in range(0, len(words), N)]

if __name__=='__main__':
    # Number of tokens placed in each chunk
    chunk_size = 10

    # Group the tokenized text and report how many chunks were produced
    chunks = chunker(df, chunk_size)
    print('\nNumber of text chunks =', len(chunks), '\n')

    # List the chunks, numbered from 1
    for chunk_number, chunk in enumerate(chunks, start=1):
        print('Chunk', chunk_number, '==>', chunk)


Number of text chunks = 9 

Chunk 1 ==> Attracting and retaining the mindshare of your customer base is
Chunk 2 ==> a challenge that most enterprises are constantly struggling with .
Chunk 3 ==> To improve your brand recall , you need to constantly
Chunk 4 ==> generate quality content that is relevant and engaging and properly
Chunk 5 ==> appropriated for circulation in a variety of outlets . Here
Chunk 6 ==> comes Generative AI , which offers new capabilities to augment
Chunk 7 ==> content creation . Using generative AI , Enterprises can create
Chunk 8 ==> a variety of content like images , videos , and
Chunk 9 ==> written material and decrease turnaround time .


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Read the bag-of-words corpus. The context manager closes the file handle
# as soon as the read finishes (or fails) rather than leaving it open.
with open('BoW_data.txt','r',errors='ignore') as f_new:
    input_text_new=f_new.read()

# Tokenize the text; `df_new` is the token list chunked in the cells below.
df_new=word_tokenize(input_text_new)


In [14]:
def chunker(input_data, N):
    """Split a sequence of tokens into space-joined chunks of at most N tokens.

    NOTE: this cell redefines the `chunker` declared in an earlier cell; the
    most recently executed definition is the one in effect.

    Args:
        input_data: Iterable of string tokens to group (for example the list
            produced by ``word_tokenize``).
        N: Maximum number of tokens per chunk; must be a positive integer.

    Returns:
        A list of strings. Each chunk is N tokens joined by single spaces,
        except possibly the last, which holds the remaining tokens. Empty
        input yields an empty list.

    Raises:
        ValueError: If N is less than 1.
    """
    if N < 1:
        raise ValueError('N must be a positive integer')

    # Chunk the argument that was passed in, not a notebook-level global, so
    # the function works on any token list. Materialising it also lets
    # generators and other one-shot iterables be sliced.
    words = list(input_data)

    # Stepping by N never yields an empty slice, so no blank trailing chunk is
    # produced when the token count is an exact multiple of N.
    return [' '.join(words[start:start + N]) for start in range(0, len(words), N)]

In [17]:
# Number of tokens per text chunk
chunk_size = 250

text_chunks = chunker(df_new, chunk_size)

# Wrap each chunk in a dict carrying its position and its text
chunks = [{'index': count, 'text': chunk} for count, chunk in enumerate(text_chunks)]

# Extract the document term matrix (rows are chunks, columns are terms).
# Integer min_df / max_df are absolute document counts: a term is kept only
# if it occurs in at least 6 and at most 20 chunks.
count_vectorizer = CountVectorizer(min_df=6, max_df=20)
document_term_matrix = count_vectorizer.fit_transform([chunk['text'] for chunk in chunks])

# Extract the vocabulary and display it.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()
vocabulary = np.array(feature_names)
print("\nVocabulary:\n", vocabulary)

# Generate a display name for every chunk
chunk_names = ['Chunk-' + str(i+1) for i in range(len(text_chunks))]

# Print the document term matrix, one row per term and one column per chunk
print("\nDocument term matrix:")
formatted_text = '{:>12}' * (len(chunk_names) + 1)
print('\n', formatted_text.format('Word', *chunk_names), '\n')

# Densify before printing: a sparse row's `.data` holds only the stored
# non-zero entries, so a term missing from any chunk would yield too few
# values for the row template. The dense transpose always has one count per
# chunk, zeros included.
term_counts = document_term_matrix.toarray().T
for word, counts in zip(vocabulary, term_counts):
    output = [word] + [str(freq) for freq in counts]
    print(formatted_text.format(*output))


Vocabulary:
 ['and' 'are' 'for' 'in' 'like' 'models' 'of' 'the' 'to' 'will']

Document term matrix:

         Word     Chunk-1     Chunk-2     Chunk-3     Chunk-4     Chunk-5     Chunk-6 

         and          12          14          10           9          13          11
         are           2           3           3           2           1           1
         for           4           2           2           4           3           3
          in           5           6           7           7           5           4
        like           2           1           4           2           1           4
      models           2           4           1           2           1           2
          of           6           9           7           3           1           1
         the           9           9           7          12           9           6
          to           5           8           8           6           5           4
        will           1           3         