In [11]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from docx import Document

# Download the NLTK data packages this script requests, in a fixed order.
# 'stopwords' backs stopwords.words('english') below; 'punkt' / 'punkt_tab'
# are presumably the tokenizer models word_tokenize relies on — TODO confirm.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

def load_document(file_path):
    """Read a Word document and return its paragraph text as one string.

    Args:
        file_path: Location of the document, passed straight to ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, in
        order, joined with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

def tokenize_document(document):
    """Split text into lowercase, purely alphabetic tokens.

    Args:
        document: The text to tokenize with ``word_tokenize``.

    Returns:
        A list of lowercased tokens. Any token for which ``str.isalpha()``
        is false (punctuation, numbers, tokens containing digits or
        symbols) is dropped.
    """
    raw_tokens = word_tokenize(document)
    # Filter on the original token first, then lowercase what survives.
    alphabetic_tokens = (token for token in raw_tokens if token.isalpha())
    return [token.lower() for token in alphabetic_tokens]

def remove_stopwords(tokens):
    """Drop every token that appears in NLTK's English stopword list.

    Args:
        tokens: Iterable of word tokens. Matching is exact membership, so
            a token must equal a stopword entry character for character
            to be removed.

    Returns:
        A list of the remaining tokens in their original order.
    """
    # Build the lookup once so each membership test is a hash lookup.
    excluded = frozenset(stopwords.words('english'))
    return list(filter(lambda token: token not in excluded, tokens))

def find_morphology(tokens, top_n=10):
    """Return the most frequent tokens together with their counts.

    Despite its name, this function performs a plain frequency count; it
    does no morphological analysis.

    Args:
        tokens: Iterable of hashable tokens to count.
        top_n: Maximum number of ``(token, count)`` pairs to return.
            Defaults to 10, the value that was previously hard-coded.
            ``None`` returns every distinct token.

    Returns:
        A list of ``(token, count)`` tuples sorted by descending count.
        Tokens with equal counts keep the order in which they were first
        seen. An empty input yields an empty list.
    """
    # Local import keeps this block self-contained. nltk's FreqDist is a
    # Counter subclass and most_common comes from Counter, so the ranking
    # is the same as before.
    from collections import Counter

    return Counter(tokens).most_common(top_n)


# Driver: load one Word document, tokenize it, drop English stopwords and
# print the most frequent remaining words.
# NOTE(review): the path is hard-coded and looks like a Colab working
# directory — confirm the file exists before running elsewhere.
document_path = '/content/text-to-word.docx'
document = load_document(document_path)

# Each stage consumes the previous stage's output; the intermediate results
# are kept as module-level names.
tokens = tokenize_document(document)
tokens_without_stopwords = remove_stopwords(tokens)
morphology = find_morphology(tokens_without_stopwords)


# `morphology` is a list of (word, count) pairs; print one pair per line.
print("Top 10 frequent words in the document (after removing stopwords):")
for word, frequency in morphology:
    print(f"{word}: {frequency}")

Top 10 frequent words in the document (after removing stopwords):
language: 3
nlp: 3
human: 2
machine: 2
learning: 2
natural: 1
processing: 1
field: 1
artificial: 1
intelligence: 1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
