In [4]:
pip install python-docx


Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184489 sha256=4e325c505737fcac5e00d9b818d03c51321a00977d1bed28ca441b536ba458d8
  Stored in directory: /root/.cache/pip/wheels/80/27/06/837436d4c3bd989b957a91679966f207bfd71d358d63a8194d
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [5]:
import docx
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.tag import pos_tag

In [7]:
import docx

def read_word_file(file_path):
    doc = docx.Document(file_path)
    content = [para.text for para in doc.paragraphs]
    return "\n".join(content)

In [8]:
file_path = "/content/Standard_Service_Agreement[1].docx"
text_content = read_word_file(file_path)

Tokenization

In [9]:
def tokenize_and_count(text):
    tokens = word_tokenize(text)
    return len(tokens), tokens

Stop-word removal and count words

In [11]:
def remove_stop_words_and_count(words):
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return len(filtered_words), filtered_words

Build vocabulary (unique words) and find its size

In [12]:
def build_vocabulary(words):
    vocabulary = set(words)
    return len(vocabulary), vocabulary

Stemming and count words

In [13]:
def stem_and_count(words):
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
    return len(stemmed_words), stemmed_words

Lemmatization and count words

In [14]:
def lemmatize_and_count(words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return len(lemmatized_words), lemmatized_words

Get most frequent words after stop-word removal

In [15]:
def get_most_frequent_words(words, n=10):
    fdist = FreqDist(words)
    most_frequent = fdist.most_common(n)
    return most_frequent

Count Nouns and Adjectives after stemming/lemmatization

In [16]:
def count_nouns_and_adjectives(words):
    tagged_words = pos_tag(words)
    nouns = len([word for word, tag in tagged_words if tag.startswith('NN')])
    adjectives = len([word for word, tag in tagged_words if tag.startswith('JJ')])
    return nouns, adjectives

In [21]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [27]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [28]:
input_file_path = "/content/Standard_Service_Agreement[1].docx"
# Read the content of the Word file
text_content = read_word_file(input_file_path)
# Tokenization and count tokens
num_tokens, tokens = tokenize_and_count(text_content)
# Stop-word removal and count words
num_words_without_stopwords, words_without_stopwords = remove_stop_words_and_count(tokens)
# Build vocabulary and find its size
vocab_size, vocabulary = build_vocabulary(words_without_stopwords)
# Stemming and count words
num_words_stemmed, stemmed_words = stem_and_count(words_without_stopwords)
# Lemmatization and count words
num_words_lemmatized, lemmatized_words = lemmatize_and_count(words_without_stopwords)
# Get 10 most frequent words after stop-word removal
most_frequent_words = get_most_frequent_words(words_without_stopwords, n=10)
# Count Nouns and Adjectives after stemming/lemmatization
num_nouns, num_adjectives = count_nouns_and_adjectives(stemmed_words)

In [29]:
# Display the results
print("Number of Tokens:", num_tokens)
print("Number of Words (after stop-word removal):", num_words_without_stopwords)
print("Vocabulary Size:", vocab_size)
print("Number of Words (after stemming):", num_words_stemmed)
print("Number of Words (after lemmatization):", num_words_lemmatized)
print("Most 10 Frequent Words:", most_frequent_words)
print("Number of Nouns:", num_nouns)
print("Number of Adjectives:", num_adjectives)

Number of Tokens: 9401
Number of Words (after stop-word removal): 5838
Vocabulary Size: 1465
Number of Words (after stemming): 5838
Number of Words (after lemmatization): 5838
Most 10 Frequent Words: [(',', 542), ('Service', 197), ('Provider', 192), ('.', 176), ('NetApp', 172), ('shall', 137), ('Agreement', 116), (')', 109), ('(', 105), ('Information', 48)]
Number of Nouns: 3016
Number of Adjectives: 881
