In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
import numpy as np

In [2]:
doc="""Natural language processing (NLP) is the ability of a computer program to understand human language as it's spoken and written -- referred to as natural language. It's a component of artificial intelligence (AI).

NLP has existed for more than 50 years and has roots in the field of linguistics. It has a variety of real-world applications in numerous fields, including medical research, search engines and business intelligence.

NLP uses either rule-based or machine learning approaches to understand the structure and meaning of text. It plays a role in chatbots, voice assistants, text-based scanning programs, translation applications and enterprise software that aids in business operations, increases productivity and simplifies different processes.

"""

In [3]:
# Tokenization
tokens = word_tokenize(doc)


In [4]:
# POS Tagging
pos_tags = pos_tag(tokens)


In [5]:
# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]


In [6]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]


In [7]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]


In [8]:
# Print Results
print("Sample Document:")
print(doc)

Sample Document:
Natural language processing (NLP) is the ability of a computer program to understand human language as it's spoken and written -- referred to as natural language. It's a component of artificial intelligence (AI).

NLP has existed for more than 50 years and has roots in the field of linguistics. It has a variety of real-world applications in numerous fields, including medical research, search engines and business intelligence.

NLP uses either rule-based or machine learning approaches to understand the structure and meaning of text. It plays a role in chatbots, voice assistants, text-based scanning programs, translation applications and enterprise software that aids in business operations, increases productivity and simplifies different processes.




In [9]:
print("\nTokenization:")
print(tokens)


Tokenization:
['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'the', 'ability', 'of', 'a', 'computer', 'program', 'to', 'understand', 'human', 'language', 'as', 'it', "'s", 'spoken', 'and', 'written', '--', 'referred', 'to', 'as', 'natural', 'language', '.', 'It', "'s", 'a', 'component', 'of', 'artificial', 'intelligence', '(', 'AI', ')', '.', 'NLP', 'has', 'existed', 'for', 'more', 'than', '50', 'years', 'and', 'has', 'roots', 'in', 'the', 'field', 'of', 'linguistics', '.', 'It', 'has', 'a', 'variety', 'of', 'real-world', 'applications', 'in', 'numerous', 'fields', ',', 'including', 'medical', 'research', ',', 'search', 'engines', 'and', 'business', 'intelligence', '.', 'NLP', 'uses', 'either', 'rule-based', 'or', 'machine', 'learning', 'approaches', 'to', 'understand', 'the', 'structure', 'and', 'meaning', 'of', 'text', '.', 'It', 'plays', 'a', 'role', 'in', 'chatbots', ',', 'voice', 'assistants', ',', 'text-based', 'scanning', 'programs', ',', 'translation', 'applicati

In [10]:
print("\nPOS Tagging:")
print(pos_tags)


POS Tagging:
[('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('the', 'DT'), ('ability', 'NN'), ('of', 'IN'), ('a', 'DT'), ('computer', 'NN'), ('program', 'NN'), ('to', 'TO'), ('understand', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('as', 'IN'), ('it', 'PRP'), ("'s", 'VBZ'), ('spoken', 'JJ'), ('and', 'CC'), ('written', 'VBN'), ('--', ':'), ('referred', 'VBD'), ('to', 'TO'), ('as', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('a', 'DT'), ('component', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('(', '('), ('AI', 'NNP'), (')', ')'), ('.', '.'), ('NLP', 'NNP'), ('has', 'VBZ'), ('existed', 'VBN'), ('for', 'IN'), ('more', 'JJR'), ('than', 'IN'), ('50', 'CD'), ('years', 'NNS'), ('and', 'CC'), ('has', 'VBZ'), ('roots', 'VBN'), ('in', 'IN'), ('the', 'DT'), ('field', 'NN'), ('of', 'IN'), ('linguistics', 'NNS'), ('.', '.'), ('It', 'PRP'), ('has', 'VBZ'), ('a

In [11]:
print("\nStop Words Removal:")
print(filtered_tokens)


Stop Words Removal:
['Natural', 'language', 'processing', '(', 'NLP', ')', 'ability', 'computer', 'program', 'understand', 'human', 'language', "'s", 'spoken', 'written', '--', 'referred', 'natural', 'language', '.', "'s", 'component', 'artificial', 'intelligence', '(', 'AI', ')', '.', 'NLP', 'existed', '50', 'years', 'roots', 'field', 'linguistics', '.', 'variety', 'real-world', 'applications', 'numerous', 'fields', ',', 'including', 'medical', 'research', ',', 'search', 'engines', 'business', 'intelligence', '.', 'NLP', 'uses', 'either', 'rule-based', 'machine', 'learning', 'approaches', 'understand', 'structure', 'meaning', 'text', '.', 'plays', 'role', 'chatbots', ',', 'voice', 'assistants', ',', 'text-based', 'scanning', 'programs', ',', 'translation', 'applications', 'enterprise', 'software', 'aids', 'business', 'operations', ',', 'increases', 'productivity', 'simplifies', 'different', 'processes', '.']


In [12]:
print("\nStemming:")
print(stemmed_tokens)


Stemming:
['natur', 'languag', 'process', '(', 'nlp', ')', 'abil', 'comput', 'program', 'understand', 'human', 'languag', "'s", 'spoken', 'written', '--', 'refer', 'natur', 'languag', '.', "'s", 'compon', 'artifici', 'intellig', '(', 'ai', ')', '.', 'nlp', 'exist', '50', 'year', 'root', 'field', 'linguist', '.', 'varieti', 'real-world', 'applic', 'numer', 'field', ',', 'includ', 'medic', 'research', ',', 'search', 'engin', 'busi', 'intellig', '.', 'nlp', 'use', 'either', 'rule-bas', 'machin', 'learn', 'approach', 'understand', 'structur', 'mean', 'text', '.', 'play', 'role', 'chatbot', ',', 'voic', 'assist', ',', 'text-bas', 'scan', 'program', ',', 'translat', 'applic', 'enterpris', 'softwar', 'aid', 'busi', 'oper', ',', 'increas', 'product', 'simplifi', 'differ', 'process', '.']


In [13]:
print("\nLemmatization:")
print(lemmatized_tokens)


Lemmatization:
['Natural', 'language', 'processing', '(', 'NLP', ')', 'ability', 'computer', 'program', 'understand', 'human', 'language', "'s", 'spoken', 'written', '--', 'referred', 'natural', 'language', '.', "'s", 'component', 'artificial', 'intelligence', '(', 'AI', ')', '.', 'NLP', 'existed', '50', 'year', 'root', 'field', 'linguistics', '.', 'variety', 'real-world', 'application', 'numerous', 'field', ',', 'including', 'medical', 'research', ',', 'search', 'engine', 'business', 'intelligence', '.', 'NLP', 'us', 'either', 'rule-based', 'machine', 'learning', 'approach', 'understand', 'structure', 'meaning', 'text', '.', 'play', 'role', 'chatbots', ',', 'voice', 'assistant', ',', 'text-based', 'scanning', 'program', ',', 'translation', 'application', 'enterprise', 'software', 'aid', 'business', 'operation', ',', 'increase', 'productivity', 'simplifies', 'different', 'process', '.']


In [14]:
# Function to calculate Term Frequency (TF)
def calculate_tf(word, document):
  tokens = word_tokenize(document)
  return tokens.count(word) / len(tokens)

In [15]:
# Function to calculate Document Frequency (assuming a small document collection here)
def calculate_df(word, documents):
  df = 0
  for doc in documents:
    if calculate_tf(word, doc) > 0:
      df += 1
  return df

In [16]:
# Function to calculate TF-IDF
def calculate_tfidf(word, document, documents):
  tf = calculate_tf(word, document)
  df = calculate_df(word, documents)
  idf = 1 + np.log(len(documents) / (df + 1))  # Using NumPy for log (replace with appropriate library if not available)
  return tf * idf

In [17]:
# Document Collection (assuming this is a small collection for demonstration)
documents = [doc]

# TF-IDF Calculation (example for word "language")
word = "in"
tfidf = calculate_tfidf(word, doc, documents)

In [18]:

print("\nTF-IDF for word '{}':".format(word))
print(tfidf)


TF-IDF for word 'in':
0.00958915060750171
