<!-- Text Analytics

1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document
Frequency. -->

In [10]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
sample_document = """
Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages. 
As such, NLP is related to the area of human–computer interaction. 
Many challenges in NLP involve natural language understanding, that is, enabling computers to derive meaning from human or natural language input, and others involve natural language generation. 
"""


In [3]:
#Download the below files if error occurs

# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [4]:
#Tokenization
tokens = word_tokenize(sample_document)


In [5]:
#POS Tagging
pos_tags = pos_tag(tokens)


In [6]:
#Stopwords Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]


In [7]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]


In [11]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]


In [12]:
corpus = [sample_document]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())


In [13]:
tfidf_df

Unnamed: 0,and,area,artificial,as,between,challenges,computational,computer,computers,concerned,...,others,processing,related,science,such,that,the,to,understanding,with
0,0.262111,0.08737,0.08737,0.08737,0.08737,0.08737,0.08737,0.174741,0.174741,0.08737,...,0.08737,0.08737,0.08737,0.08737,0.08737,0.08737,0.174741,0.174741,0.08737,0.08737
