# Text Analytics
- 	Extract Sample document and apply following document preprocessing methods: Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
- 	Create representation of document by calculating Term Frequency and Inverse Document Frequency


In [11]:
import nltk,re

In [12]:
with open('doc.txt', 'r') as file:
    doc = file.read()


In [13]:
doc

"Mumbai Indians,the IPL (Indian Premier League) cricket team. The Mumbai Indians (MI) are one of the most successful and popular teams in the IPL, owned by Mukesh Ambani's Reliance Industries. They have won the IPL title multiple times, with star players like Rohit Sharma, Jasprit Bumrah, and Kieron Pollard being part of their squad."

# Tokenization
Tokenization is the process of breaking down a text into individual words or tokens. This is often the first step in natural language processing tasks

In [14]:
word_tokens = nltk.word_tokenize(doc)
print(word_tokens)

['Mumbai', 'Indians', ',', 'the', 'IPL', '(', 'Indian', 'Premier', 'League', ')', 'cricket', 'team', '.', 'The', 'Mumbai', 'Indians', '(', 'MI', ')', 'are', 'one', 'of', 'the', 'most', 'successful', 'and', 'popular', 'teams', 'in', 'the', 'IPL', ',', 'owned', 'by', 'Mukesh', 'Ambani', "'s", 'Reliance', 'Industries', '.', 'They', 'have', 'won', 'the', 'IPL', 'title', 'multiple', 'times', ',', 'with', 'star', 'players', 'like', 'Rohit', 'Sharma', ',', 'Jasprit', 'Bumrah', ',', 'and', 'Kieron', 'Pollard', 'being', 'part', 'of', 'their', 'squad', '.']


# Stop Words
Stop words are common words like 'the', 'is', 'and', etc., which often do not carry significant meaning in text analysis. Remove these stop words from the text to focus on the more meaningful content.

In [15]:
stop_words = set(nltk.corpus.stopwords.words('english'))
word_tokens = [token for token in word_tokens if token not in stop_words]
print(word_tokens)

['Mumbai', 'Indians', ',', 'IPL', '(', 'Indian', 'Premier', 'League', ')', 'cricket', 'team', '.', 'The', 'Mumbai', 'Indians', '(', 'MI', ')', 'one', 'successful', 'popular', 'teams', 'IPL', ',', 'owned', 'Mukesh', 'Ambani', "'s", 'Reliance', 'Industries', '.', 'They', 'IPL', 'title', 'multiple', 'times', ',', 'star', 'players', 'like', 'Rohit', 'Sharma', ',', 'Jasprit', 'Bumrah', ',', 'Kieron', 'Pollard', 'part', 'squad', '.']


# POS Tagging
POS tagging involves labeling each word in a sentence with its corresponding part of speech, such as noun, verb, adjective, etc.

In [18]:
tagged = nltk.pos_tag(word_tokens)
print(tagged)

[('Mumbai', 'NNP'), ('Indians', 'NNPS'), (',', ','), ('IPL', 'NNP'), ('(', '('), ('Indian', 'JJ'), ('Premier', 'NNP'), ('League', 'NNP'), (')', ')'), ('cricket', 'NN'), ('team', 'NN'), ('.', '.'), ('The', 'DT'), ('Mumbai', 'NNP'), ('Indians', 'NNPS'), ('(', '('), ('MI', 'NNP'), (')', ')'), ('one', 'CD'), ('successful', 'JJ'), ('popular', 'JJ'), ('teams', 'NNS'), ('IPL', 'NNP'), (',', ','), ('owned', 'VBD'), ('Mukesh', 'NNP'), ('Ambani', 'NNP'), ("'s", 'POS'), ('Reliance', 'NNP'), ('Industries', 'NNPS'), ('.', '.'), ('They', 'PRP'), ('IPL', 'NNP'), ('title', 'NN'), ('multiple', 'JJ'), ('times', 'NNS'), (',', ','), ('star', 'NN'), ('players', 'NNS'), ('like', 'IN'), ('Rohit', 'NNP'), ('Sharma', 'NNP'), (',', ','), ('Jasprit', 'NNP'), ('Bumrah', 'NNP'), (',', ','), ('Kieron', 'NNP'), ('Pollard', 'NNP'), ('part', 'NN'), ('squad', 'NN'), ('.', '.')]


# Lemmatization


In [19]:
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in word_tokens]
print(lemmatized_tokens)

['Mumbai', 'Indians', ',', 'IPL', '(', 'Indian', 'Premier', 'League', ')', 'cricket', 'team', '.', 'The', 'Mumbai', 'Indians', '(', 'MI', ')', 'one', 'successful', 'popular', 'team', 'IPL', ',', 'owned', 'Mukesh', 'Ambani', "'s", 'Reliance', 'Industries', '.', 'They', 'IPL', 'title', 'multiple', 'time', ',', 'star', 'player', 'like', 'Rohit', 'Sharma', ',', 'Jasprit', 'Bumrah', ',', 'Kieron', 'Pollard', 'part', 'squad', '.']


# Stemming

In [20]:
stemmer = nltk.stem.PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in word_tokens]
print(stemmed_tokens)

['mumbai', 'indian', ',', 'ipl', '(', 'indian', 'premier', 'leagu', ')', 'cricket', 'team', '.', 'the', 'mumbai', 'indian', '(', 'mi', ')', 'one', 'success', 'popular', 'team', 'ipl', ',', 'own', 'mukesh', 'ambani', "'s", 'relianc', 'industri', '.', 'they', 'ipl', 'titl', 'multipl', 'time', ',', 'star', 'player', 'like', 'rohit', 'sharma', ',', 'jasprit', 'bumrah', ',', 'kieron', 'pollard', 'part', 'squad', '.']


# TF-IDF

In [21]:
def calculate_term_freq(doc):
    word_tokens = nltk.word_tokenize(doc)
    tf_dict = dict()
    for word in word_tokens:
        tf_dict[word] = word_tokens.count(word)
    tf = dict()
    for word, count in tf_dict.items():
        tf[word] = count/len(tf_dict)
    return tf

In [23]:
tf = calculate_term_freq(doc)
print("Term Frequency of Doc : ", tf, "\n\n")


Term Frequency of Doc :  {'Mumbai': 0.0392156862745098, 'Indians': 0.0392156862745098, ',': 0.09803921568627451, 'the': 0.0784313725490196, 'IPL': 0.058823529411764705, '(': 0.0392156862745098, 'Indian': 0.0196078431372549, 'Premier': 0.0196078431372549, 'League': 0.0196078431372549, ')': 0.0392156862745098, 'cricket': 0.0196078431372549, 'team': 0.0196078431372549, '.': 0.058823529411764705, 'The': 0.0196078431372549, 'MI': 0.0196078431372549, 'are': 0.0196078431372549, 'one': 0.0196078431372549, 'of': 0.0392156862745098, 'most': 0.0196078431372549, 'successful': 0.0196078431372549, 'and': 0.0392156862745098, 'popular': 0.0196078431372549, 'teams': 0.0196078431372549, 'in': 0.0196078431372549, 'owned': 0.0196078431372549, 'by': 0.0196078431372549, 'Mukesh': 0.0196078431372549, 'Ambani': 0.0196078431372549, "'s": 0.0196078431372549, 'Reliance': 0.0196078431372549, 'Industries': 0.0196078431372549, 'They': 0.0196078431372549, 'have': 0.0196078431372549, 'won': 0.0196078431372549, 'title