# Text Analysis
Name: Sourabh Maniyar<br>
Roll no.: 31251

## Import Python Modules

In [1]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pandas as pd
import numpy as np
import nltk

## Extract Text from Document

In [2]:
text = open('text_doc.txt').read()
text

'Hello everyone! This is a text analysis assignment. We will be using preprocessing methods like Tokenization, POS tagging , stop words removal, stemming and lemmatization.'

## Tokenization
Tokenization is the process of breaking down the given text in natural language processing into the smallest unit in a sentence called a token.

In [3]:
tokens_sent = nltk.sent_tokenize(text)
tokens_sent

['Hello everyone!',
 'This is a text analysis assignment.',
 'We will be using preprocessing methods like Tokenization, POS tagging , stop words removal, stemming and lemmatization.']

In [4]:
tokens_words = nltk.word_tokenize(text)
print(tokens_words)

['Hello', 'everyone', '!', 'This', 'is', 'a', 'text', 'analysis', 'assignment', '.', 'We', 'will', 'be', 'using', 'preprocessing', 'methods', 'like', 'Tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', 'and', 'lemmatization', '.']


## POS Tagging
Part of Speech Tagging (POS-Tag) is the labeling of the words in a text according to their word types (noun, adjective, adverb, verb, etc.). <br>
List of tags: https://www.guru99.com/pos-tagging-chunking-nltk.html

In [5]:
tagged = nltk.pos_tag(tokens_words)
print(tagged)

[('Hello', 'NNP'), ('everyone', 'NN'), ('!', '.'), ('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('text', 'JJ'), ('analysis', 'NN'), ('assignment', 'NN'), ('.', '.'), ('We', 'PRP'), ('will', 'MD'), ('be', 'VB'), ('using', 'VBG'), ('preprocessing', 'VBG'), ('methods', 'NNS'), ('like', 'IN'), ('Tokenization', 'NNP'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]


## Stop Words Removal
It is a process of removing words which don't have strong meaning like articles, 'and', 'it's'.

In [6]:
from nltk.corpus import stopwords

In [7]:
stop_words = stopwords.words('english')
print(stop_words[:10])
len(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


179

In [8]:
tokens_words_lowercase = []
for word in tokens_words:
    tokens_words_lowercase.append(word.lower())
print(tokens_words_lowercase)

['hello', 'everyone', '!', 'this', 'is', 'a', 'text', 'analysis', 'assignment', '.', 'we', 'will', 'be', 'using', 'preprocessing', 'methods', 'like', 'tokenization', ',', 'pos', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', 'and', 'lemmatization', '.']


In [9]:
clean_token = []
for i in tokens_words_lowercase:
    if i not in stop_words:
        clean_token.append(i)
print(clean_token)

['hello', 'everyone', '!', 'text', 'analysis', 'assignment', '.', 'using', 'preprocessing', 'methods', 'like', 'tokenization', ',', 'pos', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', 'lemmatization', '.']


## Stemming
Stemming is the process of finding the root of words.

In [10]:
from nltk.stem import PorterStemmer

In [11]:
stemmer = PorterStemmer()
stem_token = []
for word in tokens_words_lowercase:
    stem_token.append(stemmer.stem(word))
print(stem_token)

['hello', 'everyon', '!', 'thi', 'is', 'a', 'text', 'analysi', 'assign', '.', 'we', 'will', 'be', 'use', 'preprocess', 'method', 'like', 'token', ',', 'po', 'tag', ',', 'stop', 'word', 'remov', ',', 'stem', 'and', 'lemmat', '.']


## Lemmatization
Lemmatization is the process of finding the form of the related word in the dictionary. It is different from Stemming.

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [13]:
lemmatized_token = []
for word in tokens_words_lowercase:
    lemmatized_token.append(lemmatizer.lemmatize(word))
print(lemmatized_token)

['hello', 'everyone', '!', 'this', 'is', 'a', 'text', 'analysis', 'assignment', '.', 'we', 'will', 'be', 'using', 'preprocessing', 'method', 'like', 'tokenization', ',', 'po', 'tagging', ',', 'stop', 'word', 'removal', ',', 'stemming', 'and', 'lemmatization', '.']


## Term Frequency and Inverse Document Frequency

### Using Formulas

In [14]:
corpus = tokens_sent
corpus # contains multiple documents

['Hello everyone!',
 'This is a text analysis assignment.',
 'We will be using preprocessing methods like Tokenization, POS tagging , stop words removal, stemming and lemmatization.']

In [15]:
words_set = set()
for doc in corpus:
    words = doc.split(" ")
    words_set = words_set.union(set(words))
print("Number of words in the corpus: ", len(words_set))
print("Words in the corpus: ",words_set)

Number of words in the corpus:  25
Words in the corpus:  {'Tokenization,', 'This', 'text', 'everyone!', 'be', 'analysis', ',', 'stop', 'removal,', 'stemming', 'Hello', 'We', 'like', 'words', 'tagging', 'assignment.', 'POS', 'preprocessing', 'methods', 'and', 'is', 'a', 'will', 'lemmatization.', 'using'}


<b>Term Frequency</b>:<br>


    TF(term,document) = count of term in document / number of words in document

In [16]:
n_docs = len(corpus)
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)

for i in range(n_docs):
    words = corpus[i].split(" ")
    for word in words:
        df_tf[word][i] = df_tf[word][i] + (1/len(words))
df_tf

Unnamed: 0,"Tokenization,",This,text,everyone!,be,analysis,",",stop,"removal,",stemming,...,assignment.,POS,preprocessing,methods,and,is,a,will,lemmatization.,using
0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.166667,0.166667,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,...,0.166667,0.0,0.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0
2,0.058824,0.0,0.0,0.0,0.058824,0.0,0.058824,0.058824,0.058824,0.058824,...,0.0,0.058824,0.058824,0.058824,0.058824,0.0,0.0,0.058824,0.058824,0.058824


<b>Inverse Document Frequency</b>:<br>
    Document Frequency : df(term) = occurence of term in document<br>
    
    
    idf(term) = log(count of corpus / (df + 1))

In [17]:
print("IDF of: ")

idf = {}

for word in words_set:
    k = 0
    for i in range(n_docs):
        if word in corpus[i].split():
            k += 1
    idf[word] = np.log10(n_docs / k)
    print("{}: {}".format(word, idf[word]))

IDF of: 
Tokenization,: 0.47712125471966244
This: 0.47712125471966244
text: 0.47712125471966244
everyone!: 0.47712125471966244
be: 0.47712125471966244
analysis: 0.47712125471966244
,: 0.47712125471966244
stop: 0.47712125471966244
removal,: 0.47712125471966244
stemming: 0.47712125471966244
Hello: 0.47712125471966244
We: 0.47712125471966244
like: 0.47712125471966244
words: 0.47712125471966244
tagging: 0.47712125471966244
assignment.: 0.47712125471966244
POS: 0.47712125471966244
preprocessing: 0.47712125471966244
methods: 0.47712125471966244
and: 0.47712125471966244
is: 0.47712125471966244
a: 0.47712125471966244
will: 0.47712125471966244
lemmatization.: 0.47712125471966244
using: 0.47712125471966244


<b>TF-IDF</b>: <br>
    
    tf-idf(term, document) = tf(term, document) * idf(term)

In [18]:
df_tf_idf = df_tf.copy()

for word in words_set:
    for i in range(n_docs):
        df_tf_idf[word][i] = df_tf[word][i] * idf[word]
df_tf_idf

Unnamed: 0,"Tokenization,",This,text,everyone!,be,analysis,",",stop,"removal,",stemming,...,assignment.,POS,preprocessing,methods,and,is,a,will,lemmatization.,using
0,0.0,0.0,0.0,0.238561,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.07952,0.07952,0.0,0.0,0.07952,0.0,0.0,0.0,0.0,...,0.07952,0.0,0.0,0.0,0.0,0.07952,0.07952,0.0,0.0,0.0
2,0.028066,0.0,0.0,0.0,0.028066,0.0,0.028066,0.028066,0.028066,0.028066,...,0.0,0.028066,0.028066,0.028066,0.028066,0.0,0.0,0.028066,0.028066,0.028066


### Using In-built Functions from Scikit-learn

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
tr_idf_model  = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(corpus) # vectorizing the corpus by the function, a sparse matrix is obtained.

In [21]:
print(type(tf_idf_vector), tf_idf_vector.shape)

<class 'scipy.sparse.csr.csr_matrix'> (3, 23)


In [22]:
tf_idf_array = tf_idf_vector.toarray()

print(tf_idf_array) # convert to an regular array to get a better idea of the values

[[0.         0.         0.         0.         0.70710678 0.70710678
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.4472136  0.         0.4472136  0.         0.         0.
  0.4472136  0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.4472136  0.4472136
  0.         0.         0.         0.         0.        ]
 [0.         0.25       0.         0.25       0.         0.
  0.         0.25       0.25       0.25       0.25       0.25
  0.25       0.25       0.25       0.25       0.         0.
  0.25       0.25       0.25       0.25       0.25      ]]


In [23]:
words_set = tr_idf_model.get_feature_names() # obtain the original terms in the corpus by using get_feature_names

print(words_set)

['analysis', 'and', 'assignment', 'be', 'everyone', 'hello', 'is', 'lemmatization', 'like', 'methods', 'pos', 'preprocessing', 'removal', 'stemming', 'stop', 'tagging', 'text', 'this', 'tokenization', 'using', 'we', 'will', 'words']


In [24]:
df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set) # TF-IDF scores of each document

df_tf_idf

Unnamed: 0,analysis,and,assignment,be,everyone,hello,is,lemmatization,like,methods,...,stemming,stop,tagging,text,this,tokenization,using,we,will,words
0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.447214,0.0,0.447214,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,...,0.0,0.0,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0
2,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.25,0.25,0.25,...,0.25,0.25,0.25,0.0,0.0,0.25,0.25,0.25,0.25,0.25
