## Task-1

<font face='georgia'>
    <h4><strong>1. Build a TF-IDF vectorizer &amp; compare its results with sklearn:</strong></h4>
</font>



### Corpus

In [None]:
# Toy corpus: a small collection of string documents, used both for the
# sklearn reference run and for the custom TF-IDF implementation.

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

### SkLearn Implementation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit sklearn's vectorizer on the corpus, then encode the same corpus as a
# TF-IDF matrix; `skl_output` is the reference result used for comparison.
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [None]:
# sklearn feature names; they are sorted in alphabetical order by default.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use `get_feature_names_out()` whenever the installed version provides it.

if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain `str` so the printed list looks the same on every version.
print([str(name) for name in feature_names])

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']




In [None]:
# IDF values learned by sklearn's TfidfVectorizer during fit().
# After fitting on the corpus the vocabulary has 9 words, and each has its own IDF value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# Shape of the sklearn TF-IDF output after applying the transform method.

skl_output.shape

(4, 9)

In [None]:
# sklearn TF-IDF values for every document of the above corpus.
# Here the output is a sparse matrix.

print(skl_output)

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 5)	0.5386476208856763
  (1, 3)	0.281088674033753
  (1, 1)	0.6876235979836938
  (2, 8)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 0)	0.511848512707169
  (3, 8)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 2)	0.5802858236844359
  (3, 1)	0.46979138557992045


In [None]:
# sklearn TF-IDF values for every document of the above corpus.
# To understand the output better, the sparse output matrix is converted to a dense matrix before printing.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [None]:
# Write your code here.
# Make sure it's well documented and readable with appropriate comments.
# Compare your results with the above sklearn tfidf vectorizer
# You are not supposed to use any other library apart from the ones given below

from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

In [None]:
def fit(dataset):
    """Build the vocabulary of a corpus.

    Each document is split on single spaces and tokens shorter than two
    characters are dropped. The remaining distinct tokens are sorted and
    numbered from 0.

    Args:
        dataset: list of document strings.

    Returns:
        dict mapping each token to its column index, or None when
        ``dataset`` is not a list.
    """
    if not isinstance(dataset, list):
        return None
    tokens = {
        token
        for document in dataset
        for token in document.split(" ")
        if len(token) >= 2
    }
    return {token: position for position, token in enumerate(sorted(tokens))}
# Build the word -> column-index vocabulary for the toy corpus and show it.
vocab = fit(corpus)
print(vocab)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [None]:
def IDF(dataset,word):
    """Return the number of documents in ``dataset`` that contain ``word``.

    Despite its name, this returns the raw document frequency; the caller
    turns it into an inverse document frequency.

    The word must match a whole whitespace-separated token. A plain
    ``word in row`` test on the document string would also match substrings
    (for example "is" inside "this") and overstate the count.

    Args:
        dataset: iterable of document strings.
        word: token to look for.

    Returns:
        int: count of documents with at least one token equal to ``word``.
    """
    return sum(1 for row in dataset if word in row.split())

def transform(dataset,vocab):
    """Encode ``dataset`` as an L2-normalised sparse TF-IDF matrix.

    For every vocabulary word in a document:
        tf  = occurrences of the word / number of tokens in the document
        idf = 1 + ln((1 + number of documents) / (1 + IDF(dataset, word)))
    The stored value is ``tf * idf``; each row is then L2-normalised.

    Args:
        dataset: sequence of document strings.
        vocab: dict mapping word -> column index.

    Returns:
        Sparse matrix of shape ``(len(dataset), len(vocab))``.
    """
    rows = []
    columns = []
    values = []
    n_documents = len(dataset)
    # IDF depends only on the word, so compute it once per distinct word
    # instead of rescanning the whole corpus for every (document, word) pair.
    idf_cache = {}
    for idx, document in enumerate(dataset):     # for each document in the dataset
        tokens = document.split()
        # for every unique word in the document
        for word, freq in Counter(tokens).items():
            col_index = vocab.get(word, -1)
            if col_index == -1 or len(word) < 2:
                continue
            if word not in idf_cache:
                idf_cache[word] = 1 + math.log((1 + n_documents) / float(1 + IDF(dataset, word)))
            rows.append(idx)
            columns.append(col_index)
            values.append((freq / len(tokens)) * idf_cache[word])
    return normalize(csr_matrix((values, (rows, columns)), shape=(n_documents, len(vocab))), norm='l2')
tf_idf_vect = transform(corpus, vocab)
print(tf_idf_vect)

# The task is to match sklearn: fail loudly if the custom matrix drifts away
# from the reference output computed earlier on the same corpus.
np.testing.assert_allclose(tf_idf_vect.toarray(), skl_output.toarray())

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


## Task-2

<font face='georgia'>
    <h4><strong>2. Implement max features functionality:</strong></h4>
</font>


In [None]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only open "cleaned_strings" when it comes from a trusted source.

import pickle
with open("cleaned_strings", 'rb') as f:
    corpus1 = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus1))

Number of documents in corpus =  746


In [None]:
def fit(dataset):
    """Build the vocabulary of a corpus.

    Each document is split on single spaces and tokens shorter than two
    characters are dropped. The remaining distinct tokens are sorted and
    numbered from 0.

    Args:
        dataset: list of document strings.

    Returns:
        dict mapping each token to its column index, or None when
        ``dataset`` is not a list.
    """
    if not isinstance(dataset, list):
        return None
    distinct_tokens = set()
    for document in dataset:
        distinct_tokens.update(tok for tok in document.split(" ") if len(tok) >= 2)
    ordered_tokens = sorted(distinct_tokens)
    return dict(zip(ordered_tokens, range(len(ordered_tokens))))
# Build the word -> column-index vocabulary for the pickled corpus and show it.
vocab1 = fit(corpus1)
print(vocab1)



In [None]:
def IDF(dataset,word):
    """Return the number of documents in ``dataset`` that contain ``word``.

    Despite its name, this returns the raw document frequency; the caller
    turns it into an inverse document frequency.

    The word must match a whole whitespace-separated token. A plain
    ``word in row`` test on the document string would also match substrings
    (for example "is" inside "this") and overstate the count.

    Args:
        dataset: iterable of document strings.
        word: token to look for.

    Returns:
        int: count of documents with at least one token equal to ``word``.
    """
    document_frequency = 0
    for row in dataset:
        if word in row.split():
            document_frequency += 1
    return document_frequency

def transform(dataset,vocab):
    """Encode ``dataset`` as an L2-normalised sparse TF-IDF matrix.

    For every vocabulary word in a document:
        tf  = occurrences of the word / number of tokens in the document
        idf = 1 + ln((1 + number of documents) / (1 + IDF(dataset, word)))
    The stored value is ``tf * idf``; each row is then L2-normalised.

    Args:
        dataset: sequence of document strings.
        vocab: dict mapping word -> column index.

    Returns:
        Sparse matrix of shape ``(len(dataset), len(vocab))``.
    """
    rows = []
    columns = []
    values = []
    n_documents = len(dataset)
    # IDF depends only on the word, so compute it once per distinct word
    # instead of rescanning the whole corpus for every (document, word) pair.
    idf_cache = {}
    for idx, document in enumerate(dataset):     # for each document in the dataset
        tokens = document.split()
        # for every unique word in the document
        for word, freq in Counter(tokens).items():
            col_index = vocab.get(word, -1)
            if col_index == -1 or len(word) < 2:
                continue
            if word not in idf_cache:
                idf_cache[word] = 1 + math.log((1 + n_documents) / float(1 + IDF(dataset, word)))
            rows.append(idx)
            columns.append(col_index)
            values.append((freq / len(tokens)) * idf_cache[word])
    return normalize(csr_matrix((values, (rows, columns)), shape=(n_documents, len(vocab))), norm='l2')
# Custom TF-IDF matrix for the pickled corpus, printed in sparse form.
tf_idf_vect = transform(corpus1, vocab1)
print(tf_idf_vect)

  (0, 53)	0.4285381075435814
  (0, 688)	0.4285381075435814
  (0, 720)	0.4285381075435814
  (0, 1545)	0.22805947499067006
  (0, 1651)	0.1625302544476425
  (0, 1653)	0.3509903637504196
  (0, 2287)	0.342724587634745
  (0, 2878)	0.3605325019883844
  (1, 149)	0.340726435334222
  (1, 374)	0.24279750597070437
  (1, 966)	0.3812760870032998
  (1, 1132)	0.3238968093088598
  (1, 1511)	0.3513923888677652
  (1, 1676)	0.40499611264701524
  (1, 1712)	0.16034291156247021
  (1, 2446)	0.340726435334222
  (1, 2764)	0.3812760870032998
  (2, 20)	0.1635999234475438
  (2, 64)	0.22135060469585202
  (2, 89)	0.27150607787983744
  (2, 124)	0.28839706934051784
  (2, 145)	0.28839706934051784
  (2, 201)	0.27150607787983744
  (2, 236)	0.22135060469585202
  (2, 320)	0.21375539663152918
  :	:
  (741, 268)	0.3817659198590297
  (741, 429)	0.4400030233500667
  (741, 1096)	0.31194092291638165
  (741, 1354)	0.4400030233500667
  (741, 1422)	0.4142326945215713
  (741, 2471)	0.331653579641547
  (741, 2785)	0.29691665468648404

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# Reference result: sklearn's TfidfVectorizer fitted on the same corpus.
tf_idf_vectorizer = TfidfVectorizer()
method_tfidf = tf_idf_vectorizer.fit_transform(corpus1)
# A bare expression in the middle of a cell is never displayed, so print the shape.
print(method_tfidf.get_shape())

print(method_tfidf)

  (0, 1545)	0.3056602689480387
  (0, 2878)	0.3578114562231773
  (0, 720)	0.41239438707788106
  (0, 688)	0.41239438707788106
  (0, 1651)	0.1619231790584802
  (0, 53)	0.41239438707788106
  (0, 1653)	0.3578114562231773
  (0, 2287)	0.33776799164675547
  (1, 2764)	0.3766212179754885
  (1, 1132)	0.3276587847617171
  (1, 1676)	0.4000516539584004
  (1, 149)	0.3365666231014132
  (1, 374)	0.23983327446209926
  (1, 966)	0.3766212179754885
  (1, 1511)	0.34710235966502895
  (1, 2446)	0.3599970590843251
  (1, 1712)	0.16744040526541595
  (2, 853)	0.28282863381171675
  (2, 1704)	0.2453945264455564
  (2, 64)	0.21707671739759152
  (2, 1482)	0.22619301571578695
  (2, 1873)	0.17804028847989228
  (2, 1889)	0.23164830224699004
  (2, 20)	0.162536633636425
  (2, 2085)	0.2379459683681101
  :	:
  (741, 429)	0.4312119115047646
  (741, 1354)	0.4312119115047646
  (741, 1422)	0.4059564651452501
  (741, 268)	0.37413836567848674
  (741, 1096)	0.3375265480660245
  (741, 2785)	0.29098436017585044
  (741, 2471)	0.353180

### Limiting the number of features generated to 50 (considering the 50 words with the top IDF scores)

In [None]:
#Referred-->https://stackoverflow.com/questions/34232190/scikit-learn-tfidfvectorizer-how-to-get-top-n-terms-with-highest-tf-idf-score
def IDF(corpus, unique_words):
    """Compute the smoothed inverse document frequency of each word.

    idf(word) = 1 + ln((1 + n) / (1 + df(word))), where ``n`` is the number
    of documents and ``df`` is the number of documents that contain the word
    as a whole space-separated token.

    Args:
        corpus: sequence of document strings.
        unique_words: iterable of words to score.

    Returns:
        dict mapping every word in ``unique_words`` to its IDF, in iteration
        order. Every word gets an entry, even when ``corpus`` is empty.
    """
    n = len(corpus)
    # Tokenise each document once; set membership keeps the per-word scan cheap.
    tokenized_docs = [set(row.split(" ")) for row in corpus]
    idf_dict = {}
    for word in unique_words:
        count = sum(1 for tokens in tokenized_docs if word in tokens)
        idf_dict[word] = (math.log((1 + n)/(count + 1))) + 1
    return idf_dict


# Implementation of the fit() method
def fit(corpus):
    """Build a vocabulary ordered by descending IDF.

    Words are the space-separated tokens of at least two characters. They
    are ranked by IDF, highest first; words with equal IDF keep their
    first-occurrence order. The IDF table sorted by (-idf, word) is printed
    as a side effect.

    Args:
        corpus: sequence of document strings.

    Returns:
        tuple ``(vocabulary, idf_values)``: ``vocabulary`` maps word -> rank
        index (0 = highest IDF) and ``idf_values`` maps word -> IDF in the
        same order.
    """
    # dict.fromkeys keeps first-occurrence order and de-duplicates in O(1) per
    # word, unlike a `word not in list` membership test.
    unique_words = list(dict.fromkeys(
        word
        for row in corpus
        for word in row.split(" ")
        if len(word) >= 2
    ))

    # Score the corpus that was passed in (not a notebook global), and only once.
    idf_by_word = IDF(corpus, unique_words)

    # Sorting and printing the IDF values: highest IDF first, ties alphabetical
    sort_idf_lex = {word: idf for word, idf in sorted(idf_by_word.items(), key = lambda kv:(-kv[1], kv[0]))}
    print(sort_idf_lex)

    # sorting the IDF items based on IDF values in descending order (stable sort)
    top_idf = dict(sorted(idf_by_word.items(), key = operator.itemgetter(1), reverse = True))

    # Generate a dictionary that numbers every word consecutively by rank
    vocabulary = {word : index for index, word in enumerate(top_idf)}
    return vocabulary, top_idf
# Vocabulary: word -> index ordered by descending IDF; idf_vocab: word -> IDF value.
Vocabulary, idf_vocab = fit(corpus1)




In [None]:
# IDF value of every vocabulary word, as returned by fit().
print(idf_vocab)



In [None]:
# Keep only the first 50 vocabulary entries, i.e. the words with the highest
# IDF, because Vocabulary is already ordered by descending IDF.
top_words = list(Vocabulary)[:50]
top50words = {word: Vocabulary[word] for word in top_words}

# Pair each of those words with the IDF score found at the same position.
top_idf_scores = list(idf_vocab.values())[:50]
vocab_ = dict(zip(top_words, top_idf_scores))
print(vocab_)

{'aimless': 6.922918004572872, 'distressed': 6.922918004572872, 'drifting': 6.922918004572872, 'nearly': 6.922918004572872, 'attempting': 6.922918004572872, 'artiness': 6.922918004572872, 'existent': 6.922918004572872, 'gerardo': 6.922918004572872, 'emptiness': 6.922918004572872, 'effort': 6.922918004572872, 'messages': 6.922918004572872, 'buffet': 6.922918004572872, 'science': 6.922918004572872, 'teacher': 6.922918004572872, 'baby': 6.922918004572872, 'owls': 6.922918004572872, 'florida': 6.922918004572872, 'muppets': 6.922918004572872, 'person': 6.922918004572872, 'overdue': 6.922918004572872, 'screenplay': 6.922918004572872, 'post': 6.922918004572872, 'practically': 6.922918004572872, 'structure': 6.922918004572872, 'tightly': 6.922918004572872, 'constructed': 6.922918004572872, 'vitally': 6.922918004572872, 'occurs': 6.922918004572872, 'content': 6.922918004572872, 'fill': 6.922918004572872, 'dozen': 6.922918004572872, 'highest': 6.922918004572872, 'superlative': 6.922918004572872,

In [None]:
#Referred-->https://github.com/vennela28/AppliedAI/blob/master/Assignments/3_CountVectorizer/Assignment_3_Instructions

# Implementation of transform() method

def transform(dataset, vocabulary, idf_values):
    """Encode ``dataset`` as an L2-normalised sparse TF-IDF matrix.

    For each document, every word present in ``vocabulary`` gets the value
    ``(occurrences / tokens in the document) * idf_values[word]``. Each row
    is then L2-normalised. The normalised matrix is also printed.

    Args:
        dataset: sequence of document strings.
        vocabulary: dict mapping word -> column index.
        idf_values: dict mapping word -> IDF value; needs an entry for every
            word in ``vocabulary``.

    Returns:
        Sparse matrix of shape ``(len(dataset), len(vocabulary))``.
    """
    # Collect the non-zero entries first and build the matrix in one go:
    # assigning into a csr_matrix one element at a time is slow and raises
    # SparseEfficiencyWarning.
    rows, cols, values = [], [], []
    for row, document in enumerate(dataset):
        tokens = document.split()
        for word, count in Counter(tokens).items():
            if word in vocabulary:
                rows.append(row)
                cols.append(vocabulary[word])
                values.append((count / len(tokens)) * (idf_values[word]))
    sparse_matrix = csr_matrix(
        (values, (rows, cols)),
        shape = (len(dataset), len(vocabulary)),
        dtype = np.float64,
    )
    # Keep column indices ordered within each row so the printed form is stable.
    sparse_matrix.sort_indices()
    output = normalize(sparse_matrix, norm = 'l2', axis = 1, copy = True, return_norm = False)
    print("Normalized Spare Matrix\n", output)
    return output
# Apply the 50-feature limit: `top50words` holds the 50 highest-IDF words with
# column indices 0..49, so the result has one column per selected word.
# Passing the full `Vocabulary` here would keep every word.
final_output = transform(corpus1, top50words, idf_vocab)

# shape of tfidf vectorizer output after applying transform method.
print("Shape of TF-idf vactorizer",final_output.shape) 

  self._set_intXint(row, col, x.flat[0])


Normalized Spare Matrix
   (0, 0)	0.4123943870778812
  (0, 1)	0.4123943870778812
  (0, 2)	0.4123943870778812
  (0, 2511)	0.35781145622317734
  (0, 2512)	0.35781145622317734
  (0, 2678)	0.3377679916467555
  (0, 2800)	0.30566026894803877
  (0, 2885)	0.16192317905848022
  (1, 3)	0.40005165395840037
  (1, 1855)	0.3766212179754885
  (1, 1856)	0.3766212179754885
  (1, 2314)	0.35999705908432506
  (1, 2513)	0.3471023596650289
  (1, 2615)	0.33656662310141316
  (1, 2679)	0.3276587847617171
  (1, 2872)	0.23983327446209923
  (1, 2883)	0.16744040526541595
  (2, 4)	0.28282863381171675
  (2, 5)	0.28282863381171675
  (2, 6)	0.28282863381171675
  (2, 1857)	0.26626377741607504
  (2, 1858)	0.26626377741607504
  (2, 2514)	0.24539452644555643
  (2, 2616)	0.23794596836811016
  (2, 2617)	0.23794596836811016
  :	:
  (741, 1850)	0.43121191150476473
  (741, 1851)	0.43121191150476473
  (741, 2313)	0.40595646514525013
  (741, 2586)	0.3741383656784868
  (741, 2688)	0.35318031934226296
  (741, 2761)	0.3375265480660

In [None]:
# Sparse matrix representation of the first document in corpus1

print(final_output[0])

  (0, 0)	0.4123943870778812
  (0, 1)	0.4123943870778812
  (0, 2)	0.4123943870778812
  (0, 2511)	0.35781145622317734
  (0, 2512)	0.35781145622317734
  (0, 2678)	0.3377679916467555
  (0, 2800)	0.30566026894803877
  (0, 2885)	0.16192317905848022


In [None]:
# Dense matrix representation of the first document in corpus1
print(final_output[0].toarray())


[[0.41239439 0.41239439 0.41239439 ... 0.         0.         0.16192318]]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Reference run with sklearn, limited to 50 features.
# NOTE(review): per the scikit-learn docs, max_features keeps the terms with
# the highest term frequency across the corpus, not the highest IDF, so these
# columns are not expected to match the top-IDF words chosen above. Confirm
# before comparing values.
tf_idf_vectorizer = TfidfVectorizer(norm = 'l2', max_features = 50)
method_tfidf = tf_idf_vectorizer.fit_transform(corpus1)

print(method_tfidf.get_shape())
print(method_tfidf[:50])

(746, 50)
  (0, 25)	1.0
  (1, 8)	0.8199426324888012
  (1, 30)	0.572445699981522
  (2, 32)	0.5699230219852334
  (2, 0)	0.5202944244602254
  (2, 10)	0.5273651853196742
  (2, 25)	0.35548195762874873
  (3, 20)	1.0
  (4, 4)	0.859534997767905
  (4, 25)	0.5110768901174535
  (5, 25)	1.0
  (7, 17)	0.9412322291499969
  (7, 25)	0.33776010837475723
  (11, 21)	0.5533799348242362
  (11, 23)	0.5366405664621315
  (11, 4)	0.5475363087512142
  (11, 25)	0.32556342048857684
  (12, 4)	1.0
  (14, 25)	1.0
  (15, 37)	0.7118629038754104
  (15, 0)	0.7023184506234106
  (16, 11)	0.6478335937139895
  (16, 14)	0.4007783298922675
  (16, 23)	0.6478335937139895
  (17, 14)	0.4986460786244059
  :	:
  (38, 31)	0.36065823271591674
  (38, 21)	0.46637408670079394
  (38, 17)	0.38230065758919324
  (38, 25)	0.27437630701549015
  (39, 41)	0.8498083336390599
  (39, 30)	0.5270918288852563
  (40, 3)	0.5520268346392149
  (40, 43)	0.7192311074525178
  (40, 14)	0.42186844858419265
  (41, 3)	0.6737177870807189
  (41, 0)	0.738988730205