# Task-1
# 1. Build a TF-IDF vectorizer and compare its results with scikit-learn


In [1]:
from collections import Counter #CALCULATING FREQUENCY
from tqdm import tqdm
from scipy.sparse import csr_matrix #CREATING SPARSE MATRIX
import math
import operator
from sklearn.preprocessing import normalize
import numpy



# Function to compute the inverse document frequency (IDF)

In [2]:
#CALLED INSIDE FIT FUNCTION
def IDF(dataset, uniquewords):
    """Compute the smoothed inverse document frequency of each given word.

    A word contained in ``df`` of the ``N`` documents is scored as
    ``1 + ln((1 + N) / (1 + df))``.

    Parameters
    ----------
    dataset : sequence of str
        The corpus, one document per element. Each document is tokenised
        with ``split(" ")`` and a word counts at most once per document.
    uniquewords : iterable of str
        The words to score.

    Returns
    -------
    dict
        Maps every word of ``uniquewords`` (in iteration order) to its IDF.
        A word found in no document - which includes every word when the
        corpus is empty - is scored with ``df = 0``.
    """
    N = len(dataset)

    # One pass over the corpus: the set() makes a word count once per
    # document, so the counter holds document frequencies, not term counts.
    doc_freq = Counter()
    for row in dataset:
        doc_freq.update(set(row.split(" ")))

    # Counter yields 0 for unseen words, so every requested word gets a score.
    return {
        word: 1 + (math.log((1 + N) / (1 + doc_freq[word])))  # IDF formula
        for word in uniquewords
    }

# Fit Function

In [3]:
#CALLED FROM DRIVER CODE
#RETURN VOCABULARY AND IDF VALUE OF EACH UNIQUE WORD
def fit(dataset):
    """Learn the vocabulary and the IDF weight of every word in a corpus.

    Parameters
    ----------
    dataset : list of str
        The corpus, one document per element. Documents are tokenised with
        ``split(" ")``; tokens shorter than two characters are ignored.

    Returns
    -------
    (dict, dict) or None
        ``vocab`` maps each word to its column index (words in sorted order)
        and ``Idf_values`` is whatever ``IDF`` returns for those words.
        If ``dataset`` is not a list, a message is printed and ``None`` is
        returned.
    """
    # Reject anything that is not a list up front.
    if not isinstance(dataset, list):
        print("you need to pass list of sentance")
        return None

    # A set comprehension removes duplicates; sorting fixes the column order.
    tokens = {
        word
        for row in dataset
        for word in row.split(" ")
        if len(word) >= 2
    }
    uniquewords = sorted(tokens)
    vocab = {word: column for column, word in enumerate(uniquewords)}
    return vocab, IDF(dataset, uniquewords)


# Transform function

In [4]:
def transform(dataset, vocab, Idf_values):
    """Convert documents into an L2-normalised sparse TF-IDF matrix.

    Parameters
    ----------
    dataset : sequence of str
        Documents to vectorise, tokenised with ``split()``.
        NOTE: ``fit`` and ``IDF`` in this notebook tokenise with
        ``split(" ")``; the two agree only for single-space separated text.
    vocab : dict
        Maps a word to its column index. Words missing from it are skipped.
    Idf_values : dict
        Maps a word to its IDF weight; must contain every word of ``vocab``
        that occurs in ``dataset``.

    Returns
    -------
    sparse matrix
        Shape ``(len(dataset), len(vocab))``; entry ``(i, j)`` is
        ``tf * idf`` for word ``j`` in document ``i`` before each row is
        scaled to unit L2 norm.
    """
    rows = []
    columns = []
    values = []
    for idx, row in enumerate(dataset):
        # Tokenise once per document instead of once per distinct word.
        tokens = row.split()
        doc_length = len(tokens)
        for word, freq in Counter(tokens).items():
            col_index = vocab.get(word, -1)
            if col_index == -1:
                continue  # word is not part of the fitted vocabulary
            # Term frequency is relative to all tokens of the document,
            # including those outside the vocabulary.
            tf = freq / doc_length
            rows.append(idx)
            columns.append(col_index)
            values.append(Idf_values[word] * (tf))  # TF-IDF value
    tfidf = csr_matrix((values, (rows, columns)), shape=(len(dataset), len(vocab)))
    # Return the row-normalised matrix.
    return normalize(tfidf, norm='l2')

# Driver code for custom implementation

In [5]:
#DRIVER CODE FOR CUSTOM IMPLEMENTATION
# Small four-sentence corpus; the scikit-learn cell below reuses it.
dataset = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

# Learn the vocabulary and IDF weights, then vectorise the same corpus.
vocab, Idf_values = fit(dataset)
x1 = transform(dataset, vocab, Idf_values)

# TF-IDF row of the first document, then the vocabulary, then the IDF weights.
print(x1[0].toarray(), vocab.keys(), Idf_values.values(), sep="\n")

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
dict_keys(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'])
dict_values([1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0])


# scikit-learn implementation

In [6]:
#SKLEARN IMPLEMENTATION
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit scikit-learn's vectorizer on the same corpus and vectorise it.
vectorizer = TfidfVectorizer()
vectorizer.fit(dataset)
skl_output = vectorizer.transform(dataset)
print(skl_output[0].toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# .tolist() keeps the printed form a plain Python list of str.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print(vectorizer.idf_)

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


# We get the same output from both the custom and the scikit-learn implementations

# Task-2
# 2. Implement max features functionality

In [7]:
import pickle

# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only open 'cleaned_strings' if it comes from a trusted source.
with open('cleaned_strings', 'rb') as corpus_file:
    dataset = pickle.load(corpus_file)

# Report the number of documents in the loaded corpus.
print("Number of documents in dataset = ",len(dataset))

Number of documents in dataset =  746


# Libraries

In [8]:
from collections import Counter #CALCULATING FREQUENCY
from tqdm import tqdm
from scipy.sparse import csr_matrix #CREATING SPARSE MATRIX
import math
import operator
from sklearn.preprocessing import normalize
import numpy

# Idf calculation

In [9]:
#FUNCTION TO COMPUTE INVERSE DOCUMENT FREQUENCY
#CALLED INSIDE FIT FUNCTION
def IDF(dataset, uniquewords):
    """Compute the smoothed inverse document frequency of each given word.

    A word contained in ``df`` of the ``N`` documents is scored as
    ``1 + ln((1 + N) / (1 + df))``.

    Parameters
    ----------
    dataset : sequence of str
        The corpus, one document per element. Each document is tokenised
        with ``split(" ")`` and a word counts at most once per document.
    uniquewords : iterable of str
        The words to score.

    Returns
    -------
    dict
        Maps every word of ``uniquewords`` (in iteration order) to its IDF.
        A word found in no document - which includes every word when the
        corpus is empty - is scored with ``df = 0``.
    """
    N = len(dataset)

    # One pass over the corpus: the set() makes a word count once per
    # document, so the counter holds document frequencies, not term counts.
    doc_freq = Counter()
    for row in dataset:
        doc_freq.update(set(row.split(" ")))

    # Counter yields 0 for unseen words, so every requested word gets a score.
    return {
        word: 1 + (math.log((1 + N) / (1 + doc_freq[word])))  # IDF formula
        for word in uniquewords
    }

# Fit Function and transform function

In [10]:

#CALLED FROM DRIVER CODE
#RETURN VOCABULARY AND IDF VALUE OF EACH UNIQUE WORD
def fit(dataset, max_features=50):
    """Learn a vocabulary limited to the words with the highest IDF scores.

    Parameters
    ----------
    dataset : list of str
        The corpus, one document per element. Documents are tokenised with
        ``split(" ")``; tokens shorter than two characters are ignored.
    max_features : int or None, optional
        Maximum number of words to keep (default 50, the value that was
        previously hard-coded). ``None`` keeps every word.

    Returns
    -------
    (dict, dict) or None
        ``Idf_values`` maps each kept word to its IDF, ordered by descending
        IDF; ties keep the alphabetical order of the sorted word list.
        ``vocab`` maps the same words, in the same order, to column indices
        ``0 .. len(vocab) - 1``.
        If ``dataset`` is not a list, a message is printed and ``None`` is
        returned.

    Raises
    ------
    ValueError
        If ``max_features`` is negative.
    """
    if not isinstance(dataset, list):
        print("you need to pass list of sentance")
        return None
    # A negative limit would silently slice from the end of the ranking.
    if max_features is not None and max_features < 0:
        raise ValueError("max_features must be a non-negative integer or None")

    # A set comprehension removes duplicates; sorting makes tie order stable.
    uniquewords = sorted(
        {word for row in dataset for word in row.split(" ") if len(word) >= 2}
    )

    # Rank all words by IDF (highest first) and keep the leading slice;
    # slicing with None keeps the whole ranking.
    ranked = sorted(
        IDF(dataset, uniquewords).items(),
        key=operator.itemgetter(1),
        reverse=True,
    )
    Idf_values = dict(ranked[:max_features])

    # Column indices follow the ranking order of the kept words.
    vocab = {word: column for column, word in enumerate(Idf_values)}
    return vocab, Idf_values

        
def transform(dataset, vocab, Idf_values):
    """Convert documents into an L2-normalised sparse TF-IDF matrix.

    Parameters
    ----------
    dataset : sequence of str
        Documents to vectorise, tokenised with ``split()``.
        NOTE: ``fit`` and ``IDF`` in this notebook tokenise with
        ``split(" ")``; the two agree only for single-space separated text.
    vocab : dict
        Maps a word to its column index. Words missing from it are skipped.
    Idf_values : dict
        Maps a word to its IDF weight; must contain every word of ``vocab``
        that occurs in ``dataset``.

    Returns
    -------
    sparse matrix
        Shape ``(len(dataset), len(vocab))``; entry ``(i, j)`` is
        ``tf * idf`` for word ``j`` in document ``i`` before each row is
        scaled to unit L2 norm.
    """
    rows = []
    columns = []
    values = []
    for idx, row in enumerate(dataset):
        # Tokenise once per document instead of once per distinct word.
        tokens = row.split()
        doc_length = len(tokens)
        for word, freq in Counter(tokens).items():
            col_index = vocab.get(word, -1)
            if col_index == -1:
                continue  # word is not part of the fitted vocabulary
            rows.append(idx)
            columns.append(col_index)
            # Term frequency is relative to all tokens of the document,
            # including those outside the vocabulary.
            values.append(Idf_values[word] * (freq / doc_length))  # TF-IDF value
    tfidf = csr_matrix((values, (rows, columns)), shape=(len(dataset), len(vocab)))
    # Return the row-normalised matrix.
    return normalize(tfidf, norm='l2')


# DRIVER CODE FOR CUSTOM IMPLEMENTATION


In [11]:
# Fit on the loaded corpus, then vectorise it with the reduced vocabulary.
vocab, Idf_values = fit(dataset)
x1 = transform(dataset, vocab, Idf_values)

# Dense TF-IDF row of the first document, followed by its shape.
x2 = x1[0].toarray()
print(x2, x2.shape, sep="\n")

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]
(1, 50)


# Vocabulary

In [12]:
# Heading, then the selected vocabulary, then the IDF score of each selected word.
print("List of words in vocabulary", vocab.keys(), Idf_values.values(), sep="\n")


List of words in vocabulary
dict_keys(['aailiyah', 'abandoned', 'abroad', 'abstruse', 'academy', 'accents', 'accessible', 'acclaimed', 'accolades', 'accurate', 'accurately', 'achille', 'ackerman', 'actions', 'adams', 'add', 'added', 'admins', 'admiration', 'admitted', 'adrift', 'adventure', 'aesthetically', 'affected', 'affleck', 'afternoon', 'aged', 'ages', 'agree', 'agreed', 'aimless', 'aired', 'akasha', 'akin', 'alert', 'alike', 'allison', 'allow', 'allowing', 'alongside', 'amateurish', 'amaze', 'amazed', 'amazingly', 'amusing', 'amust', 'anatomist', 'angel', 'angela', 'angelina'])
dict_values([6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.9229180045728

# The dense matrix for the first document has shape (1, 50)
## The vocabulary of the 50 words with the highest IDF scores is also printed, in descending order of IDF