Co-Occurrence matrix on TFIDF text vector

In [0]:
import numpy as np
from tqdm import tqdm
from collections import Counter 
from math import log

In [0]:
def get_features(feature_names, tf_idf, top_features):
    features_dict = dict.fromkeys(Counter(feature_names),0)
    for key, index in zip(feature_names, range(len(feature_names))):
        features_dict[key] = max(tf_idf[index])
    if top_features is not True:
        feature_names = []
        for i in range(top_features):
            temp = max(features_dict, key=features_dict.get)
            del features_dict[temp]
            feature_names.append(temp)
    return  feature_names

In [0]:
def tfidf_vector(doc_list, top_features = False):
    feature_names = list(set(' '.join(doc_list).split()))  # getting all the unique words in all documents
    no_of_docs = len(doc_list)
    # creating the numpy array where 
    # rows=No. of unique words in all document
    # columns = No. of documents
    tf_idf = np.full(((len(feature_names)), (no_of_docs)),0, dtype=object)  # creating the numpy array where rows=No. of unique words in all document
    
    word_count = [len(i.split()) for i in doc_list]        # counting number of words in each document
    word_counter = [Counter(i.split()) for i in doc_list]  # getting no. of times a word appeared in each document
    # calculating tfidf value for each word corresponding to each document 
    for name, word_index in zip(feature_names, range(len(feature_names))):         
        term_freq = 0
        for document in doc_list:                           # getting, in how may documents a word is present
            if name in document.split():
                term_freq+=1
        idf_value = log(no_of_docs)/(log(term_freq)+1)      # calculating idf value for the word
        for doc_no in range(len(doc_list)):
            tf_idf[word_index][doc_no] = ((word_counter[doc_no][name])/word_count[doc_no])*idf_value  
    if top_features is not False:
        return tf_idf, get_features(feature_names, tf_idf, top_features)

    return tf_idf
            

In [30]:
a = ["abc def ijk pqr", "pqr klm opq", "lmn pqr xyz abc def pqr abc", 'go goa gon', 'aaja, fdads']
import pandas as pd
tfidf, feature_names = tfidf_vector(a, top_features=3) 
pd.DataFrame(tfidf_vector(a))

Unnamed: 0,0,1,2,3,4
0,0.0,0.536479,0.0,0.0,0.0
1,0.402359,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.536479,0.0
3,0.23764,0.0,0.271589,0.0,0.0
4,0.0,0.536479,0.0,0.0,0.0
5,0.0,0.0,0.22992,0.0,0.0
6,0.0,0.0,0.22992,0.0,0.0
7,0.191726,0.255635,0.219116,0.0,0.0
8,0.0,0.0,0.0,0.536479,0.0
9,0.0,0.0,0.0,0.0,0.804719


In [31]:
word=[]
for i in tqdm(a):
    temp = []
    for s in i.split():
        temp.append(s)
    word.append(temp)

100%|██████████| 5/5 [00:00<00:00, 8651.62it/s]


In [0]:
def getallindex(list, string):
    '''returns all the index if word is in the list'''
    return filter(lambda a: list[a]==string, range(0,len(list)))
     

In [33]:
co_mat = np.full((4,4),-1, dtype=object)
for i in range(0,4):
    if i==0:
        co_mat[i,i]=0
        continue
    co_mat[0,i]=feature_names[(i-1)]
    co_mat[i,0]=feature_names[(i-1)]
co_mat

array([[0, 'aaja,', 'fdads', 'opq'],
       ['aaja,', -1, -1, -1],
       ['fdads', -1, -1, -1],
       ['opq', -1, -1, -1]], dtype=object)

In [34]:
l=1

for feature in tqdm(feature_names):
    temp=[]
    # on every occurrence of word getting its previous 4 words and next 4 words
    for essay in word:                   
        for i in getallindex(essay,feature):
            backward=''
            forward = ''
            for a in range(0,2):
                if (i-a)>=0:
                    backward+=essay[i-a]+' '
                if (i+a)<=(len(essay)-1):
                    forward+=essay[i+a]+' '
            if backward!='':
                temp.append(backward)
            if forward!='':
                temp.append(forward)
    c=1
    # calulating co-occerrence of word for all the windows
    for x in feature_names:
        count=0
        if (l==c):
            co_mat[l,c]=0
        elif (co_mat[l,c]==-1):
            for t in temp:
                if (x in t):
                    count+=1
            co_mat[l,c]=count
            co_mat[c,l]=count
        c+=1
    l+=1
        

100%|██████████| 3/3 [00:00<00:00, 10932.16it/s]


In [35]:
co_mat

array([[0, 'aaja,', 'fdads', 'opq'],
       ['aaja,', 0, 1, 0],
       ['fdads', 1, 0, 0],
       ['opq', 0, 0, 0]], dtype=object)

The above is on context window size 2 and all diagonal element are zero.
i referred to the below  link
https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec/