In [1]:
from itertools import zip_longest
from matplotlib.pyplot import figure
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from scipy.sparse import csr_array
from scipy.sparse import find
from pickleshare import PickleShareDB
import math
import networkx as nx
from pyvis.network import Network

#load the raw headline corpus from the input directory
df = pd.read_csv(filepath_or_buffer='../input/abcnews-date-text.csv')

#fetch the NLTK stopword corpus, then keep the English list as a set for membership tests
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stopwords_set = set(english_stopwords)



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#tokenize and sanitize

#fixed seed so the random subset (and every result derived from it) is the same on each run
SAMPLE_SEED = 42
#fraction of the remaining documents kept for the analysis
SAMPLE_FRACTION = 0.01

#tokenize documents into individual words
df['tokenized'] = df.headline_text.str.split(' ')

#remove short documents from corpus
df['length'] = df.tokenized.map(len)
df = df.loc[df.length > 1]

#use random subset of corpus
#reset_index() gives the sampled documents fresh 0..n-1 labels and keeps the old label in an 'index' column
#NOTE: this cell rebinds df, so re-running it without re-running the load cell samples the sample again
df = df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED).reset_index()

#flatten all words into single series
#explode repeats the document label on every token row, so the index identifies the source document
ex = df.explode('tokenized')

#remove shorter words
ex = ex.loc[ex.tokenized.str.len() > 2]

#remove stopwords
ex = ex.loc[~ex.tokenized.isin(stopwords_set)]

In [3]:
#create the word <-> integer code vocabularies

#unique words in random order (shuffled for sparse matrix visual)
shuffled_words = ex.tokenized.drop_duplicates().sample(frac=1).tolist()

#code:word dictionary for reverse encoding, codes start at 0
dictionary_lookup = dict(enumerate(shuffled_words))

#word:code dictionary for encoding, codes start at 1 to prevent clash with zero fill
dictionary = {word: code for code, word in enumerate(shuffled_words, start=1)}


In [4]:
#reduced grouping from group by dataframe to group by series and aggregated to tuple
#improved from 30s to 20s with whole dataset

#use dictionary to encode each word to integer representation
#astype returns a new object, so the casts are assigned back instead of being called and discarded
encode = ex.tokenized.map(dictionary.get).astype('int').to_frame()
encode.index = encode.index.astype('int')

#display the encoded tokens: the index is the document label, the value is the word code
encode.tokenized



0         6535
0         5544
0         6278
0         7589
0         1635
         ...  
12427    12916
12428    11589
12428     2993
12428    11240
12428     2013
Name: tokenized, Length: 65850, dtype: int64

In [5]:

#one row per token: doc1_1 is the document label, word1 the word code
bidirectional_bigram = encode.reset_index().rename(columns={'index':'doc1_1','tokenized':'word1'}).copy()

#pair every token with the token on the next row
bidirectional_bigram['doc1_2'] = bidirectional_bigram.doc1_1.shift(-1)
bidirectional_bigram['word2'] = bidirectional_bigram.word1.shift(-1)

#the last row has no successor; dropping it also lets the shifted columns go back to int
bidirectional_bigram = bidirectional_bigram.iloc[:-1].astype(int)

#shift that produces bigrams results in invalid bigrams
#number of invalid bigrams is equivalent to number of documents
#remove bigrams not from the same document
#copy so that the column assignments below write to an independent frame, not a view of the unfiltered one
bidirectional_bigram = bidirectional_bigram[bidirectional_bigram.doc1_1 == bidirectional_bigram.doc1_2].copy()

#order-independent key for the unordered pair {word1, word2}
#smaller_code * key_base + larger_code is unique per pair because every code is below key_base
#a sum of cubes is not unique (1**3 + 12**3 == 9**3 + 10**3 == 1729) and would merge the counts of unrelated pairs
key_base = int(bidirectional_bigram[['word1','word2']].to_numpy().max(initial=0)) + 1
smaller_code = np.minimum(bidirectional_bigram.word1, bidirectional_bigram.word2)
larger_code = np.maximum(bidirectional_bigram.word1, bidirectional_bigram.word2)
bidirectional_bigram['bidirectional_hash'] = smaller_code * key_base + larger_code

#occurrence counts per unordered pair and per word
bidirectional_hash_occurrences = bidirectional_bigram.bidirectional_hash.value_counts().to_dict()

word_occurrences = encode.tokenized.value_counts().to_dict()

#totals used as denominators for the probabilities
num_bidirectional_bigrams = sum(bidirectional_hash_occurrences.values())

num_words = sum(word_occurrences.values())

#convert to symmetric coordinates
#append a copy with word1/word2 swapped so every pair is present as (a, b) and as (b, a)
bidirectional_bigram = pd.concat([bidirectional_bigram, bidirectional_bigram.rename(columns={'word1':'word2','word2':'word1'})],ignore_index=True)

#attach the counts to every row
bidirectional_bigram['bidirectional_hash_occurrences'] = bidirectional_bigram.bidirectional_hash.map(bidirectional_hash_occurrences.get)

bidirectional_bigram['word1_occurrences'] = bidirectional_bigram.word1.map(word_occurrences.get)

bidirectional_bigram['word2_occurrences'] = bidirectional_bigram.word2.map(word_occurrences.get)



p(x,y): the probability that terms x and y co-occur, estimated over the corpus as (number of co-occurrences of x and y) / (number of all word co-occurrences)

p(x): the probability that term x occurs, estimated over the corpus as (number of occurrences of x) / (number of all word occurrences)

p(y): the probability that term y occurs, estimated over the corpus as (number of occurrences of y) / (number of all word occurrences)

In [6]:
#joint probability: pair count over the total number of bigrams
pair_counts = bidirectional_bigram['bidirectional_hash_occurrences']
bidirectional_bigram['p_w1_w2'] = pair_counts.div(num_bidirectional_bigrams)

#marginal probabilities: word count over the total number of words
bidirectional_bigram['p_w1'] = bidirectional_bigram['word1_occurrences'].div(num_words)
bidirectional_bigram['p_w2'] = bidirectional_bigram['word2_occurrences'].div(num_words)



Pointwise mutual information

$$\mathrm{pmi}(x;y) = \log_2\frac{p(x,y)}{p(x)\,p(y)}$$

In [7]:
#pmi(x;y) = log2(p(x,y) / (p(x) * p(y)))
bidirectional_bigram['pmi'] = np.log2(bidirectional_bigram.p_w1_w2/(bidirectional_bigram.p_w1*bidirectional_bigram.p_w2))

#standard deviation of the raw pmi values, reused below as the padding of the min-max range
standard_deviation = bidirectional_bigram.pmi.std()

#standardize to zero mean and unit variance
bidirectional_bigram['pmi'] = (bidirectional_bigram.pmi - bidirectional_bigram.pmi.mean())/standard_deviation

#offsetting by one standard deviation to accommodate sparse zerofill and onefill on diagonal.
#min-max scale with the range widened on both sides, so every stored value lies strictly between 0 and 1
#NOTE(review): the padding is the std of the raw pmi, not of the standardized values (which is 1) - confirm this is intended
pmi_floor = bidirectional_bigram.pmi.min() - standard_deviation
pmi_ceiling = bidirectional_bigram.pmi.max() + standard_deviation
bidirectional_bigram['pmi'] = (bidirectional_bigram.pmi - pmi_floor)/(pmi_ceiling - pmi_floor)

#word codes start at 1, matrix coordinates start at 0
#NOTE: this mutates word1/word2 in place, so re-running the cell shifts the codes again
bidirectional_bigram[['word1','word2']] = bidirectional_bigram[['word1','word2']] - 1

#keep one row per (word1, word2): the frame holds one row per bigram occurrence and
#csr_array sums duplicate coordinates, which would store count * pmi instead of pmi
matrix_entries = bidirectional_bigram.drop_duplicates(subset=['word1','word2'])

#rows of the array: word1 coordinate, word2 coordinate, pmi value
pmi_matrix_coordinates = matrix_entries[['word1','word2','pmi']].to_numpy().T

#both axes are indexed by word code, so the matrix is vocabulary x vocabulary
vocabulary_size = len(dictionary)

#to_numpy() on the mixed int/float frame yields floats, so the coordinates are cast back to int
sparse_pmi_matrix = csr_array(
    (
        pmi_matrix_coordinates[2],
        (pmi_matrix_coordinates[0].astype(int), pmi_matrix_coordinates[1].astype(int)),
    ),
    shape=(vocabulary_size, vocabulary_size),
    dtype=float,
)

