In [8]:
from itertools import zip_longest
from matplotlib.pyplot import figure
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from scipy.sparse import csr_array
from scipy.sparse import find
from scipy.sparse import spdiags
from pickleshare import PickleShareDB
import math
import networkx as nx
from pyvis.network import Network

#load the headline corpus from csv
df = pd.read_csv('../input/abcnews-date-text.csv')

#fetch the nltk stopword corpus (only reports "already up-to-date" when it is present)
nltk.download('stopwords')

#english stopwords as a set for fast membership tests
stopwords_set = {*stopwords.words('english')}



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
#tokenize and sanitize

#split every headline on single spaces into its list of words
df['tokenized'] = df['headline_text'].str.split(' ')

#drop documents that contain fewer than two tokens
df['length'] = df['tokenized'].map(len)
has_several_tokens = df['length'] > 1
df = df.loc[has_several_tokens]

#optional: for quicker experiments, replace df with a random sample here (e.g. frac=0.01) before renumbering

#renumber the surviving documents; the previous row labels are kept in an 'index' column
df = df.reset_index()

#one row per token, every row still labelled with its document number
ex = df.explode('tokenized')

#keep tokens longer than two characters that are not stopwords
is_long_enough = ex['tokenized'].str.len() > 2
is_stopword = ex['tokenized'].isin(stopwords_set)
ex = ex.loc[is_long_enough & ~is_stopword]

In [10]:
#create dictionary of words

#fixed seed so the shuffled word codes are the same after every kernel restart
SHUFFLE_SEED = 0

#unique words, shuffled for sparse matrix visual
shuffled_words = ex.tokenized.drop_duplicates().sample(frac=1, random_state=SHUFFLE_SEED)

#dataframe with (index/code):word
vocabulary = pd.Series(shuffled_words.tolist(), name='words').to_frame()

#store code:word dictionary for reverse encoding (codes start at 0)
dictionary_lookup = vocabulary['words'].to_dict()

#offset index to prevent clash with zero fill (codes start at 1)
vocabulary['encode'] = vocabulary.index + 1

#store word:code dictionary for encoding
dictionary = vocabulary.set_index('words')['encode'].to_dict()

#use dictionary to encode each word to integer representation
encode = ex.tokenized.map(dictionary.get).to_frame()

#pair every token with the token on the following row to produce bigrams
tokens = encode.reset_index().rename(columns={'index':'doc1_1','tokenized':'word1'})
following = tokens.shift(-1)
tokens['doc1_2'] = following['doc1_1']
tokens['word2'] = following['word1']

#the last row has no following token
bidirectional_bigram = tokens.iloc[:-1].astype('int64')

#pairing consecutive rows also joins the last word of one document to the first word of the next
#remove bigrams not from the same document
#copy so that the column assignments below write to an independent frame
same_document = bidirectional_bigram.doc1_1 == bidirectional_bigram.doc1_2
bidirectional_bigram = bidirectional_bigram.loc[same_document].copy()

#key that makes a bigram equivalent backwards or forwards, thus bidirectional
#(smaller code, larger code) packed into one integer is unique per unordered pair because
#every code is below key_base; a sum of cubes is not unique (1**3 + 12**3 == 9**3 + 10**3)
#and would merge the counts of unrelated bigrams
key_base = len(dictionary) + 1
smaller_code = np.minimum(bidirectional_bigram.word1, bidirectional_bigram.word2)
larger_code = np.maximum(bidirectional_bigram.word1, bidirectional_bigram.word2)
bidirectional_bigram['bidirectional_hash'] = smaller_code * key_base + larger_code

#occurrences of every unordered bigram and of every word
bidirectional_hash_occurrences = bidirectional_bigram.bidirectional_hash.value_counts().to_dict()

word_occurrences = encode.tokenized.value_counts().to_dict()

#totals used as denominators for the probabilities
num_bidirectional_bigrams = sum(bidirectional_hash_occurrences.values())

num_words = sum(word_occurrences.values())

#convert to symmetric coordinates: append every bigram again with word1 and word2 swapped
bidirectional_bigram = pd.concat([bidirectional_bigram, bidirectional_bigram.rename(columns={'word1':'word2','word2':'word1'})],ignore_index=True)

#attach the counts to every row
bidirectional_bigram['bidirectional_hash_occurrences'] = bidirectional_bigram.bidirectional_hash.map(bidirectional_hash_occurrences.get)

bidirectional_bigram['word1_occurrences'] = bidirectional_bigram.word1.map(word_occurrences.get)

bidirectional_bigram['word2_occurrences'] = bidirectional_bigram.word2.map(word_occurrences.get)



p(x,y): probability that terms x and y co-occur in the corpus = (number of co-occurrences of x and y) / (number of all word co-occurrences)

p(x): probability that term x occurs in the corpus = (number of occurrences of x) / (number of all word occurrences)

p(y): probability that term y occurs in the corpus = (number of occurrences of y) / (number of all word occurrences)

In [11]:
#joint probability: occurrences of the unordered bigram over the total number of bigrams
bidirectional_bigram['p_w1_w2'] = bidirectional_bigram['bidirectional_hash_occurrences'].div(num_bidirectional_bigrams)

#marginal probabilities: occurrences of each word over the total number of words
bidirectional_bigram['p_w1'] = bidirectional_bigram['word1_occurrences'].div(num_words)
bidirectional_bigram['p_w2'] = bidirectional_bigram['word2_occurrences'].div(num_words)



Pointwise mutual information

$$\mathrm{pmi}(x;y) = \log_2 \frac{p(x,y)}{p(x)\,p(y)}$$

In [12]:
#pointwise mutual information of every bigram row
bidirectional_bigram['pmi'] = np.log2(bidirectional_bigram.p_w1_w2/(bidirectional_bigram.p_w1*bidirectional_bigram.p_w2))

#standardise to zero mean and unit variance
standard_deviation = bidirectional_bigram.pmi.std()

bidirectional_bigram.pmi = (bidirectional_bigram.pmi - bidirectional_bigram.pmi.mean())/standard_deviation

#rescale into the open interval (0, 1), padding both ends by standard_deviation,
#to accommodate sparse zerofill and onefill on diagonal
#NOTE(review): standard_deviation is the spread of the raw pmi, while pmi is already
#standardised at this point (spread 1) - confirm the padding width is intended
lower_bound = bidirectional_bigram.pmi.min() - standard_deviation
upper_bound = bidirectional_bigram.pmi.max() + standard_deviation

bidirectional_bigram.pmi = (bidirectional_bigram.pmi - lower_bound)/(upper_bound - lower_bound)

#one (word, word) entry with value 1 for every word that occurs in a bigram
bidirectional_bigram_diagonal = bidirectional_bigram.word1.drop_duplicates().to_frame()

bidirectional_bigram_diagonal['word2'] = bidirectional_bigram_diagonal.word1

bidirectional_bigram_diagonal['pmi'] = 1

bidirectional_bigram = pd.concat([bidirectional_bigram, bidirectional_bigram_diagonal],ignore_index=True)

#word codes start at 1; matrix coordinates (and dictionary_lookup) start at 0
bidirectional_bigram[['word1','word2']] = bidirectional_bigram[['word1','word2']] - 1

#rows of this array: word1 coordinates, word2 coordinates, pmi values (all stored as float)
pmi_matrix_coordinates = bidirectional_bigram[['word1','word2','pmi']].to_numpy().T

#both axes of the matrix are indexed by word code, so it is square with one row/column per word
vocabulary_size = len(dictionary)

#NOTE(review): the frame holds one row per bigram occurrence and scipy sums duplicate
#(row, column) coordinates, so a cell accumulates the scaled pmi once per occurrence -
#confirm this frequency weighting is intended
sparse_pmi_matrix = csr_array(
    (
        pmi_matrix_coordinates[2],
        (
            #coordinates must be integers; the transposed array stores them as float
            pmi_matrix_coordinates[0].astype(np.int64),
            pmi_matrix_coordinates[1].astype(np.int64),
        ),
    ),
    shape=(vocabulary_size, vocabulary_size),
    dtype=float,
)





In [21]:
#number of top rows (by total pmi) to show, and of top neighbours listed for each of them
TOP_N = 20

#the TOP_N rows with the largest row totals, as a dense array
row_totals = np.asarray(sparse_pmi_matrix.sum(axis = 1)).ravel()
top_rows = (-row_totals).argsort()[:TOP_N]
sparse_pmi_matrix_compressed = sparse_pmi_matrix[top_rows].toarray()

#for each selected row, the column codes ordered by descending value
top_neighbours = (-sparse_pmi_matrix_compressed).argsort(axis = 1)[:TOP_N,:TOP_N].T

#decode the codes to words: one column per selected row, one row per rank
#Series.map per column replaces DataFrame.applymap, which is deprecated in recent pandas
pd.DataFrame(top_neighbours).apply(lambda codes: codes.map(dictionary_lookup.get))



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,investigate,zealand,charged,minister,face,considers,house,south,urged,man,rural,restrictions,mental,hits,open,toll,extended,hobart,car,basin
1,probe,york,jailed,govt,told,seeks,crews,day,nsw,woman,govt,supply,service,bounce,dollar,penalty,michael,western,plane,water
2,officer,year,dies,union,front,says,destroys,coronavirus,vic,murder,police,plan,minister,fight,south,suspicious,nrl,man,fatal,management
3,hunt,years,arrested,expert,faces,rejects,threat,cricket,qld,teen,country,use,services,track,share,custody,john,harbour,road,housing
4,search,laws,missing,opposition,high,backs,govt,live,fed,men,government,bans,qld,back,year,baby,david,airport,bus,new
5,seek,home,accused,trump,hears,city,danger,western,local,alleged,premier,drinking,system,urged,shares,inquest,james,siege,truck,development
6,say,south,killed,council,supreme,land,service,india,says,two,northern,recycled,indigenous,bounces,man,stabbing,ben,morning,highway,merger
7,arrest,cases,found,group,accused,wants,ban,central,defends,fatal,coronavirus,authority,workers,court,central,womans,andrew,news,victim,health
8,nsw,papua,pleads,labor,fronts,urged,factory,post,accused,child,western,price,care,work,market,sparks,matt,shooting,chopper,park
9,station,gets,faces,report,appears,coast,season,says,rejects,driver,election,pipeline,funding,fights,first,mans,chris,south,victims,sparks
