In [1]:
from itertools import zip_longest
from matplotlib.pyplot import figure
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from scipy.sparse import csr_array
from scipy.sparse import find
from pickleshare import PickleShareDB
import math

#load the headline corpus from the input directory
df = pd.read_csv('../input/abcnews-date-text.csv')

#make sure the NLTK stopword corpus is available locally, then keep the
#English entries in a set
nltk.download('stopwords')
stopwords_set = {*stopwords.words('english')}



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#tokenize and sanitize

#sampling parameters: fraction of the corpus to keep, and a fixed seed so that
#a Restart & Run All draws the same subset every time
SAMPLE_FRACTION = 0.0016
SAMPLE_SEED = 42

#tokenize documents into individual words
df['tokenized'] = df.headline_text.str.split(' ')

#remove short documents from corpus (keep headlines with at least two tokens)
df['length'] = df.tokenized.map(len)
df = df.loc[df.length > 1]

#use a reproducible random subset of the corpus; reset_index() keeps the
#original row label in an 'index' column and renumbers the documents from 0
#NOTE: df is overwritten here, so re-running this cell without re-running the
#load cell samples the already-sampled frame again
df = df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED).reset_index()

#flatten all words: one row per token, the row label is the document number
ex = df.explode('tokenized')

#remove shorter words (fewer than three characters)
ex = ex.loc[ex.tokenized.str.len() > 2]

#remove stopwords
ex = ex.loc[~ex.tokenized.isin(stopwords_set)]

In [3]:
#create dictionary of words

#fixed seed so the word -> code assignment is identical on every run
DICTIONARY_SEED = 42

#unique words, shuffled for sparse matrix visual
shuffled_words = (
    ex.tokenized
    .drop_duplicates()
    .sample(frac=1, random_state=DICTIONARY_SEED)
    .tolist()
)

#store position:word dictionary for reverse encoding
#NOTE(review): keys here start at 0 while the codes in `dictionary` start at 1,
#so a code has to be reduced by one before it is looked up here -- confirm
#that whatever decodes with this mapping does so
dictionary_lookup = dict(enumerate(shuffled_words))

#store word:code dictionary for encoding
#codes start at 1 to prevent clash with zero fill
dictionary = {word: code for code, word in enumerate(shuffled_words, start=1)}


In [4]:
#reduced grouping from group by dataframe to group by series and aggregated to tuple
#improved from 30s to 20s with whole dataset
#NOTE(review): no grouping happens in this cell -- this timing note may belong
#to a different cell; confirm before relying on it

#use dictionary to encode each word to integer representation
#astype returns a new object, so the cast is applied before the frame is stored
encode = ex.tokenized.map(dictionary.get).astype('int').to_frame()

#same for the document labels in the index: assign the cast result back
encode.index = encode.index.astype('int')

#display the encoded tokens (row label = document number)
encode.tokenized



0       3825
0       1045
0        812
0       4516
0       2495
        ... 
1988    2265
1988    2678
1988    1485
1988    4459
1988    2966
Name: tokenized, Length: 10580, dtype: int64

In [18]:

#build adjacent-word pairs (bigrams) from the encoded tokens and attach the raw
#counts used for the co-occurrence probabilities

#one row per token: document number (doc1_1) and integer word code (word1)
bidirectional_bigram = encode.reset_index().rename(columns={'index':'doc1_1','tokenized':'word1'}).copy()

#pair every token with the token in the next row: dropping row 0 and
#renumbering shifts the columns up by one, so the last row gets no partner (NaN)
bidirectional_bigram[['doc1_2','word2']] = bidirectional_bigram.drop(0).reset_index()[['doc1_1','word1']].astype(int)

#discard the partner-less last row so every column can be cast back to int
bidirectional_bigram = bidirectional_bigram.drop(bidirectional_bigram.tail(1).index).astype(int)

#shift that produces bigrams results in invalid bigrams
#number of invalid bigrams is equivalent to number of documents
#remove bigrams not from the same document
#.copy() makes the filtered frame independent so the column assignments below
#do not write into a slice of the unfiltered frame
bidirectional_bigram = bidirectional_bigram[bidirectional_bigram.doc1_1 == bidirectional_bigram.doc1_2].copy()

#order-independent key for the pair: (a, b) and (b, a) give the same value
#NOTE(review): a sum of two cubes is not unique to one pair
#(1**3 + 12**3 == 9**3 + 10**3 == 1729), so two unrelated pairs can share a
#key and have their counts merged
bidirectional_bigram['bidirectional_hash'] = bidirectional_bigram.word1.pow(3) + bidirectional_bigram.word2.pow(3)

#key -> number of bigram rows carrying that key
bidirectional_hash_occurrences = bidirectional_bigram.bidirectional_hash.value_counts().to_dict()

#word code -> number of times the word appears in the encoded tokens
word_occurrences = encode.tokenized.value_counts().to_dict()

bidirectional_bigram['bidirectional_hash_occurrences'] = bidirectional_bigram.bidirectional_hash.map(bidirectional_hash_occurrences.get)

#total number of bigrams (same value on every row)
bidirectional_bigram['num_bidirectional_bigrams'] = sum(bidirectional_hash_occurrences.values())

#look the counts up under the name the dict is actually defined with
bidirectional_bigram['word1_occurrences'] = bidirectional_bigram.word1.map(word_occurrences.get)

bidirectional_bigram['word2_occurrences'] = bidirectional_bigram.word2.map(word_occurrences.get)

#total number of tokens (same value on every row)
bidirectional_bigram['num_words'] = sum(word_occurrences.values())

#show the most frequent pairs first
bidirectional_bigram.sort_values('bidirectional_hash_occurrences', ascending=False).head(23)





Unnamed: 0,doc1_1,word1,doc1_2,word2,bidirectional_hash,bidirectional_hash_occurences,num_bidirectional_bigrams,word1_occurences,word2_occurences,num_words
1453,273,2129,273,1981,17424151830,7,8591,9,10,10580
4477,856,2129,856,1981,17424151830,7,8591,9,10,10580
4058,775,1299,775,652,2469101707,7,8591,19,12,10580
9375,1759,2129,1759,1981,17424151830,7,8591,9,10,10580
5397,1023,1299,1023,652,2469101707,7,8591,19,12,10580
9724,1824,2129,1824,1981,17424151830,7,8591,9,10,10580
2455,467,152,467,4806,111010750424,7,8591,17,14,10580
6006,1136,152,1136,4806,111010750424,7,8591,17,14,10580
2039,388,1299,388,652,2469101707,7,8591,19,12,10580
5552,1052,2129,1052,1981,17424151830,7,8591,9,10,10580



p(x,y): probability that terms x and y co-occur (as adjacent words, in either order) over the corpus = (number of x,y co-occurrences) / (number of all word co-occurrences)

p(x): probability of term x occurring individually over the corpus = (number of term x occurrences) / (number of all word occurrences)

p(y): probability of term y occurring individually over the corpus = (number of term y occurrences) / (number of all word occurrences)