In [1]:
from collections import Counter
import itertools

import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg 
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from itertools import zip_longest

# Load the raw headlines; the code below reads the `headline_text` column.
df = pd.read_csv('../input/abcnews-date-text.csv')

# Make sure the NLTK stopword corpus is available, then keep the English list
# as a set for the membership test done when sanitising tokens.
nltk.download('stopwords')
stopwords_set = set(stopwords.words('english'))

# Tokenise each headline on single spaces, record its token count, and keep
# only headlines that have more than one token.
df = (
    df
    .assign(split=lambda frame: frame['headline_text'].str.split(' '))
    .assign(length=lambda frame: frame['split'].map(len))
    .loc[lambda frame: frame['length'] > 1]
)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Explode and sanitize: one row per token (rows keep the index label of the
# headline they came from), then drop tokens of length <= 2 and stopwords.
ex = (
    df.explode('split')
    .loc[lambda frame: frame['split'].str.len() > 2]
    .loc[lambda frame: ~frame['split'].isin(stopwords_set)]
)

In [3]:
# Create the dictionary of words: every distinct token, in order of first
# appearance, mapped to a 1-based integer id.
dictionary = {
    word: code
    for code, word in enumerate(ex['split'].drop_duplicates().tolist(), start=1)
}

dictionary

{'aba': 1,
 'decides': 2,
 'community': 3,
 'broadcasting': 4,
 'licence': 5,
 'act': 6,
 'fire': 7,
 'witnesses': 8,
 'must': 9,
 'aware': 10,
 'defamation': 11,
 'calls': 12,
 'infrastructure': 13,
 'protection': 14,
 'summit': 15,
 'air': 16,
 'staff': 17,
 'aust': 18,
 'strike': 19,
 'pay': 20,
 'rise': 21,
 'affect': 22,
 'australian': 23,
 'travellers': 24,
 'ambitious': 25,
 'olsson': 26,
 'wins': 27,
 'triple': 28,
 'jump': 29,
 'antic': 30,
 'delighted': 31,
 'record': 32,
 'breaking': 33,
 'barca': 34,
 'aussie': 35,
 'qualifier': 36,
 'stosur': 37,
 'wastes': 38,
 'four': 39,
 'memphis': 40,
 'match': 41,
 'addresses': 42,
 'security': 43,
 'council': 44,
 'iraq': 45,
 'australia': 46,
 'locked': 47,
 'war': 48,
 'timetable': 49,
 'opp': 50,
 'contribute': 51,
 'million': 52,
 'aid': 53,
 'take': 54,
 'robson': 55,
 'celebrates': 56,
 'birthday': 57,
 'bathhouse': 58,
 'plans': 59,
 'move': 60,
 'ahead': 61,
 'big': 62,
 'hopes': 63,
 'launceston': 64,
 'cycling': 65,
 'cham

In [4]:

# Replace every token with its id from `dictionary`; the result is a
# one-column frame ('split') that keeps the exploded per-token index.
encode = (
    ex['split']
    .map(dictionary.get)
    .to_frame()
)

encode

Unnamed: 0,split
0,1
0,2
0,3
0,4
0,5
...,...
1244183,1320
1244183,112469
1244183,6183
1244183,451


In [5]:
# Gather the encoded tokens of each headline into one list per document.
#
# `astype` returns a new object instead of converting in place, so the original
# bare `encode.index.astype('int')` / `encode.split.astype('int')` statements
# discarded their results and had no effect. The cast on the ids is now applied
# inside the chain, so it fails loudly if any token was not mapped to an id.
# The index cast is dropped: it was a no-op, and the displayed index labels are
# already integers.
#
# Grouping on index level 0 puts all rows that share a headline's index label
# into the same group; `agg(list)` keeps the ids in their original row order.
docs = (
    encode['split']
    .astype('int')
    .groupby(level=0)
    .agg(list)
)



In [6]:

# Right-pad every document's id list with 0 so all rows share the length of
# the longest document: `zip_longest` transposes the ragged lists into
# equal-length columns (filling gaps with 0) and `zip` transposes them back
# into one tuple per document.
test = [
    padded_doc
    for padded_doc in zip(*zip_longest(*docs.to_list(), fillvalue=0))
]

test

[(1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0),
 (6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0),
 (12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
 (16, 17, 18, 19, 20, 21, 0, 0, 0, 0, 0, 0),
 (16, 19, 22, 23, 24, 0, 0, 0, 0, 0, 0, 0),
 (25, 26, 27, 28, 29, 0, 0, 0, 0, 0, 0, 0),
 (30, 31, 32, 33, 34, 0, 0, 0, 0, 0, 0, 0),
 (35, 36, 37, 38, 39, 40, 41, 0, 0, 0, 0, 0),
 (18, 42, 43, 44, 45, 0, 0, 0, 0, 0, 0, 0),
 (46, 47, 48, 49, 50, 0, 0, 0, 0, 0, 0, 0),
 (46, 51, 52, 53, 45, 0, 0, 0, 0, 0, 0, 0),
 (34, 54, 32, 55, 56, 57, 0, 0, 0, 0, 0, 0),
 (58, 59, 60, 61, 0, 0, 0, 0, 0, 0, 0, 0),
 (62, 63, 64, 65, 66, 0, 0, 0, 0, 0, 0, 0),
 (62, 67, 68, 69, 70, 71, 0, 0, 0, 0, 0, 0),
 (72, 73, 74, 75, 76, 0, 0, 0, 0, 0, 0, 0),
 (77, 78, 79, 80, 81, 0, 0, 0, 0, 0, 0, 0),
 (82, 83, 80, 84, 85, 86, 0, 0, 0, 0, 0, 0),
 (87, 88, 89, 90, 91, 92, 0, 0, 0, 0, 0, 0),
 (93, 94, 95, 96, 97, 0, 0, 0, 0, 0, 0, 0),
 (98, 99, 100, 101, 0, 0, 0, 0, 0, 0, 0, 0),
 (102, 103, 104, 105, 106, 107, 0, 0, 0, 0, 0, 0),
 (108, 109, 110, 111, 61, 0