### Import Libraries

In [1]:
import csv
import string 
import re
import itertools
import pandas as ps 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter
from itertools import combinations, permutations
from cleantext import clean   # !pip install clean-text
# Globally silences pandas' SettingWithCopyWarning.
# NOTE(review): this hides warnings about chained assignment anywhere in the
# notebook; prefer fixing such assignments over muting the check.
ps.options.mode.chained_assignment = None

### Read Data file containing subset of AG Corpus of News Articles

In [2]:
# Load the news subset, then drop four columns that are not referenced again
# in this notebook.
df = ps.read_csv('D:/NLP Project/Data/NewsData.csv')
df = df.drop(columns=['image', 'pubdate', 'video', 'rank'])

### Dropping health documents

In [3]:
# To maintain equal distribution between class of documents, keep every row
# whose category is not 'Health'.
df = df.loc[df['category'] != 'Health']

In [4]:
# Renumber rows 0..n-1 so the positional loops below can address rows by label.
# The previous index is kept as an 'index' column (drop=False is the default).
# NOTE(review): that column is not used in the visible notebook; consider drop=True.
df = df.reset_index(drop=False)

### Encoding document URLs as numeric IDs

In [5]:
# Replace every URL string with an integer id. The fitted encoder is kept in
# `label_encoder` so the mapping stays available.
label_encoder = LabelEncoder().fit(df['url'])
df['url'] = label_encoder.transform(df['url'])

In [6]:
# Number of documents remaining after the 'Health' category was filtered out.
len(df)

18544

In [7]:
# Number of documents per remaining category.
df['category'].value_counts()

Sci/Tech         5244
Entertainment    4685
Business         4583
Sports           4032
Name: category, dtype: int64

### Preprocessing of Source, Title and Description

In [8]:
# Lower-case each text column, then trim leading/trailing whitespace.
for text_col in ('source', 'title', 'description'):
    df[text_col] = df[text_col].str.lower().str.strip()

### Clean text in title and description

In [9]:
# Options passed to clean-text's `clean` for both text columns.
clean_options = dict(fix_unicode=True, no_urls=True, to_ascii=True, lower=True, no_emails=True)

# Apply the cleaner value-by-value to each column.
for text_col in ('title', 'description'):
    df[text_col] = df[text_col].apply(lambda text: clean(text, **clean_options))

### Remove Punctuation

In [10]:
# Translation table that deletes every character listed in string.punctuation.
punctuation_stripper = str.maketrans('', '', string.punctuation)

for text_col in ('title', 'description'):
    df[text_col] = df[text_col].apply(lambda text: text.translate(punctuation_stripper))

### Tokenization

In [11]:
# Split each text column into NLTK word tokens and store the token lists in
# new columns: tokenized_title, tokenized_description.
for text_col in ('title', 'description'):
    df['tokenized_' + text_col] = df[text_col].apply(word_tokenize)

### Combine tokens from title and description of a document 

In [12]:
# One token list per document: title tokens followed by description tokens.
df['doc'] = [
    title_tokens + description_tokens
    for title_tokens, description_tokens in zip(df['tokenized_title'], df['tokenized_description'])
]

### List of tokens in a doc

In [13]:
# Preview the combined (title + description) token list of each document.
df['doc']

0        [roddick, smacks, voltchkov, with, a, record, ...
1        [sony, hit, by, ps3, launch, delay, tokyo, reu...
2        [word, of, oversupply, from, opec, punctures, ...
3        [google, maps, boosts, public, transportation,...
4        [vetri, made, the, right, call, by, going, to,...
                               ...                        
18539    [sorry, not, so, hard, for, elton, john, sir, ...
18540    [franz, win, good, for, rock, music, glasgow, ...
18541    [stewart, mcdonald, raise, their, standards, r...
18542    [merck, in, 700, million, schizophrenia, deal,...
18543    [salvation, army, prepared, for, tough, fundra...
Name: doc, Length: 18544, dtype: object

### Removal of stopwords and of tokens that are not purely alphabetic (minimum 3 letters)

In [14]:
# Drop English stopwords and every token that is not made up solely of ASCII
# letters with a length of at least three. The pattern already rejects digits,
# so no separate digit check is needed.
stop_words = set(stopwords.words('english'))
alpha_token = re.compile(r"^[a-zA-Z]{3,}$")

# Rebuild the whole column in one assignment. The previous per-row
# `df['doc'][i] = ...` is chained assignment: it writes through an intermediate
# Series and is not applied to `df` when pandas Copy-on-Write is active.
df['doc'] = df['doc'].apply(
    lambda tokens: [
        w for w in tokens
        if w not in stop_words and alpha_token.search(w) is not None
    ]
)

### Removal of Proper Nouns

In [15]:
# Removal of proper nouns: tokens tagged NNP (singular) or NNPS (plural) by the
# NLTK part-of-speech tagger are discarded. These tags denote proper nouns, not
# pronouns.
# NOTE(review): the text was lower-cased before tagging and names such as
# 'roddick' still appear in the recorded word counts, so the tagger may miss
# many proper nouns - confirm this step does what is intended.
proper_noun_tags = ('NNP', 'NNPS')

# Assign the filtered lists back as a whole column; per-row `df.doc[i] = ...`
# is chained assignment and is not applied to `df` under pandas Copy-on-Write.
df['doc'] = df['doc'].apply(
    lambda tokens: [word for word, tag in pos_tag(tokens) if tag not in proper_noun_tags]
)

### Stemming & Lemmatization of tokens 

In [16]:
# Normalise every token: WordNet lemmatization first, then Porter stemming of
# the lemma.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# One pass per document instead of four chained `df['doc'][i][j]` lookups per
# token; the resulting token lists are the same.
df['doc'] = df['doc'].apply(
    lambda tokens: [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]
)

In [17]:
# Re-apply the stopword filter and the "letters only, length >= 3" filter to
# the stemmed tokens, which can differ from the tokens filtered earlier.
alpha_token = re.compile(r"^[a-zA-Z]{3,}$")

# Whole-column assignment; per-row `df['doc'][i] = ...` is chained assignment
# and is not applied to `df` under pandas Copy-on-Write.
df['doc'] = df['doc'].apply(
    lambda tokens: [
        w for w in tokens
        if w not in stop_words and alpha_token.search(w) is not None
    ]
)

### Word count in each document

In [18]:
# Adding in a new column: per-document token frequencies as a Counter.
df['word_count'] = df['doc'].apply(lambda tokens: Counter(tokens))

In [19]:
# Preview the per-document token frequency mappings.
df['word_count']

0        {'roddick': 2, 'smack': 1, 'voltchkov': 2, 're...
1        {'soni': 2, 'hit': 1, 'launch': 2, 'delay': 2,...
2        {'word': 1, 'oversuppli': 1, 'opec': 2, 'punct...
3        {'googl': 1, 'map': 1, 'boost': 1, 'public': 1...
4        {'vetri': 2, 'made': 1, 'right': 1, 'call': 2,...
                               ...                        
18539    {'sorri': 1, 'hard': 1, 'elton': 2, 'john': 2,...
18540    {'franz': 2, 'win': 1, 'good': 1, 'rock': 2, '...
18541    {'stewart': 2, 'mcdonald': 2, 'rais': 1, 'stan...
18542    {'merck': 2, 'million': 2, 'schizophrenia': 2,...
18543    {'salvat': 2, 'armi': 2, 'prepar': 2, 'tough':...
Name: word_count, Length: 18544, dtype: object

### Total tokens from the corpus

In [20]:
# Corpus-wide token frequencies: the sum of all per-document counters.
add_dict = Counter()

# Counter.update adds counts in place and only touches the keys of the incoming
# counter. `add_dict += other` additionally rescans the whole accumulator on
# every document to strip non-positive counts, which cannot occur here because
# every per-document count is at least 1.
for doc_counts in df['word_count']:
    add_dict.update(doc_counts)

final_dict = dict(add_dict)  # Sum of all word_count dictionaries is final dictionary.

In [21]:
# Number of distinct tokens in the corpus before frequency filtering.
len(final_dict)

34275

### Eliminating tokens with frequency less than 20

In [22]:
# Threshold to eliminate in-frequent words: keep tokens seen at least this many
# times across the corpus.
min_token_frequency = 20
final_dict = {
    token: total
    for token, total in final_dict.items()
    if total >= min_token_frequency
}

### Length of BOW (Bag Of Words)

In [23]:
# Vocabulary size after removing tokens that occur fewer than 20 times.
len(final_dict)

3664

In [24]:
# Display the retained token -> corpus frequency mapping.
# NOTE(review): this prints the whole dictionary (the recorded output is cut
# off mid-entry); consider showing only a small sample.
final_dict

{'roddick': 56,
 'record': 730,
 'serv': 97,
 'american': 588,
 'andi': 66,
 'improv': 213,
 'servic': 945,
 'mark': 262,
 'blast': 77,
 'win': 1027,
 'vladimir': 24,
 'friday': 1072,
 'earn': 380,
 'unit': 808,
 'state': 953,
 'earli': 288,
 'lead': 756,
 'belaru': 20,
 'davi': 110,
 'cup': 436,
 'semifin': 75,
 'world': 1381,
 'group': 737,
 'soni': 220,
 'hit': 612,
 'launch': 607,
 'delay': 154,
 'tokyo': 217,
 'reuter': 3332,
 'corp': 812,
 'hrefurl': 5363,
 'share': 759,
 'open': 931,
 'percent': 745,
 'lower': 251,
 'yen': 58,
 'thursday': 1180,
 'said': 3154,
 'would': 835,
 'european': 335,
 'playstat': 44,
 'game': 1488,
 'machin': 93,
 'march': 176,
 'novemb': 152,
 'miss': 309,
 'holiday': 153,
 'shop': 139,
 'season': 679,
 'key': 221,
 'market': 933,
 'word': 117,
 'opec': 82,
 'oil': 857,
 'price': 883,
 'london': 406,
 'eas': 92,
 'today': 528,
 'head': 366,
 'crude': 141,
 'trader': 67,
 'continu': 383,
 'take': 925,
 'profit': 729,
 'year': 1885,
 'ralli': 168,
 'goog

### Bag of Words Construction

In [25]:
# The vocabulary is the set of retained tokens, in dictionary order.
bag_of_words = list(final_dict.keys())

In [26]:
# Vocabulary size; equals len(final_dict) because one entry is added per key.
len(bag_of_words)

3664

In [27]:
# Convert the vocabulary to a numpy array of sorted, de-duplicated tokens
# (np.unique accepts the list directly and returns its sorted unique values).
bag_of_words = np.unique(bag_of_words)

In [28]:
# Map each vocabulary token to its position in the sorted `bag_of_words` array;
# that position is used as the token's column in the feature matrix.
final_bag = {token: position for position, token in enumerate(bag_of_words)}

### Saving bag of words as array object

In [29]:
# Dimensions and class names for the feature matrix / class label vector:
# sorted distinct categories, one row per document, one column per vocabulary token.
labels = sorted(df['category'].unique())
rows, columns = len(df), len(final_bag)

# Persist the sorted vocabulary array.
with open('D:/NLP Project/Data/bag_of_words.npy', 'wb') as vocabulary_file:
    np.save(vocabulary_file, bag_of_words, allow_pickle=True)

### Constructing Feature matrix/Term document matrix of the corpus

In [30]:
# Binary term-document matrix: feature_matrix[d, t] is 1 when vocabulary token t
# occurs in the document whose encoded url id is d, else 0. class_labels[d]
# holds the position of that document's category in the sorted `labels` list.
feature_matrix = np.zeros((rows, columns), dtype=int)
class_labels = np.zeros(rows, dtype=int)

# NOTE(review): rows are addressed by the label-encoded url, not by dataframe
# position. If several documents share a url they write to the same row and
# some rows stay all-zero - confirm urls are unique.
label_to_id = {name: position for position, name in enumerate(labels)}

# Walk the three columns in lock-step instead of repeating chained
# `df['doc'][i][j]` lookups for every token.
for ind_i, category, tokens in zip(df['url'], df['category'], df['doc']):
    class_labels[ind_i] = label_to_id[category]
    for token in tokens:
        # Tokens filtered out of the vocabulary have no column; skip them.
        ind_j = final_bag.get(token)
        if ind_j is not None:
            feature_matrix[ind_i, ind_j] = 1

### Remove documents with all 0's in row from feature matrix

In [31]:
# Row positions of documents that contain no vocabulary token (all-zero rows).
zero_row_mask = np.all(feature_matrix == 0, axis=1)
indices_to_remove = np.argwhere(zero_row_mask).ravel().tolist()
print(len(indices_to_remove))

# Only the label vector is pruned here. np.delete without `axis` works on the
# flattened array, so the matrix rows are removed via a DataFrame further down
# and converted back to an array afterwards.
class_labels = np.delete(class_labels, indices_to_remove)

943


### Convert matrix to dataframe

In [32]:
# Wrap the matrix in a DataFrame and keep only rows with at least one non-zero
# entry. The surviving rows keep their original index labels.
feat_df = ps.DataFrame(feature_matrix)
a_series = feat_df.ne(0).any(axis=1)
new_feat = feat_df = feat_df.loc[a_series]

In [33]:
# For every vocabulary column, list the documents that contain the token,
# e.g. word 0 : [1, 25, 30].
# np.where returns positions within the filtered frame (0..n_kept-1), not the
# frame's index labels; these positions match the rows of `new_feat.to_numpy()`.
# Built in one step, keyed consistently by column label, rather than storing
# the raw np.where tuple and reading it back through a positional key.
word_in_doc = {
    column: np.where(feat_df[column] == 1)[0].tolist()
    for column in feat_df.columns
}

In [34]:
# Replace the matrix with the filtered version (all-zero rows removed), so its
# rows line up with the pruned `class_labels` vector.
feature_matrix = new_feat.to_numpy()

### Finding documents with words common among them

In [35]:
# For each token, enumerate every unordered pair of documents containing it.
# NOTE(review): this rebinds `word_in_doc` from document lists to pair lists,
# so re-running the cell without re-running the previous ones gives wrong pairs.
word_in_doc = {
    term_id: list(itertools.combinations(doc_positions, 2))
    for term_id, doc_positions in word_in_doc.items()
}

In [36]:
# One entry per vocabulary token.
len(word_in_doc)

3664

In [37]:
# Gather the per-token pair lists into one list of lists, in dictionary order.
all_tpls = list(word_in_doc.values())

In [38]:
# Converting list of list to flat list: every (doc, doc) pair from every token.
flat_tuples = list(itertools.chain.from_iterable(all_tpls))

In [39]:
# Total number of document pairs across all tokens; a pair appears once per
# token the two documents share.
len(flat_tuples)

69277103

In [40]:
# Count how often each document pair occurs, i.e. how many vocabulary tokens
# the two documents share; this count becomes the edge weight below.
cnt_tuples = Counter(flat_tuples)

In [41]:
# Plain-dict copy of the pair counts.
# NOTE(review): `cnt_tuple` is not referenced again in the visible notebook;
# the edge DataFrame is built from `cnt_tuples`.
cnt_tuple = dict(cnt_tuples)

### Construct Dataframe with URL Tuple and edge weight

In [42]:
# One row per distinct document pair: the pair tuple and its shared-token count.
weighted_edge_list = ps.DataFrame(cnt_tuples.items(),columns = ['url_tuple','edge_weight'])

### Expanding tuples into 2 columns (For source and destination nodes)

In [43]:
# Split each (first, second) pair into separate url1 / url2 columns, then
# discard the tuple column.
new_col_list = ['url1', 'url2']
for position, column_name in enumerate(new_col_list):
    weighted_edge_list[column_name] = [
        pair[position] for pair in weighted_edge_list['url_tuple']
    ]

weighted_edge_list = weighted_edge_list.drop(columns=['url_tuple'])

In [44]:
# Preview the first rows after splitting the pair into url1 / url2.
weighted_edge_list.head()

Unnamed: 0,edge_weight,url1,url2
0,7,5361,5362
1,8,5361,5363
2,7,5361,5364
3,6,5361,5365
4,6,5361,5366


In [45]:
# Put the columns in source, destination, weight order and preview the result.
cols = ['url1', 'url2', 'edge_weight']
weighted_edge_list = weighted_edge_list.loc[:, cols]
weighted_edge_list.head()

Unnamed: 0,url1,url2,edge_weight
0,5361,5362,7
1,5361,5363,8
2,5361,5364,7
3,5361,5365,6
4,5361,5366,6


In [46]:
# Number of distinct document pairs (edges).
len(weighted_edge_list)

45259074

### Saving the dataframe of edges and weights

In [50]:
# Write the edge list as CSV without the index column, forcing "\n" line
# endings. pandas 1.5 renamed `line_terminator` to `lineterminator` and
# pandas 2.0 removed the old spelling, so try the current keyword first and
# fall back only when an older pandas rejects it. The unknown-keyword TypeError
# is raised before anything is written.
try:
    weighted_edge_list.to_csv("D:/NLP Project/Data/edge_list.csv", index=False, lineterminator="\n")
except TypeError:
    weighted_edge_list.to_csv("D:/NLP Project/Data/edge_list.csv", index=False, line_terminator="\n")

### Saving the feature matrix and class labels as numpy array object

In [49]:
# Saving feature matrix and class labels to numpy files.
# NOTE(review): the 'class_lables' spelling is kept as-is because other code
# may load the file under this exact name - confirm before renaming.
arrays_to_save = (
    ('D:/NLP Project/Data/feature_matrix.npy', feature_matrix),
    ('D:/NLP Project/Data/class_lables.npy', class_labels),
)
for output_path, array in arrays_to_save:
    with open(output_path, 'wb') as f:
        np.save(f, array, allow_pickle=True)