In [76]:
import os
import nltk
import re
# import nltk libraries for preprocessing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import json
import math
import numpy as np
from tqdm import tqdm
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ussin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
os.chdir('News Articles')

In [63]:
# make a preprocess function that takes in a string and returns a list of words
def preprocess(text):
    # remove punctuation
    text = re.sub(r'[^\w\s]','',text)
    # make all words lowercase
    text = text.lower()
    # split into a list of words
    text = text.split()
    # remove stopwords
    text = [word for word in text if word not in stopwords.words('english')]
    # remove words with length==1
    text = [word for word in text if len(word)>1]
    # lemmatize words
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

In [64]:
# open files in the directory each file contains add en extra section in dictionary of the articles with the key as 'article-id' and vallue as the file name+article number
# for example if the file name is 'abc-news.json' and the article number is 1 then the key value pair will be 'article-id':'abc-news/1'
# append this key in the dictionary of each article in the file and update the file with the new dictionary as json
def update():
    for file in os.listdir():
        if len(file.split('_')) > 1:
                continue
        with open(file, 'r') as f:
            data_dict = json.load(f)
        for i in range(len(data_dict['articles'])):
            data_dict['articles'][i]['article-id'] = file.split('.')[0]+'/'+str(i+1)
            f.close()
        with open(file, 'w') as f:
            json.dump(data_dict, f, indent = 4)
            f.close()

In [52]:
# update()

In [67]:
# now we preprocess the text in each article and store it in a dictionary with key as the article-id and value as the list of words
# send the author title description and content of each article to the preprocess function and store the list of words in a dictionary
# with key as the article-id and value as the list of words in a new file called 'filename_preprocessed.json'
def preprocess_articles():
    preprocessed_dict = {}
    for file in os.listdir():
        print(file)
        with open(file, 'r') as f:
            if len(file.split('_')) > 1:
                continue
            data_dict = json.load(f)
            for i in range(len(data_dict['articles'])):
                # only preprocess article section that are not empty
                preprocessed_dict[data_dict['articles'][i]['article-id']] = []
                if data_dict['articles'][i]['content'] != None:
                    preprocessed_dict[data_dict['articles'][i]['article-id']].extend(preprocess(data_dict['articles'][i]['content']))
                if data_dict['articles'][i]['title'] != None:
                    preprocessed_dict[data_dict['articles'][i]['article-id']].extend(preprocess(data_dict['articles'][i]['title']))
                if data_dict['articles'][i]['description'] != None:
                    preprocessed_dict[data_dict['articles'][i]['article-id']].extend(preprocess(data_dict['articles'][i]['description']))
                if data_dict['articles'][i]['author'] != None:
                    preprocessed_dict[data_dict['articles'][i]['article-id']].extend(preprocess(data_dict['articles'][i]['author']))
            f.close()
    with open('preprocessed_files.json', 'w') as f:
        json.dump(preprocessed_dict, f, indent = 4)
        f.close()

AI.json
AI_preprocessed.json
business.json
business_preprocessed.json
chess.json
chess_preprocessed.json
climate change.json
climate change_preprocessed.json
comedy.json
comedy_preprocessed.json
cricket.json
cricket_preprocessed.json
crypto.json
crypto_preprocessed.json
entertainment.json
entertainment_preprocessed.json
environment.json
environment_preprocessed.json
politics.json
politics_preprocessed.json
preprocessed_files.json
social justice.json
social justice_preprocessed.json
spacex.json
spacex_preprocessed.json
sports.json
sports_preprocessed.json
sustainable.json
sustainable_preprocessed.json
tech.json
tech_preprocessed.json
tesla.json
tesla_preprocessed.json
women rights.json
women rights_preprocessed.json
youth activism.json
youth activism_preprocessed.json


In [60]:
# now we create a dictionary with key as the document id and value as along with the link of the article only and no other information and save the whole dict as map.json
def create_map():
    map_dict = {}
    for file in os.listdir():
        if len(file.split('_')) > 1:
            continue
        with open(file, 'r') as f:
            data_dict = json.load(f)
            for i in range(len(data_dict['articles'])):
                map_dict[data_dict['articles'][i]['article-id']] = data_dict['articles'][i]['url']
    with open('maplink.json', 'w') as f:
        json.dump(map_dict, f, indent = 4)
        f.close()

In [61]:
# create_map()

In [68]:
# create a dictionary for term frequency that has key as the article-id and value as a dictionary with key as the word and value as the frequency of the word in the article

def create_tf():
    tf_dict = {}
    with open('preprocessed_files.json', 'r') as f:
        preprocessed_dict = json.load(f)
        for key in preprocessed_dict.keys():
            tf_dict[key] = {}
            for word in preprocessed_dict[key]:
                if word in tf_dict[key].keys():
                    tf_dict[key][word] += 1
                else:
                    tf_dict[key][word] = 1
    with open('tf.json', 'w') as f:
        json.dump(tf_dict, f, indent = 4)
        f.close()

# create_tf()

In [75]:
# create posting list that has key as the word and value as a list of article-ids in which the word is present and the number of documents in which the word is present
def create_posting_list():
    posting_list = {}
    with open('preprocessed_files.json', 'r') as f:
        preprocessed_dict = json.load(f)
        for key in preprocessed_dict.keys():
            for word in preprocessed_dict[key]:
                if word in posting_list.keys():
                    posting_list[word][0].append(key)
                    posting_list[word][1] += 1
                else:
                    posting_list[word] = [[key], 1]
    with open('posting_list.json', 'w') as f:
        json.dump(posting_list, f, indent = 4)
        f.close()

# create idf dict
def create_idf_dict(no_of_docs, posting_list):
    idf_dict = {}
    for token in posting_list:
        idf_dict[token] = math.log(no_of_docs / posting_list[token][1]+1)
    with open('idf.json', 'w') as f:
        json.dump(idf_dict, f, indent = 4)
        f.close()

posting_list = json.load(open('posting_list.json', 'r'))
number_of_docs = len(json.load(open('preprocessed_files.json', 'r')))
create_idf_dict(number_of_docs, posting_list)

In [89]:
# create the tf-idf matrix for each document and 
# Weighting Scheme TF Weight
# Binary 0,1
# Raw count f(t,d)
# Term frequency f(t,d)/Pf(t‘, d)
# Log normalization log(1+f(t,d))
# Double normalization 0.5+0.5*(f(t,d)/ max(f(t‘,d))

def create_tf_idf_matrix_binary(no_of_docs, tf_dict, idf_dict):
    vocab_size = len(idf_dict)
    tf_idf_matrix = np.zeros((no_of_docs, vocab_size))
    # check for all terms in the idf_dict, if it is present in the tf_dict of the document, then set the value to 1*idf
    for i, filename in enumerate(tqdm(tf_dict.keys())):
        for j, token in enumerate(idf_dict):
            if token in tf_dict[filename]:
                tf_idf_matrix[i][j] = 1*idf_dict[token]
                
    return tf_idf_matrix

def create_tf_idf_matrix_raw_count(no_of_docs, tf_dict, idf_dict):
    vocab_size = len(idf_dict)
    tf_idf_matrix = np.zeros((no_of_docs, vocab_size))
    for i, filename in enumerate(tqdm(tf_dict.keys())):
        for j, token in enumerate(idf_dict):
            if token in tf_dict[filename]:
                tf_idf_matrix[i][j] = tf_dict[filename][token]*idf_dict[token]
                
    return tf_idf_matrix

def create_tf_idf_matrix_term_frequency(no_of_docs, tf_dict, idf_dict):
    vocab_size = len(idf_dict)
    tf_idf_matrix = np.zeros((no_of_docs, vocab_size))
    for i, filename in enumerate(tqdm(tf_dict.keys())):
        for j, token in enumerate(idf_dict):
            if token in tf_dict[filename]:
                tf_idf_matrix[i][j] = (tf_dict[filename][token]/len(tf_dict[filename]))*idf_dict[token]

    return tf_idf_matrix

def create_tf_idf_matrix_log_normalization(no_of_docs, tf_dict, idf_dict):
    vocab_size = len(idf_dict)
    tf_idf_matrix = np.zeros((no_of_docs, vocab_size))
    for i, filename in enumerate(tqdm(tf_dict.keys())):
        for j, token in enumerate(idf_dict):
            if token in tf_dict[filename]:
                tf_idf_matrix[i][j] = math.log(1+tf_dict[filename][token])*idf_dict[token]
    
    return tf_idf_matrix

def create_tf_idf_matrix_double_normalization(no_of_docs, tf_dict, idf_dict):
    vocab_size = len(idf_dict)
    tf_idf_matrix = np.zeros((no_of_docs, vocab_size))
    for i, filename in enumerate(tqdm(tf_dict.keys())):
        max_tf = 0
        for token in tf_dict[filename]:
            if tf_dict[filename][token] > max_tf:
                max_tf = tf_dict[filename][token]
        for j, token in enumerate(idf_dict):
            if token in tf_dict[filename]:
                tf_idf_matrix[i][j] = 0.5+0.5*(tf_dict[filename][token]/max_tf)*idf_dict[token]
    return tf_idf_matrix


In [90]:
# create tf-idf matrix
binary_tf_idf_matrix = create_tf_idf_matrix_binary(number_of_docs, json.load(open('tf.json', 'r')), json.load(open('idf.json', 'r')))
raw_count_tf_idf_matrix = create_tf_idf_matrix_raw_count(number_of_docs, json.load(open('tf.json', 'r')), json.load(open('idf.json', 'r')))
term_frequency_tf_idf_matrix = create_tf_idf_matrix_term_frequency(number_of_docs, json.load(open('tf.json', 'r')), json.load(open('idf.json', 'r')))
log_normalization_tf_idf_matrix = create_tf_idf_matrix_log_normalization(number_of_docs, json.load(open('tf.json', 'r')), json.load(open('idf.json', 'r')))
double_normalization_tf_idf_matrix = create_tf_idf_matrix_double_normalization(number_of_docs, json.load(open('tf.json', 'r')), json.load(open('idf.json', 'r')))

100%|██████████| 1800/1800 [00:07<00:00, 250.17it/s]
100%|██████████| 1800/1800 [00:04<00:00, 408.56it/s]
100%|██████████| 1800/1800 [00:04<00:00, 434.60it/s]
100%|██████████| 1800/1800 [00:04<00:00, 428.04it/s]
100%|██████████| 1800/1800 [00:04<00:00, 418.48it/s]
