In [159]:
# IMPORTS
import nltk
import string
import re
import os
import numpy as np
# PRE PROCESSING
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# TSNE - TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.text import TSNEVisualizer

from sklearn.feature_extraction.text import CountVectorizer

In [123]:
# PIP INSTALLs
# !pip install yellowbrick

In [116]:
# PREPROCESS METHODS 

# Natural ("human") ordering for the article file names
def sortedAlphanumeric(path):
    """Return the strings in `path` sorted so that digit runs compare as numbers.

    Despite its name, `path` is an iterable of strings (the notebook passes
    the result of os.listdir). Each string is split into alternating text and
    digit chunks; text chunks are compared case-insensitively and digit
    chunks by integer value, so "a2" sorts before "a10".

    Returns a new list; the input is not modified.
    """
    def natural_key(name):
        # The capturing group makes re.split keep the digit runs in its result.
        chunks = re.split('([0-9]+)', name)
        return [int(chunk) if chunk.isdigit() else chunk.lower() for chunk in chunks]

    return sorted(path, key=natural_key)

# Case folding
def lowercase(file):
    """Return the text `file` converted to lower case."""
    lowered = file.lower()
    return lowered

# Strip punctuation characters
def removePunctuation(file):
    """Return the text `file` with every character in string.punctuation deleted.

    Only the ASCII punctuation listed in string.punctuation is removed; the
    characters are dropped, not replaced by spaces.
    """
    # A translation table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return file.translate(deletions)

# Normalise whitespace
def removeSpace(file):
    """Return the text `file` with each whitespace run collapsed to one space.

    Leading and trailing whitespace is removed as well, because str.split()
    with no arguments discards empty fields.
    """
    words = file.split()
    return " ".join(words)

# REMOVE STOPWORDS
def removeStopwords(file):
    """Return the text `file` with English stop words removed.

    The text is tokenized with nltk's word_tokenize, tokens found in nltk's
    English stop-word list are dropped, and the remaining tokens are joined
    with single spaces.

    Matching is an exact, case-sensitive membership test, so the text should
    be lower-cased before it is passed in.
    """
    # Build the lookup set once per call; rebuilding it inside the filter
    # reloaded the whole stop-word list for every single token.
    stopWords = set(stopwords.words("english"))
    return ' '.join(w for w in word_tokenize(file) if w not in stopWords)

# LEMMATIZATION: reduce words to a dictionary form (lemma) rather than
# stemming, which can produce strings that are not valid English words.
lemmatizer = WordNetLemmatizer()


def lemmatization(file):
    """Return the text `file` with every token lemmatized.

    The text is tokenized with word_tokenize and each token is passed to the
    shared WordNetLemmatizer with pos='v', i.e. every token is looked up as a
    verb. The results are joined with single spaces.
    """
    return ' '.join(
        lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(file)
    )


In [30]:
# READ ALL DOCUMENTS
# article_dirs: file names inside article_path, in natural (numeric-aware) order.
# articles:     full text of each file, index-aligned with article_dirs.
article_path = '../data/articles/'
article_dirs = sortedAlphanumeric([
    name for name in os.listdir(article_path)
    # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
    if os.path.isfile(os.path.join(article_path, name))
])
articles = []
for article in article_dirs:
    # The context manager closes each handle right after reading, instead of
    # leaving one open file per article for the garbage collector.
    with open(os.path.join(article_path, article)) as handle:
        articles.append(handle.read())

In [117]:
# PRE-PROCESS
# Each stage reads the previous stage's list and produces a new one, so every
# intermediate result stays available for inspection.

# LOWERCASE
articles2 = [lowercase(text) for text in articles]

# NO PUNCT
articles3 = [removePunctuation(text) for text in articles2]

# REMOVE SPACE
articles4 = [removeSpace(text) for text in articles3]

# REMOVE STOPWORDS
articles5 = [removeStopwords(text) for text in articles4]

# TRANSFORM EACH WORD TO ROOT
articles6 = [lemmatization(text) for text in articles5]

In [212]:
# Use TF-IDF to vectorize the pre-processed articles.
# References:
# https://www.scikit-yb.org/en/latest/api/text/tsne.html
# https://buhrmann.github.io/tfidf-analysis.html
# https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf
import pandas as pd
from pathlib import Path

# Directory that receives one CSV of term scores per article.
tf_idf_output_dir = Path("tf_idf_output")
tf_idf_output_dir.mkdir(parents=True, exist_ok=True)

# article_dirs holds bare file names (it comes from os.listdir), so the output
# directory has to be prepended explicitly; replacing a "txt/" prefix never
# matched and left the CSV paths pointing at the working directory.
output_filenames = [
    str(tf_idf_output_dir / str(txt_file).replace(".txt", ".csv"))
    for txt_file in article_dirs
]

# max_df=.65 drops terms that occur in more than 65% of the documents;
# norm=None keeps the raw tf-idf products instead of normalising each row.
vectorizer = TfidfVectorizer(max_df=.65, min_df=1, stop_words=None, use_idf=True, norm=None)
# Dense (n_documents, n_terms) array; row i corresponds to articles6[i].
transformedDocs = vectorizer.fit_transform(articles6).toarray()

# for c, d in enumerate(transformedDocs):
#     tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), d))
    # one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    # one_doc_as_df.to_csv(output_filenames[c])

# list(zip(vectorizer.get_feature_names_out(), transformedDocs[0]))

In [232]:
# z: one list of whitespace-separated tokens per pre-processed article.
z = [article_text.split() for article_text in articles6]

# uniqueWords: the vocabulary, i.e. every distinct token across all articles.
uniqueWords = set().union(*z)
# https://beckernick.github.io/law-clustering/