In [None]:
# Install the third-party packages for this notebook into the running kernel's environment.
# NOTE(review): `regex` and `unicodedata-reader` are not imported anywhere in the visible
# notebook (the code imports the standard-library `re` and `unicodedata` modules) —
# confirm whether these two installs are actually needed.
%pip install regex
%pip install nltk
%pip install unicodedata-reader
%pip install contractions
%pip install inflect
%pip install emoji
%pip install spacy

Collecting unicodedata-reader
  Downloading unicodedata_reader-1.3.6-py3-none-any.whl.metadata (3.4 kB)
Downloading unicodedata_reader-1.3.6-py3-none-any.whl (20 kB)
Installing collected packages: unicodedata-reader
Successfully installed unicodedata-reader-1.3.6


In [None]:
import re
import nltk
import emoji
import unicodedata
import contractions
import inflect
import spacy
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize # Import sent_tokenize as well
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Fetch the NLTK resources used below; packages that are already present are reported as up to date.
nltk.download('stopwords')  # word list read by stopwords.words('english') in clean_text
nltk.download('averaged_perceptron_tagger')  # NOTE(review): no NLTK tagger call is visible in this notebook (POS tagging goes through spaCy) — confirm this is needed
nltk.download('punkt')  # tokenizer data; presumably what word_tokenize in clean_text loads — verify
nltk.download('punkt_tab')  # presumably the tokenizer data newer NLTK releases look for — verify

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Download spaCy's en_core_web_sm pipeline, which is loaded further down with spacy.load("en_core_web_sm").
# The installer's own output advises restarting the kernel/runtime if the package cannot be loaded afterwards.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
def clean_text(input_text):
    """Normalise raw text into a lowercase, stopword-free string.

    The steps run in this order:

    1. strip HTML tags and ``http...`` URLs,
    2. spell out emojis via ``convert_emojis_to_words``,
    3. insert a space after a period that glues two sentences together
       ("end.Next" -> "end. Next"),
    4. lowercase and collapse runs of whitespace,
    5. fold accented characters to ASCII and expand contractions,
    6. drop every character that is not a letter, digit, whitespace or period,
    7. spell out whole numbers ("80000" -> "eighty thousand"),
    8. remove English stopwords and any punctuation other than periods.

    Args:
        input_text: The raw text to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # HTML tags: remove every <...> tag from the input text
    text = re.sub(r'<[^<]+?>', '', input_text)

    # URLs and links: drop anything that starts with "http" up to the next whitespace
    text = re.sub(r'http\S+', '', text)

    # Emojis: convert them to words so the sentiment they carry is not lost
    text = convert_emojis_to_words(text)

    # Sentence boundaries: separate sentences that were glued together.
    # This relies on letter case, so it has to run BEFORE lowercasing
    # (after lowercasing the [A-Z] part could never match).
    text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', text)

    # Lowercase all the input data
    text = text.lower()

    # Collapse every run of whitespace (including newlines) into one space
    text = re.sub(r'\s+', ' ', text)

    # Accented characters: decompose them and keep only the ASCII part
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Expand contractions such as "don't" or "won't"
    text = contractions.fix(text)

    # Remove special characters: keep only letters, digits, whitespace and periods
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)

    # Spell out whole numbers. A number may carry trailing periods when it ends
    # a sentence ("2017."); those are split off, the digits are converted and the
    # periods are re-attached so the sentence boundary survives.
    number_engine = inflect.engine()
    words = []
    for word in text.split():
        number_match = re.fullmatch(r'([0-9]+)(\.*)', word)
        if number_match:
            digits, trailing_periods = number_match.groups()
            spelled = number_engine.number_to_words(digits)
            # inflect may emit hyphens/commas ("twenty-five"); turn them into plain
            # word breaks, otherwise the punctuation removal below would fuse the
            # parts into a single token ("twentyfive").
            spelled = spelled.replace('-', ' ').replace(',', '')
            words.append(spelled + trailing_periods)
        else:
            words.append(word)
    text = ' '.join(words)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    text = ' '.join(tokens)

    # Remove any remaining punctuation except periods
    text = re.sub(r'[^\w\s.]', '', text)

    # Return the preprocessed, clean text
    return text

In [None]:
def convert_emojis_to_words(text):
    """Replace emojis in ``text`` with their spelled-out names.

    Emoji names are inserted surrounded by spaces. Afterwards every ``:`` in
    the whole string is deleted and every ``_`` is replaced by a space, so
    colons and underscores that were part of the ordinary text are affected
    as well.

    Args:
        text: The text that may contain emojis.

    Returns:
        The text with emojis written out as words.
    """
    spelled_out = emoji.demojize(text, delimiters=(" ", " "))

    # Applied to the full string, in this order: drop colons, then turn underscores into spaces
    for old, new in ((":", ""), ("_", " ")):
        spelled_out = spelled_out.replace(old, new)

    return spelled_out

In [None]:
def remove_noise_boilerplate(input_text, min_cluster_size=2, num_clusters=5, max_noise_ratio=0.3, random_state=None):
    """Drop sentences that fall into clusters judged to be noise/boilerplate.

    The text is split into sentences, each sentence is turned into a
    word-count vector (``text_vectorize``) and the vectors are clustered with
    KMeans. A cluster is flagged as noise when it has at least
    ``min_cluster_size`` members and the share of its members lying farther
    from the centroid than the cluster's median distance exceeds
    ``max_noise_ratio``. Sentences in flagged clusters are removed.

    NOTE(review): roughly half of a cluster's members lie above its median
    distance by construction, so with the default ``max_noise_ratio`` most
    clusters that reach ``min_cluster_size`` end up flagged — confirm that
    this heuristic is what is intended.

    Args:
        input_text: Text to filter. Sentences are split on ". ", "? ", "! "
            and newlines.
        min_cluster_size: Clusters with fewer sentences than this are never
            flagged as noise.
        num_clusters: Requested number of KMeans clusters. It is capped at
            the number of sentences, because KMeans cannot form more clusters
            than there are samples.
        max_noise_ratio: Threshold on the share of above-median-distance
            members beyond which a cluster is flagged.
        random_state: Forwarded to ``KMeans``. The default ``None`` keeps the
            clustering unseeded; pass an int for reproducible results.

    Returns:
        The kept sentences joined by single spaces, or an empty string when
        the input contains no non-blank sentence.
    """
    # Sentence split: boilerplate detection works on individual sentences
    fragments = re.split(r'\. |\? |\! |\n', input_text)

    # Blank fragments (e.g. from consecutive delimiters) carry no words; drop them
    sentences = [fragment for fragment in fragments if fragment.strip()]
    if not sentences:
        return ''

    # Convert sentences to a matrix of word counts
    embeddings_matrix = text_vectorize(sentences)

    # KMeans clustering: never ask for more clusters than there are sentences
    n_clusters = min(num_clusters, len(sentences))
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans_model.fit(embeddings_matrix)
    model_labels = kmeans_model.labels_
    model_centroids = kmeans_model.cluster_centers_

    # Individual cluster sizes; minlength keeps the array indexable for every
    # cluster id even if the highest-numbered clusters received no sentence
    cluster_sizes = np.bincount(model_labels, minlength=n_clusters)

    # Identify clusters with noise and boilerplate language
    is_noise = np.zeros(n_clusters, dtype=bool)
    for i, centroid in enumerate(model_centroids):
        if cluster_sizes[i] < min_cluster_size:
            # Clusters below the size threshold are never treated as noise
            continue
        distances = np.linalg.norm(embeddings_matrix[model_labels == i] - centroid, axis=1)
        median_distance = np.median(distances)
        if np.count_nonzero(distances > median_distance) / cluster_sizes[i] > max_noise_ratio:
            is_noise[i] = True

    # Keep only the sentences whose cluster was not flagged
    filtered_sentences = [
        sentence
        for sentence, label in zip(sentences, model_labels)
        if not is_noise[label]
    ]

    # Bring the sentences back together
    return ' '.join(filtered_sentences)

In [None]:
def text_vectorize(input_text):
    """Turn an iterable of texts into a dense word-count matrix.

    Args:
        input_text: The documents to vectorize (here: a list of sentences);
            each one becomes a row.

    Returns:
        A NumPy array of word counts with one row per document and one
        column per vocabulary term learned by ``CountVectorizer``.
    """
    # Fit the vocabulary and count the words in a single step (sparse result)
    word_counts = CountVectorizer().fit_transform(input_text)

    # Densify the sparse counts and hand them back as a plain ndarray
    return np.array(word_counts.todense())

In [None]:
# Load the spaCy pipeline once; pos_tag and lemmatize_and_vectorize both use this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [None]:
def pos_tag(input_text):
    """Annotate every token of ``input_text`` with its part-of-speech tag.

    The text is run through the module-level spaCy pipeline ``nlp``.

    Args:
        input_text: The text to tag.

    Returns:
        A single string of ``<token text>_<POS tag>`` items separated by
        spaces, one item per spaCy token.
    """
    # One "<text>_<POS>" item per token, joined into a single string
    return ' '.join(f"{token.text}_{token.pos_}" for token in nlp(input_text))

In [None]:
def lemmatize_and_vectorize(tagged_text):
    """Convert POS-tagged text into an array with one vector per token.

    The ``_POS`` suffixes are stripped, the remaining text is re-parsed with
    the module-level spaCy pipeline ``nlp`` and a vector is collected for each
    token: verbs contribute their own token vector; every other token
    contributes its token vector plus the vector spaCy produces for the POS
    tag name itself (e.g. ``nlp("NOUN").vector``).

    NOTE(review): despite the name, no lemma is used anywhere in the
    computation — the vectors come from the tokens as written. Confirm
    whether lemmatisation was meant to be part of this step.

    Args:
        tagged_text: Whitespace-separated ``<token>_<POS>`` items, as
            produced by ``pos_tag``.

    Returns:
        A NumPy array built from the list of per-token vectors.
    """
    # Drop only the trailing "_POS" suffix; splitting from the right keeps
    # underscores that belong to the token itself intact
    text = " ".join(word.rsplit("_", 1)[0] for word in tagged_text.split())

    # Apply the spaCy pipeline to the untagged text
    doc = nlp(text)

    # The vector for a POS tag name does not depend on the token, so run the
    # pipeline once per distinct tag instead of once per token
    pos_vectors = {}
    vector_list = []

    for token in doc:
        pos = token.pos_

        if pos == "VERB":
            vec = token.vector
        else:
            if pos not in pos_vectors:
                pos_vectors[pos] = nlp(pos).vector
            vec = token.vector + pos_vectors[pos]

        vector_list.append(vec)

    return np.array(vector_list)


In [None]:
input_text = """Jesse McFadden, the convicted rapist who is believed to have killed his wife and five teenagers on his rural Oklahoma property before dying by suicide, was already in state prison in 2017 when he was charged with new sex crimes stemming from his alleged use of a contraband cellphone. But he was released in 2020, and his case languished in the court system for 2½ years.
On Monday, on the same day he was scheduled to appear in a Muskogee County court for the start of a jury trial, investigators looking for two missing teenagers who were last known to be with him executed a search warrant on his property. A local police chief said they made a grim discovery: Each of the seven bodies were shot in the head with a 9 mm pistol.
The sudden tragedy and the chain of events that preceded it has victims' families and state lawmakers asking why McFadden was released if he was facing new sex crime charges allegedly committed while behind bars while serving out a rape sentence, and if the deaths could have been prevented.
"There needs to be repercussions and somebody needs to be held accountable," Justin Webster, the father of Ivy Webster, 14, who had been reported missing and was among the victims, told The Associated Press. "They let a monster out. They did this."
State Rep. Scott Fetgatter, a Republican whose district includes the area south of Tulsa where the killings occurred, said he hopes to introduce a bill this legislative session that would halt the release of inmates before they complete their sentences if they are accused of committing certain sex crimes, such as rape and child sex abuse, while incarcerated.
"At the end of the day, those five children that were murdered in my district should be alive today and, instead, they're not," he said Thursday. "That's my priority: How do I keep this from ever happening again?"
McFadden was convicted in 2003 of first-degree rape and grand larceny and sentenced to 20 years in prison. In court documents, McFadden, then 20, told officials that he was "strung out on dope, vodka" and had "lost control of myself and raped a female friend." He also said he stole $80,000 from his grandfather's safe and "blew it on drugs and unnecessary things."
The female he raped was 17, and he had tied her hands and feet to a bedpost, cut her shirt off with a knife and threatened to use the knife on her "if she did not shut up," prosecutors said in a court filing.
From 2004 to 2010, McFadden was cited seven times for various infractions while in prison, including for having tobacco and engaging in sexual conduct with another inmate, according to his misconduct form.
Then, he was cited once in 2013 and again in 2016 for possessing a cellphone. In December 2016, an audit of the electronic device he allegedly was using revealed "sexually-themed conversations, videos and pictures" with a girl about 16 years old.
An investigation by the state Department of Corrections led prosecutors in Muskogee County to bring charges of child pornography and soliciting sexual conduct/communication with a minor. A preliminary hearing and a trial were rescheduled multiple times.
In the meantime, McFadden was on course to be released under a state law that allows those who commit violent felonies to be set free after serving at least 85% of their sentence. Despite his history of misconduct, he was considered a "level 4" inmate, reserved for those who meet their program requirements and maintain good personal hygiene and a satisfactory relationship with staff and others.
Based on how much prison time he had served, as well as time already served in county jail, McFadden was eligible for release Oct. 30, 2020. After his release, he was arrested the following month on the new charges and jailed for five days before he was let go on a $25,000 bond, records show.
Kay Thompson, a Department of Corrections spokeswoman, said that even though he was charged while already in prison, he had not yet been convicted, and so he fit the criteria to be released. According to his prison record, he earned his high school equivalency diploma and completed a Bible correspondence course called "A Country Called Heaven" and another 13-week course titled "Cage Your Rage."
Muskogee County District Attorney Larry Edwards did not immediately respond to a request for comment, but told the CBS affiliate KOTV-DT in Tulsa that McFadden's case was beset by unexpected delays over the years, including one prosecutor leaving for a new job, another breaking her foot before a scheduled trial date and the Covid pandemic causing a widespread logjam in the legal system.
When McFadden was released, he had technically completed his sentence and was not on probation. But he was still required to register as a sex offender for life and was checking in with the local sheriff's office every 90 days, as required, according to the Department of Corrections. In addition, registered sex offenders can live with children as long as they don’t commit crimes against them.
McFadden was living with his wife, Holly, whom he had married last year, Okmulgee County records show. Holly McFadden's three children — Rylee Allen, 17, Michael Mayo, 15, and Tiffany Guess, 13 — were among the victims.
Ivy and another victim, Brittany Brewer, 15, were friends with Tiffany and routinely slept over at the rental property where the McFaddens lived just outside the small town of Henryetta, the teens' families said.
Authorities said Wednesday that all the victims, as well as McFadden, were shot in the head. A motive was not immediately known.
Brittany's father, Nathan Brewer, previously said that McFadden seemed like a "nice, normal person," but now believes he should have never been released from prison after he was charged in 2017. Officials should be held liable for the victims' deaths, he added.
Holly McFadden's mother has also said that her daughter didn't know "the truth about Jesse McFadden" and that he "fooled her with his charm."
The state had planned to introduce evidence of McFadden's "prior bad acts" at his trial that was set to begin this week. In court documents, the state accused him of sending handwritten letters and text messages to the 16-year-old "in which he not only discussed their relationship but also discussed things of a sexual nature and was manipulative and controlling of the victim."
According to the state, the defense counsel planned to argue that McFadden was in contact not with the teen but with her 21-year-old friend, instead, and that another inmate owned the phone he was allegedly using.
State Rep. Justin Humphrey, a Republican who chairs the Criminal Justice and Corrections Committee, said he supports a change in the law that would prevent someone already in prison from being released if they are charged with a sex crime while behind bars.
"Who dropped the ball?" he asked. "Why did the trial take so long and why did they set the bond at what some might consider such a low amount for someone who's been convicted of such a violent crime?"
Dan Medlock, a criminal defense attorney and former prosecutor in Muskogee County, said the bond amount McFadden received is standard in the area.
But Humphrey said a larger look at the state's justice system is warranted when a person accused of committing a sex crime while already in prison can be freed.
"I don't think we need a knee-jerk reaction, but when six people are killed, including children, that has to be something that makes us go to the drawing board and do this thing right and get a system that holds people accountable," he said.
"""

In [None]:
# Run the cleaning step on the sample article.
# NOTE(review): this assignment rebinds the name `clean_text` from the function to its
# string result, so re-running this cell (or calling clean_text again later) raises a
# TypeError until the defining cell is executed again. The next cell reads the cleaned
# string through this same name, so a rename has to be applied to both cells together.
clean_text = clean_text(input_text)
print(clean_text)

jesse mcfadden convicted rapist believed killed wife five teenagers rural oklahoma property dying suicide already state prison two thousand seventeen charged new sex crimes stemming alleged use contraband cellphone . released two thousand twenty case languished court system two hundred twelve years . monday day scheduled appear muskogee county court start jury trial investigators looking two missing teenagers last known executed search warrant property . local police chief said made grim discovery seven bodies shot head nine mm pistol . sudden tragedy chain events preceded victims families state lawmakers asking mcfadden released facing new sex crime charges allegedly committed behind bars serving rape sentence deaths could prevented . needs repercussions somebody needs held accountable justin webster father ivy webster fourteen reported missing among victims told associated press . let monster . . state rep. scott fetgatter republican whose district includes area south tulsa killings 

In [None]:
# At this point `clean_text` is the cleaned string produced by the previous cell, not the function.
# No random_state is supplied for the KMeans clustering, so the sentences kept here can
# differ from one run to the next.
noise_free_text = remove_noise_boilerplate(clean_text)
print(noise_free_text)

monday day scheduled appear muskogee county court start jury trial investigators looking two missing teenagers last known executed search warrant property  despite history misconduct considered level four inmate reserved meet program requirements maintain good personal hygiene satisfactory relationship staff others  according prison record earned high school equivalency diploma completed bible correspondence course called country called heaven another 13week course titled cage rage  brittanys father nathan brewer previously said mcfadden seemed like nice normal person believes never released prison charged 2017


In [None]:
# Tag every token of the filtered text as "<token>_<POS>" using the spaCy pipeline.
tagged_output = pos_tag(noise_free_text)
print(tagged_output)

monday_PROPN day_NOUN scheduled_VERB appear_VERB muskogee_PROPN county_PROPN court_PROPN start_VERB jury_NOUN trial_NOUN investigators_NOUN looking_VERB two_NUM missing_ADJ teenagers_NOUN last_ADV known_VERB executed_VERB search_NOUN warrant_NOUN property_NOUN  _SPACE despite_SCONJ history_NOUN misconduct_NOUN considered_VERB level_NOUN four_NUM inmate_NOUN reserved_VERB meet_NOUN program_NOUN requirements_NOUN maintain_VERB good_ADJ personal_ADJ hygiene_NOUN satisfactory_ADJ relationship_NOUN staff_NOUN others_NOUN  _SPACE according_VERB prison_NOUN record_NOUN earned_VERB high_ADJ school_NOUN equivalency_NOUN diploma_NOUN completed_VERB bible_ADJ correspondence_NOUN course_NOUN called_VERB country_NOUN called_VERB heaven_PROPN another_DET 13week_NOUN course_NOUN titled_VERB cage_NOUN rage_NOUN  _SPACE brittanys_PROPN father_PROPN nathan_PROPN brewer_PROPN previously_ADV said_VERB mcfadden_NOUN seemed_VERB like_ADP nice_ADJ normal_ADJ person_NOUN believes_AUX never_ADV released_VERB p

In [None]:
# Build one vector per token from the tagged text; the print shows NumPy's abbreviated view of the array.
vectorized_output = lemmatize_and_vectorize(tagged_output)
print(vectorized_output)

[[-1.1245968  -0.722436    0.8786391  ...  0.38650048 -0.03109604
   0.9828768 ]
 [-1.658029   -1.4115163  -0.9507278  ...  0.34089205 -0.20330358
   1.1295211 ]
 [ 2.2415364   0.5276379  -1.0947719  ...  1.8501201   0.3669272
   0.3775242 ]
 ...
 [-1.6675895  -1.1257521   0.5898201  ... -0.3178092  -0.5571387
   0.96703255]
 [ 0.84237754  0.66871357 -0.45621735 ...  1.1780022  -0.01667696
   0.8144401 ]
 [-1.5609207   0.6319152   0.49138743 ... -0.5132434  -0.9589695
   1.4755664 ]]
