In [1]:
import string
from collections import Counter

import spacy
from nltk.stem.porter import PorterStemmer

In [2]:
def get_text(file_name):
    """Load a UTF-8 text file and return its contents as one string.

    Args:
        file_name: Path of the file to read.

    Returns:
        The whole file as a ``str`` with leading and trailing whitespace
        removed.
    """
    with open(file_name, mode="r", encoding="utf-8") as source:
        raw_contents = source.read()
    # Trim only the outer whitespace; interior newlines are left untouched.
    return raw_contents.strip()

# Sanity check: load the sample article so the notebook displays it as this cell's output.
get_text('doc.txt')

'Washington, DC CNN\n\nHome sales dropped in September to the lowest level in 13 years as surging interest rates and climbing home prices made buying a home unattainable for a growing share of would-be buyers.\n\nHistorically low inventory of homes for sale continued to push prices up and rates that crossed over 7% in August put a damper on sales, according to a monthly report from the National Association of Realtors.\n\nThe median price for existing homes — which include single-family homes, townhomes, condominiums and co-ops — was $394,300 last month. That was up 2.8% from a year ago and marked the third consecutive month of year-over-year price increases, setting a record high price for homes in September. Prices rose in all four regions of the country, the Northeast, Midwest, South and the West, the NAR report found.\n\n“For the third straight month, home prices are up from a year ago, confirming the pressing need for more housing supply,” Lawrence Yun, NAR chief economist said.\n

In [3]:
def output_txt(file_name, text):
    """Save ``text`` to ``file_name`` encoded as UTF-8.

    The file is opened in write mode, so existing content is replaced.

    Args:
        file_name: Path of the file to write.
        text: String to store in the file.
    """
    with open(file_name, mode="w", encoding="utf-8") as sink:
        sink.write(text)

In [4]:
def _is_punctuation(token):
    """Return True when a spaCy token consists solely of punctuation."""
    # `is_punct` covers Unicode punctuation (curly quotes, dashes); the
    # character-wise check keeps ASCII symbols such as "$" or "%" filtered and
    # also catches multi-character runs like "..." that a substring test
    # against `string.punctuation` would let through.
    return token.is_punct or all(ch in string.punctuation for ch in token.text)


def clean_text(input_file, output_file, nlp=None, max_count=8, min_count=2):
    """Normalise the text in ``input_file`` and write it to ``output_file``.

    Steps, in order:

    1. Read the file and lowercase it.
    2. Tokenise with spaCy, discard stop words and punctuation, and keep the
       lemma of every remaining token.
    3. Discard words that occur more than ``max_count`` times or fewer than
       ``min_count`` times in the lemmatised text.
    4. Apply the Porter stemmer to the surviving words.
    5. Write the words, joined by single spaces, to ``output_file``.

    Args:
        input_file: Path of the UTF-8 text file to clean.
        output_file: Path the cleaned text is written to.
        nlp: Optional, already loaded spaCy pipeline. When omitted,
            ``en_core_web_sm`` is loaded; pass a pipeline to avoid reloading
            the model on every call.
        max_count: Highest number of occurrences a word may have and still be
            kept (default 8).
        min_count: Lowest number of occurrences a word must have to be kept
            (default 2, i.e. words appearing exactly once are dropped).

    Returns:
        None. The result is only written to ``output_file``.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    # NOTE(review): the text is lowercased *before* spaCy sees it, which may
    # influence tagging/lemmatisation; kept as-is so existing output is stable.
    text = get_text(input_file).lower()
    doc = nlp(text)

    lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop and not _is_punctuation(token)
    ]
    # Join then split so whitespace-only lemmas (e.g. newline tokens) vanish
    # and any lemma containing spaces is counted word by word.
    words = " ".join(lemmas).split()

    # One counting pass replaces the repeated list.count() scans. Removing
    # frequent words does not change the counts of the others, so both
    # frequency filters can be applied together.
    occurrences = Counter(words)
    kept_words = [
        word for word in words if min_count <= occurrences[word] <= max_count
    ]

    stemmer = PorterStemmer()
    cleaned_text = " ".join(stemmer.stem(word) for word in kept_words)

    output_txt(output_file, cleaned_text)

In [5]:
# Source article and the destination for its cleaned version.
input_file, output_file = "doc.txt", "doc_cleaned.txt"

# Run the whole cleaning pipeline; the result is written to `output_file`.
clean_text(input_file=input_file, output_file=output_file)