In [2]:

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Sample standardized phrases
standardized_phrases = [
    "Optimal performance",
    "Utilize resources",
    "Enhance productivity",
    "Conduct an analysis",
    "Maintain a high standard",
    "Implement best practices",
    "Ensure compliance",
    "Streamline operations",
    "Foster innovation",
    "Drive growth",
    "Leverage synergies",
    "Demonstrate leadership",
    "Exercise due diligence",
    "Maximize stakeholder value",
    "Prioritize tasks",
    "Facilitate collaboration",
    "Monitor performance metrics",
    "Execute strategies",
    "Gauge effectiveness",
    "Champion change",
]

In [4]:
nltk.download("punkt")
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nurserik\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nurserik\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
def tokenize_text(text):
    tokens = nltk.word_tokenize(text.lower())
    return [word for word in tokens if word.isalnum() and word not in stop_words]

In [6]:
# Calculate cosine similarity between input text and standardized phrases
def calculate_similarity(input_text, standardized_phrases):
    vectorizer = TfidfVectorizer(tokenizer=tokenize_text)
    tfidf_matrix = vectorizer.fit_transform([input_text] + standardized_phrases)
    similarity_scores = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:])
    return similarity_scores[0]

In [7]:
# Find and suggest replacements for non-standard phrases
def suggest_replacements(input_text, standardized_phrases, threshold=0.5):
    similarity_scores = calculate_similarity(input_text, standardized_phrases)
    suggestions = []
    for i, score in enumerate(similarity_scores):
        if score < threshold:
            suggestions.append((input_text, standardized_phrases[i], score))
    return suggestions

In [8]:
input_file_path = "sample_text.txt"

# Read the input text from the file
with open(input_file_path, "r") as file:
    input_text = file.read()

In [9]:
# Analyze the input text and provide suggestions
suggestions = suggest_replacements(input_text, standardized_phrases)
for suggestion in suggestions:
    original_phrase, recommended_phrase, similarity_score = suggestion
    print(f"Original: {original_phrase.strip()}")
    print(f"Recommended: {recommended_phrase}")
    print(f"Similarity Score: {similarity_score:.2f}")
    print()

Original: In today's meeting, we discussed a variety of issues affecting our department. The weather was unusually sunny, a pleasant backdrop to our serious discussions. We came to the consensus that we need to do better in terms of performance. Sally brought doughnuts, which lightened the mood. It's important to make good use of what we have at our disposal. During the coffee break, we talked about the upcoming company picnic. We should aim to be more efficient and look for ways to be more creative in our daily tasks. Growth is essential for our future, but equally important is building strong relationships with our team members. As a reminder, the annual staff survey is due next Friday. Lastly, we agreed that we must take time to look over our plans carefully and consider all angles before moving forward. On a side note, David mentioned that his cat is recovering well from surgery.
Recommended: Optimal performance
Similarity Score: 0.05

Original: In today's meeting, we discussed a v

