First, import the libraries.

In [None]:
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import os
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from collections import Counter

Based on recent feedback, this section filters out certain words that spaCy's built-in `STOP_WORDS` set does not cover. Feel free to add any words you like to this list. This model is built on the idea of counting word frequency by lemma. The lemma of a word is essentially its base (dictionary) form.

Currently, the code lemmatizes the text, so verbs are counted under their infinitive form.

In [None]:
# Extra French words to exclude from the keywords, on top of spaCy's STOP_WORDS.
# Add or remove entries freely; each entry must be a quoted string followed by
# a comma. Entries are compared in lower case, so write them in lower case.
COMMON_FRENCH_WORDS = {
    # Spoken fillers and discourse markers
    "euh",
    "alors",
    "ben",
    "bon",
    "bah",
    "voilà",
    "enfin",
    "quoi",
    "hein",
    "genre",
    "écoute",
    "vraiment",
    "bien",
    "non",
    "oui",
    "vois",
    # Multi-word or punctuated fillers.
    # NOTE: extract_keywords only looks at single, purely alphabetic tokens,
    # so entries containing a space, apostrophe or hyphen are never matched
    # by that filter as it stands.
    "tu vois",
    "c'est-à-dire",
    "tu sais",
    "bah oui",
    "eh bien",
    "d'accord",
    "ah bon",
    "tu comprends",
    "en fait",
    # High-frequency generic words
    "jour",
    "nuit",
    "heure",
    "temps",
    "chose",
    "année",
    "moment",
    "question",
    "réponse",
    "lieu",
    "été",
    # Very common verbs, given in their infinitive form
    "faire",
    "aller",
    "passer",
}

In [None]:
def extract_keywords(file_path, num_keywords=50):
    """Return the most frequent lemmas found in a French text file.

    The text is processed with spaCy's ``fr_core_news_sm`` pipeline. Only
    purely alphabetic tokens are considered. A token is discarded when its
    lower-cased surface form is in ``STOP_WORDS``, or when its lower-cased
    surface form *or lemma* is in ``COMMON_FRENCH_WORDS``. The remaining
    tokens are counted by lower-cased lemma.

    Args:
        file_path: Path of the UTF-8 encoded text file to analyse.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A dict mapping each keyword (lemma) to its frequency, ordered from
        most to least frequent. An empty dict is returned, after printing a
        message, when ``file_path`` does not exist.
    """
    # Check the path first so a missing file does not pay for the model load.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return {}

    # Read file content
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Load the French language model and process the text with spaCy
    nlp = spacy.load("fr_core_news_sm")
    doc = nlp(text)

    # Tokenization and filtering
    lemmatized_words = []
    for token in doc:
        if not token.is_alpha:
            continue

        surface = token.text.lower()
        if surface in STOP_WORDS or surface in COMMON_FRENCH_WORDS:
            continue

        # Frequencies are keyed by lemma, so the custom exclusion list must be
        # applied to the lemma as well; otherwise an inflected form would
        # still be counted under a lemma the user asked to exclude.
        lemma = token.lemma_.lower()
        if lemma in COMMON_FRENCH_WORDS:
            continue

        lemmatized_words.append(lemma)

    # Calculate word frequencies and keep the top 'N' keywords
    word_freq = Counter(lemmatized_words)
    return dict(word_freq.most_common(num_keywords))

In [None]:
def visualize_word_cloud(keywords):
    """Draw a word cloud from keyword frequencies and display it.

    Args:
        keywords: Mapping of word -> frequency, passed to
            ``WordCloud.generate_from_frequencies``.
    """
    # Text color gradient for the cloud. Swap the color names to taste,
    # e.g. red, orange, skyblue, black, lightgreen.
    palette = LinearSegmentedColormap.from_list(
        "Colors",
        ["green", "beige", "darkgreen"],
        N=256,
    )

    # Configure the cloud, then fill it from the extracted keyword counts.
    cloud_settings = {
        "width": 800,
        "height": 400,
        "background_color": 'orange',
        "colormap": palette,
    }
    rendered_cloud = WordCloud(**cloud_settings).generate_from_frequencies(keywords)

    # Show the result without axes, under a fixed title.
    plt.figure(figsize=(10, 6))
    plt.imshow(rendered_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Word Cloud of Extracted Keywords")
    plt.show()

def visualize_bar_chart(keywords):
    """Draw a horizontal bar chart of keyword frequencies and display it.

    Args:
        keywords: Mapping of word -> frequency. Bars are drawn in the
            mapping's iteration order.
    """
    # Split the mapping into parallel label / value sequences.
    labels = list(keywords)
    counts = list(keywords.values())

    plt.figure(figsize=(10, 6))
    plt.barh(labels, counts, color='blue')
    plt.xlabel('Frequency')
    plt.ylabel('Keywords')
    plt.title('Top Keywords Extracted')

    # Flip the y-axis so the first keyword appears at the top of the chart.
    axes = plt.gca()
    axes.invert_yaxis()
    plt.show()

# Script entry point: extract the keywords from one file and plot them.
if __name__ == "__main__":
    # Change this file name to any UTF-8 text file you want to analyse.
    file_name = "interview_responses_new.txt"
    keyword_frequencies = extract_keywords(file_name, num_keywords=15)

    # An empty result (for example, the file was not found) means there is
    # nothing to print or plot.
    if keyword_frequencies:
        print("Top Keywords:", list(keyword_frequencies))

        # Show the word cloud first, then the bar chart.
        for render in (visualize_word_cloud, visualize_bar_chart):
            render(keyword_frequencies)
