In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import numpy as np
import re
import time
from re import *
import spacy
from sklearn.decomposition import LatentDirichletAllocation

%matplotlib inline

def get_text_segments(text: str):
    '''
    Extracts text segments from the given text by removing special characters and
    splitting the text at punctuation marks.
    
    Args:
        text (str): The input text. Any non-string value (e.g. the float NaN
            that pandas uses for a missing cell, or None) is treated as
            "no content".
    
    Returns:
        list: A list of cleaned, lower-cased, non-empty text segments.
    '''
    
    # If there is no content, return empty list
    if not isinstance(text, str):
        return []
    
    # Convert text to lowercase and normalize
    text_ = text.lower()
    
    # Clean special signs and normalize.
    # Raw strings keep the regex escapes literal (no invalid-escape warnings).
    # NOTE: this first pass already deletes `!`, `"`, `(`, `)`, `«` and `»`,
    # so in practice only `. , ; : ? – —` act as split points further down.
    text_ = re.sub(r'[!-&\(-+/<=>@{-¿\[-`÷€]', '', text_)
    # Apostrophes and hyphens become word separators
    text_ = re.sub(r"[’'\-]", ' ', text_)
    
    # Replace multiple spaces
    text_ = re.sub(r'[ ]+', ' ', text_)

    # Split the text at each punctuation sign
    raw_segments = re.findall(r'([^!.,;:?«»"\(\)–—]+)', text_)

    # Remove spaces at start and end of each segment and drop the empty ones.
    # A plain list is returned in every case, as the docstring promises.
    stripped = (seg.strip() for seg in raw_segments)
    return [seg for seg in stripped if seg]

def unroll(nested_list: list):
    '''
    Flattens one level of nesting: the items of every inner iterable are
    concatenated, in order, into a single list.
    
    Args:
        nested_list (list): An iterable of iterables.
    
    Returns:
        list: All inner items in their original order.
    '''
    flat = []
    for inner in nested_list:
        flat.extend(inner)
    return flat

def get_ngrams(text, n: int = 2):
    '''
    Builds the n-grams of a text, segment by segment.
    
    The text is first cut into segments with get_text_segments; n-grams are
    then taken inside each segment only, so no n-gram spans two segments.
    
    Args:
        text (str): The input text.
        n (int): The size of the n-grams.
    
    Returns:
        list: A flat list of n-grams, each one a tuple of n tokens.
    '''
    
    collected = []
    for piece in get_text_segments(text):
        # Tokens are the space-separated words of the segment
        words = piece.split(' ')
        window_count = len(words) - n + 1
        # Slide a window of width n over the tokens; a segment shorter
        # than n yields nothing because the range is then empty
        collected.extend(
            tuple(words[start:start + n]) for start in range(window_count)
        )
        
    return collected

def count_ngrams(df, col: str = 'content', n: int = 1):
    '''
    Counts all the occurrences of n-grams in a DataFrame.
    
    Side effect: a column named '<n>-gram' holding the list of n-grams of each
    row is added to (or overwritten in) `df`.
    
    Args:
        df (DataFrame): The input DataFrame.
        col (str): The column name containing the text.
        n (int): The length n of the n-grams.
    
    Returns:
        DataFrame: A DataFrame containing unique n-grams ('unique') and their
        counts ('count'), sorted by descending count.
    '''
    
    # Initialize timer
    start = time.time()
    
    # Recover all n-grams and store them in a new column.
    # The text is read from `col` (previously 'content' was hard-coded and the
    # parameter was silently ignored).
    ngram_col = f'{n}-gram'
    df[ngram_col] = df[col].apply(get_ngrams, n=n)
    
    # Find all unique n-grams and count the number of occurences
    ngram_list = unroll(df[ngram_col].values.tolist())
    if ngram_list:
        # axis=0 makes each row (one n-gram) the unit of uniqueness
        unq, cnt = np.unique(ngram_list, return_counts=True, axis=0)
    else:
        # Nothing to count: produce an empty, well-formed result
        unq, cnt = [], np.array([], dtype=int)
    print(f'{len(ngram_list)} {n}-grams found in total, corresponding to {len(unq)} unique {n}-grams.')
    
    # Storing the results in a dataframe (rows of the array back to tuples)
    unq = [tuple(it) for it in unq]
    df_ngram = pd.DataFrame({'unique': unq, 'count': cnt}).sort_values(by='count', ascending=False)

    print('Time elapsed:', time.time()-start)
    
    return df_ngram

def count_ngram_occurrences(df, ngram: tuple):
    '''
    Count the occurrences of the given ngram in a dataframe and group the counts by year.
    
    The input dataframe is left untouched: no temporary column is written to it.
    
    Args:
        df (DataFrame): The input dataframe containing a 'year' column and the
            '<n>-gram' column matching the length of `ngram`; each cell of that
            column is a list of n-gram tuples.
        ngram (tuple): The n-gram to be counted in the dataframe. A bare string
            is accepted as a unigram; any other sequence is converted to a tuple.
        
    Returns:
        DataFrame: A new dataframe indexed by year with a single column, named
        str(ngram), holding the number of occurrences in that year.
    
    Raises:
        KeyError: If the '<n>-gram' column for this n-gram length is missing.
    '''
    
    # Normalise the n-gram to a tuple so it compares equal to the stored tuples
    if isinstance(ngram, str):
        ngram = (ngram,)
    else:
        ngram = tuple(ngram)
    
    column = f'{len(ngram)}-gram'
    if column not in df.columns:
        raise KeyError(
            f"Column '{column}' not found in the dataframe; "
            f"the {len(ngram)}-grams must be computed first."
        )

    def count_ngram(ngrams):
        '''
        Counts the number of times where the given ngram appears in a list of ngrams.
        '''
        return sum(1 for candidate in ngrams if candidate == ngram)

    # Count the number of occurences of the ngram in each article.
    # The result lives in its own Series instead of a scratch column of `df`.
    occurrences = df[column].apply(count_ngram)

    # Sum the per-article counts by year; the Series is aligned with df['year']
    df_ngram_by_year = occurrences.groupby(df['year']).sum().to_frame(name=str(ngram))
    
    return df_ngram_by_year

In [None]:
# Reminder for the notebook user (message kept in French). The raw string keeps
# the backslashes of the "/!\" markers literal instead of relying on invalid
# escape sequences; the printed text is unchanged.
print(r" /!\ il faut définir l import impresso à utiliser /!\ ")

In [None]:
# French spaCy pipeline (not referenced again in the code visible in this cell)
nlp_fr = spacy.load('fr_core_news_lg')

# Forward slashes work on every OS and avoid the invalid '\e' / '\g' escapes
# of the former back-slashed literal.
df = pd.read_csv('./export-2023-05-02-10429/guisan.csv', sep=';')

# Only the first 500 non-empty articles are used for the topic model
articles = df['content'].dropna().values.tolist()[:500]

# Tokenise each article into lower-case alphabetic words
documents, tokens = [], []
for article in articles:
    words = re.findall('([a-zà-ÿ]+)', article.lower())
    tokens += words
    documents.append(words)

# Vocabulary: drop the 20 most frequent words, keep words seen at least 3 times
vocabulaire = pd.Series(tokens).value_counts().iloc[20:]
vocabulaire = vocabulaire[vocabulaire >= 3].sort_index().keys()

# Binary document/term matrix. A dictionary lookup replaces the former
# np.argmax(vocabulaire == token): argmax returns 0 when the token is absent,
# which wrongly switched on the first vocabulary word for every document
# containing an out-of-vocabulary token.
token_to_column = {token: column for column, token in enumerate(vocabulaire)}
one_hot = np.zeros((len(documents), len(vocabulaire)))
for index, document in enumerate(documents):
    for token in document:
        column = token_to_column.get(token)
        if column is not None:
            one_hot[index, column] = 1

# TODO: the number of topics needs tuning; how to choose it is still open
N_TOPICS = 7
# Topic inspected below (must be < N_TOPICS)
TOPIC = 5

lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=0)
res = lda.fit_transform(one_hot)

# Words with the largest weight in the inspected topic
for i in np.flip(np.argsort(lda.components_[TOPIC]))[:20]:
    print(vocabulaire[i], end=', ')

# Most characteristic words of the inspected topic: each word is transformed
# as a one-word document and ranked by its share of that topic
type_attribution = lda.transform(np.identity(len(vocabulaire)))
for t in vocabulaire[np.flip(np.argsort(type_attribution[:, TOPIC]))[:20]]:
    print(t, end=', ')

# The three articles most representative of the inspected topic
for art_id in np.flip(np.argsort(res[:, TOPIC]))[:3]:
    print(articles[art_id], end='\n\n')

# n-gram frequency tables for n = 1..5 (count_ngrams also adds the
# '<n>-gram' columns to df)
data_ngrams = {}

for n in range(1, 6):
    data_ngrams[n] = count_ngrams(df, n=n)


data_ngrams[3].head(30)


In [None]:
# Display the first 30 rows of the 5-gram table, i.e. the 30 most frequent
# 5-grams (count_ngrams returns its table sorted by descending count)
data_ngrams[5].head(30)

In [None]:

# Display the first 30 rows of the 4-gram table, i.e. the 30 most frequent
# 4-grams (count_ngrams returns its table sorted by descending count)
data_ngrams[4].head(30)

In [None]:
# Reminder for the notebook user (message kept in French). The raw string keeps
# the backslashes literal instead of relying on the invalid escapes '\i' and
# '\ '; the printed text is unchanged.
print(r" /!\il faut définir le n-gram à utiliser /!\ ")

In [None]:

# Yearly occurrence counts of one chosen n-gram, drawn as a line chart
ngram = ('commandant', 'du', '1', 'er', 'corps')
df_ngram_by_year = count_ngram_occurrences(df, ngram)

# One explicit figure/axes pair; pandas draws onto the provided axes
fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
df_ngram_by_year.plot(ax=ax, linestyle='-')

# Title and axis captions
ax.set_title('Evolution temporelle par n-gram', fontsize=16)
ax.set_xlabel('Années', fontsize=14)
ax.set_ylabel('Occurrences', fontsize=14)

# Legend entry: the n-gram written out as a space-separated phrase
legend_label = " ".join(ngram)
ax.legend([legend_label], loc='best', fontsize=12)

# Occurrences are counts, so only integer ticks on the y-axis
integer_ticks = ticker.MaxNLocator(integer=True)
ax.yaxis.set_major_locator(integer_ticks)

# Render the figure
plt.show()

In [None]:
#find sentences containing the ngram and the sentence before and after
def find_sentences(articles, ngram: tuple, verbose: bool = True):
    '''
    Finds the sentence containing the n-gram together with its neighbouring
    sentences, in every article that mentions the n-gram.
    
    Sentences are delimited by a full stop. For each article four patterns are
    tried in order and the first one that matches is used:
      1. previous sentence + sentence with the n-gram + next sentence
      2. previous sentence + sentence with the n-gram (n-gram in a last sentence)
      3. sentence with the n-gram + next sentence (n-gram in a first sentence)
      4. n-gram at the very start of the sentence + next sentence
    Every match of the selected pattern is returned. An article that mentions
    the n-gram but fits none of the patterns (e.g. its sentence ends with '!')
    contributes nothing.
    
    Args:
        articles (list): The article texts. Non-string items are skipped.
        ngram (tuple): The n-gram tokens; they are joined with single spaces
            and matched case-insensitively.
        verbose (bool): When True (default), prints the position and the full
            text of every article that mentions the n-gram.
    
    Returns:
        list: The matched passages, with the casing of the original articles.
    '''
    phrase = ' '.join(ngram).lower()
    # Escape the n-gram so that characters such as '.' or '(' are matched literally
    target = re.escape(phrase)

    previous_sentence = r'[^.?!]*\. '   # a whole sentence followed by a space
    lead_in = r'[^.?!]*\ '              # start of the sentence, up to the n-gram
    sentence_end = r' [^.?!]*\.'        # remainder of a sentence, up to its full stop
    next_sentence = r' [^.?!]*\.'       # the following sentence

    # Matching is done case-insensitively on the original text, so the match
    # offsets are valid for slicing it (offsets taken from a lower-cased copy
    # are only valid if lower-casing preserves the string length).
    patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            previous_sentence + lead_in + target + sentence_end + next_sentence,
            previous_sentence + lead_in + target + sentence_end,
            lead_in + target + sentence_end + next_sentence,
            target + sentence_end + next_sentence,
        )
    ]

    found = []
    for position, article in enumerate(articles):
        if not isinstance(article, str) or phrase not in article.lower():
            continue

        if verbose:
            print(position)
            print(article + '\n\n')

        # Use the first pattern that yields at least one match
        spans = []
        for pattern in patterns:
            spans = [match.span() for match in pattern.finditer(article)]
            if spans:
                break

        # Keep every match (not only the last one), in the original casing
        found.extend(article[begin:end] for begin, end in spans)

    return found

# Use every article that has content this time, not only a subset
articles = (
    df['content']
    .dropna()
    .values
    .tolist()
)

# Collect the passages mentioning the n-gram, then dump the raw list
seg = find_sentences(
    articles,
    ('commandant', 'du', '1', 'er', 'corps'),
)
print(seg)