# Dataset (annotated corpus) descriptive analysis

In [1]:
import pandas as pd
import numpy as np

# Workbooks holding the two annotated corpora; both are read with pandas defaults.
excel1 = "corpus_three_annot.xlsx"
excel2 = "corpus_two_annot.xlsx"
corpus1, corpus2 = (pd.read_excel(workbook) for workbook in (excel1, excel2))

## 1. Annotated corpus

### 1. 1. Length & relevance labels

In [2]:
# Keep only the identifier, text and annotation columns, then report the row count.
corpus1 = corpus1.loc[:, ["id_segment", "text_segment", "relevance_label", "relevance_type_norm"]]
print("Lenght corpus 1 : {}".format(corpus1.shape[0]))

Lenght corpus 1 : 161


In [3]:
# Keep only the identifier, text and annotation columns, then report the row count.
corpus2 = corpus2.loc[:, ["id_segment", "text_segment", "relevance_label", "relevance_type_norm"]]
print("Lenght corpus 2 : {}".format(corpus2.shape[0]))

Lenght corpus 2 : 169


In [4]:
# Stack the two annotated corpora into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Both inputs were read
# with a default index, so without it the combined frame would carry duplicated
# row labels, which makes label-based selection and index-aligned assignment
# ambiguous.
annotated_corpus = pd.concat([corpus1, corpus2], ignore_index=True)
len(annotated_corpus.index)

330

In [5]:
# Number of segments per relevance label, plus each label's share in percent.
# The counts column is named explicitly: depending on the pandas version,
# value_counts() names its result after the source column or "count", so relying
# on the implicit name raises KeyError on recent versions.
label_counts = annotated_corpus["relevance_label"].value_counts()
df_descr = label_counts.rename("relevance_label").rename_axis(None).to_frame()
df_descr['freq'] = (100 * df_descr['relevance_label'] / df_descr['relevance_label'].sum()).round(1)
df_descr

Unnamed: 0,relevance_label,freq
0,218,66.1
1,71,21.5
2,41,12.4


In [6]:
# Token count per segment = number of pieces obtained by splitting on a single space.
annotated_corpus["nb_tokens"] = annotated_corpus["text_segment"].map(lambda segment: len(segment.split(' ')))

In [7]:
# Smallest, largest and average segment length; the mean is rounded to an integer.
token_counts = annotated_corpus["nb_tokens"]
print('Number of tokens per segment: Min : {}, Max : {}, Mean : {}'.format(
    token_counts.min(), token_counts.max(), round(token_counts.mean())))

Number of tokens per segment: Min : 6, Max : 645, Mean : 47


### 1. 2. Topics (relevance_type_norm column)

In [8]:
def topic_present(text, topic):
    """Tell whether ``topic`` is one of the comma-separated items of ``text``.

    ``text`` is first converted with ``str()``, then split on commas, and each
    piece is stripped of surrounding whitespace. The comparison is made on whole
    items, so ``"LULC"`` is not reported as present in ``"LULCC"``.

    Args:
        text: Value holding the comma-separated topics (any type; stringified).
        topic: Topic name to look for.

    Returns:
        bool: True if one stripped item equals ``topic``, False otherwise.
    """
    # Exact membership test on the stripped items, not a substring search.
    items = [piece.strip() for piece in str(text).split(',')]
    return topic in items

# Add one boolean column per topic, set where the segment's topic string lists it.
for ent in ('LULCC', 'LULC', 'DRIVERS', 'PRACTICES'):
    annotated_corpus[ent] = annotated_corpus['relevance_type_norm'].apply(topic_present, args=(ent,))

In [9]:
# Frequency of each topic string among the segments labelled 1 or 2.
relevant_mask = annotated_corpus["relevance_label"].isin([1, 2])
annotated_corpus.loc[relevant_mask, "relevance_type_norm"].value_counts()

LULCC                        29
PRACTICES                    22
DRIVERS                      18
DRIVERS, LULCC               11
LULC                          7
DRIVERS, PRACTICES            6
LULCC, DRIVERS                5
LULC, PRACTICES               4
PRACTICES, DRIVERS            3
LULCC, PRACTICES              3
LULCC, DRIVERS, PRACTICES     1
PRACTICES, LULCC              1
DRIVERS, LULC                 1
DRIVERS, LULCC, PRACTICES     1
Name: relevance_type_norm, dtype: int64

In [10]:
# Number of segments carrying relevance label 1 and label 2 respectively.
n_class1 = len(annotated_corpus.loc[annotated_corpus['relevance_label'] == 1])
n_class2 = len(annotated_corpus.loc[annotated_corpus['relevance_label'] == 2])
# Each count appears twice in the list: label 1, label 1, label 2, label 2.
n = [n_class1] * 2 + [n_class2] * 2

In [11]:
# Show the list of counts: the label-1 count twice, then the label-2 count twice.
n

[71, 71, 41, 41]

In [12]:
# Count the segments labelled 1 or 2 for each (relevance_label, LULCC flag) pair.
LULCC_count = (
    annotated_corpus[annotated_corpus['relevance_label'].isin([1, 2])]
    .groupby(['relevance_label', "LULCC"])
    .size()
    .reset_index(name='count')
)
# Share (in %) of each flag value within its relevance label. The per-label total
# is taken from this table, so the division stays correct whatever rows the
# groupby returns (e.g. if one label/flag combination never occurs).
LULCC_totals = LULCC_count.groupby('relevance_label')['count'].transform('sum')
LULCC_count['freq'] = round(100 * LULCC_count['count'] / LULCC_totals, 1)
LULCC_count

Unnamed: 0,relevance_label,LULCC,count,freq
0,1,False,45,63.4
1,1,True,26,36.6
2,2,False,16,39.0
3,2,True,25,61.0


In [13]:
# Count the segments labelled 1 or 2 for each (relevance_label, LULC flag) pair.
LULC_count = (
    annotated_corpus[annotated_corpus['relevance_label'].isin([1, 2])]
    .groupby(['relevance_label', "LULC"])
    .size()
    .reset_index(name='count')
)
# Share (in %) of each flag value within its relevance label. The per-label total
# is taken from this table, so the division stays correct whatever rows the
# groupby returns (e.g. if one label/flag combination never occurs).
LULC_totals = LULC_count.groupby('relevance_label')['count'].transform('sum')
LULC_count['freq'] = round(100 * LULC_count['count'] / LULC_totals, 1)
LULC_count

Unnamed: 0,relevance_label,LULC,count,freq
0,1,False,64,90.1
1,1,True,7,9.9
2,2,False,36,87.8
3,2,True,5,12.2


In [14]:
# Count the segments labelled 1 or 2 for each (relevance_label, DRIVERS flag) pair.
DRIVERS_count = (
    annotated_corpus[annotated_corpus['relevance_label'].isin([1, 2])]
    .groupby(['relevance_label', "DRIVERS"])
    .size()
    .reset_index(name='count')
)
# Share (in %) of each flag value within its relevance label. The per-label total
# is taken from this table, so the division stays correct whatever rows the
# groupby returns (e.g. if one label/flag combination never occurs).
DRIVERS_totals = DRIVERS_count.groupby('relevance_label')['count'].transform('sum')
DRIVERS_count['freq'] = round(100 * DRIVERS_count['count'] / DRIVERS_totals, 1)
DRIVERS_count

Unnamed: 0,relevance_label,DRIVERS,count,freq
0,1,False,38,53.5
1,1,True,33,46.5
2,2,False,28,68.3
3,2,True,13,31.7


In [15]:
# Count the segments labelled 1 or 2 for each (relevance_label, PRACTICES flag) pair.
PRACTICES_count = (
    annotated_corpus[annotated_corpus['relevance_label'].isin([1, 2])]
    .groupby(['relevance_label', "PRACTICES"])
    .size()
    .reset_index(name='count')
)
# Share (in %) of each flag value within its relevance label. The per-label total
# is taken from this table, so the division stays correct whatever rows the
# groupby returns (e.g. if one label/flag combination never occurs).
PRACTICES_totals = PRACTICES_count.groupby('relevance_label')['count'].transform('sum')
PRACTICES_count['freq'] = round(100 * PRACTICES_count['count'] / PRACTICES_totals, 1)
PRACTICES_count

Unnamed: 0,relevance_label,PRACTICES,count,freq
0,1,False,48,67.6
1,1,True,23,32.4
2,2,False,23,56.1
3,2,True,18,43.9


## 2. Full corpus

In [None]:
# Read the processed corpus workbook and show how many rows it contains.
all_corpus = pd.read_excel("all_corpus_processed.xlsx")
len(all_corpus)

Number of candidate segments (segments matching the nomenclature)

In [None]:
# Rows whose `match` column equals 1.
len(all_corpus.loc[all_corpus['match'] == 1])