In [None]:
!pip freeze > requirements.txt
!pip list --format=freeze > requirements.txt

In [1]:
!pip install datasets
!pip install accelerate -U


Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     

In [2]:
!pip install sentencepiece sacremoses

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, sacremoses
Successfully installed sacremoses-0.1.1 sentencepiece-0.1.99


In [3]:
!python -m spacy download fr_core_news_sm

2023-12-19 13:08:12.492803: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-19 13:08:12.492869: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-19 13:08:12.495079: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-19 13:08:12.507298: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-19 13:08:15.888590: I external/local_xla/xla/

# French Text Processing and Classification with TensorFlow and Hugging Face

This notebook demonstrates a comprehensive process of loading, preprocessing, and preparing a French text dataset for a classification task using TensorFlow and Hugging Face's transformers.

## Importing Necessary Libraries
- `pandas`: For data manipulation and analysis.
- `transformers`: Provides access to Hugging Face's CamembertTokenizer and AutoTokenizer for text tokenization.
- `tensorflow`: An open-source library for various machine learning tasks.
- `re`: Regular expression operations for text processing.
- `spacy`: Advanced Natural Language Processing in Python.
- `sklearn.model_selection`: Provides train_test_split function for splitting the dataset.
- `sklearn.preprocessing`: LabelEncoder for encoding labels.

## Loading the Dataset
- The dataset is loaded from a CSV file into a pandas DataFrame.
- File path should be replaced with the path to your CSV file.

## Basic Preprocessing
- The dataset is renamed for clarity, changing 'sentence' to 'text' and 'difficulty' to 'labels'.
- Unnecessary columns like 'id' are dropped.
- A custom function `initial_clean` is defined for basic text cleaning which includes:
    - Lowercasing the text.
    - Removing non-alphabetic characters.
- The `initial_clean` function is applied to the 'text' column.

## Tokenizer and Model Initialization
- Additional libraries such as `numpy` are imported.
- A LabelEncoder instance is created and used to encode the 'labels' column.
- The first few rows of the dataset are displayed using `data.head()` for a quick overview.


In [1]:
import pandas as pd
from transformers import CamembertTokenizer
import tensorflow as tf
import re
import spacy
import pandas as pd
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Load the dataset
file_path = '/content/training_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Basic Preprocessing
# data['sentence'] = data['sentence'].str.lower()  # Lowercasing
# def clean_text(text):
#     text = text.lower()  # Lowercasing
#     text = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", text)  # Remove punctuation and numbers
#     return text

# data['sentence'] = data['sentence'].apply(clean_text)
# nlp = spacy.load("fr_core_news_sm")  # Load French tokenizer, tagger, parser, NER, and word vectors
# data['sentence'] = data['sentence'].apply(lambda x: " ".join([token.lemma_ for token in nlp(x)]))


data = data.rename(columns={'sentence': 'text', 'difficulty': 'labels'}).drop(['id'],axis=1)
def initial_clean(text):
    text = text.lower()
    text = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ]', ' ', text)
    return text

data['text'] = data['text'].apply(initial_clean)

# Initialize tokenizer and model
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

#for dimostration pourposes I inly use 2k data
LE = LabelEncoder()
data['labels'] = LE.fit_transform(data['labels'])
data.head()


Unnamed: 0,text,labels
0,les coûts kilométriques réels peuvent diverger...,4
1,le bleu c est ma couleur préférée mais je n a...,0
2,le test de niveau en français est sur le site ...,0
3,est ce que ton mari est aussi de boston,0
4,dans les écoles de commerce dans les couloirs...,2


In [None]:
data.shape

(4800, 2)

In [None]:
data['text']

0       Les cots kilomtriques rels peuvent diverger se...
1       Le bleu cest ma couleur prfre mais je naime pa...
2       Le test de niveau en franais est sur le site I...
3                  Estce que ton mari est aussi de Boston
4       Dans les coles de commerce dans les couloirs d...
                              ...                        
4795    Cest pourquoi il dcida de remplacer les habitu...
4796    Il avait une de ces pleurs splendides qui donn...
4797    Et le premier samedi de chaque mois venez renc...
4798    Les cots lis  la journalisation ntant pas ngli...
4799    Sur le sable la mer haletait de toute la respi...
Name: text, Length: 4800, dtype: object

In [None]:
data['text'].describe()

count                                                  4800
unique                                                 4778
top       cest pourquoi dcida remplacer habituelles port...
freq                                                      3
Name: text, dtype: object

In [2]:
import spacy

nlp = spacy.load('fr_core_news_sm')  # Load the French model
from datasets import Dataset
from sklearn.model_selection import train_test_split
train_data , val_data= train_test_split(data,test_size=0.2)
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Tokenization of Text Data for Machine Learning Models

In this section, we focus on tokenizing our text data, preparing it for input into machine learning models. Tokenization is a crucial step in text processing, especially when dealing with languages like French.

## Importing the Tokenizer
- We use the `AutoTokenizer` from the Hugging Face `transformers` library.
- `AutoTokenizer` automatically detects and loads the tokenizer that corresponds to the "camembert-base" model.
- The "camembert-base" model is specifically designed for the French language, making it an ideal choice for our dataset.

## Tokenization Function
- We define a function named `tokenize_function`.
- This function takes examples (text data) and returns tokenized output.
- In the tokenization process, we ensure:
    - Padding is applied to each text so that all texts are of the same length, which is crucial for batch processing in neural networks.
    - Truncation is also applied to limit the length of the texts, ensuring consistent and manageable input sizes.

## Applying Tokenization to the Datasets
- We apply the `tokenize_function` to both our training and validation datasets.
- `train_dataset.map(tokenize_function, batched=True)` processes the training dataset.
- `val_dataset.map(tokenize_function, batched=True)` processes the validation dataset.
- The `batched=True` argument in the `map` function ensures that tokenization is applied in batches, making the process more efficient.

This tokenization step is essential for preparing the dataset for training and evaluating language models, ensuring uniformity and consistency in the input data.


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("camembert-base")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


traintokenized_datasets = train_dataset.map(tokenize_function, batched=True)

valtokenized_datasets = val_dataset.map(tokenize_function, batched=True)

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Map:   0%|          | 0/3840 [00:00<?, ? examples/s]

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
from transformers import AutoModelForSequenceClassification
bmodel = AutoModelForSequenceClassification.from_pretrained("camembert-base", num_labels=6)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import TrainingArguments
training_args = TrainingArguments(output_dir="test_trainer")

In [9]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [10]:
import numpy as np
import evaluate
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

# Training Setup for Language Model with Custom Metrics

In this part of the notebook, we set up the training configuration for our language model using the Hugging Face `transformers` library. This involves defining a custom metric for evaluation and configuring the training arguments.

## Defining the Compute Metrics Function
- We define a function named `compute_metrics` for evaluating our model.
- This function takes `eval_pred` as input, which contains the model's logits and the true labels.
- It calculates the predictions by applying the `np.argmax` function to the logits, effectively choosing the most likely class.
- Finally, the function returns the computed metrics, which will be based on the predictions and the true labels.

## Setting Up Training Arguments
- We use the `TrainingArguments` class to specify our training configuration.
- `output_dir="test_trainer"`: Specifies the directory where the training outputs (like model checkpoints) will be saved.
- `num_train_epochs=19`: Sets the number of training epochs to 19.
- `evaluation_strategy="epoch"`: Determines that evaluation will be performed at the end of each epoch.

## Initializing the Trainer
- The `Trainer` class is initialized with several key components:
    - `model=bmodel`: The model to be trained, represented here as `bmodel`.
    - `args=training_args`: The training arguments defined earlier.
    - `train_dataset=traintokenized_datasets`: The tokenized training dataset.
    - `eval_dataset=valtokenized_datasets`: The tokenized validation dataset.
    - `compute_metrics=compute_metrics`: The custom metric computation function defined earlier.

This setup is crucial for training our model effectively, ensuring that it is evaluated correctly at each epoch and that the results are stored for further analysis.


In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer",num_train_epochs=19, evaluation_strategy="epoch")
trainer = Trainer(
    model=bmodel,
    args=training_args,
    train_dataset=traintokenized_datasets,
    eval_dataset=valtokenized_datasets,
    compute_metrics=compute_metrics,

)

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.132426,0.513542
2,1.299000,1.160741,0.4875


TrainOutput(global_step=960, training_loss=1.1244698206583659, metrics={'train_runtime': 768.7389, 'train_samples_per_second': 9.99, 'train_steps_per_second': 1.249, 'total_flos': 2020765477109760.0, 'train_loss': 1.1244698206583659, 'epoch': 2.0})

# Model Saving, Loading and Feature Extraction

In this section of the notebook, we focus on saving the trained model, reloading it for further use, and extracting additional linguistic features from the text data.

## Saving the Trained Model
- We use the `save_model` method of the `Trainer` object to save the fine-tuned model.
- The model is saved to the directory `/content/cambert_french_finetuned/`.
- Saving the model allows us to reuse it later without needing to retrain it.

## Loading the Saved Model
- We use `AutoModelForSequenceClassification` from Hugging Face's `transformers` library to load the saved model.
- The model is loaded from the previously saved directory.
- `num_labels=6` specifies that the model is used for a classification task with 6 different classes.

## Feature Extraction: Part-of-Speech Tagging
- We define a function `get_pos_tags` to extract Part-of-Speech (POS) tags from the text.
- This function uses the `spacy` library to process the text and return POS tags for each token.
- The `get_pos_tags` function is applied to the 'text' column of our DataFrame, creating a new column `pos_tags` that contains the POS tags for each text entry.

## Creating One-Hot Encoded Vectors for POS Tags
- We gather all unique POS tags present in our dataset.
- A one-hot encoding approach is used to represent these POS tags numerically.
- This process involves creating vectors where each vector corresponds to a text entry.
- In each vector, a '1' represents the presence of a specific POS tag, and a '0' represents its absence.
- These one-hot encoded vectors provide a useful feature set for machine learning tasks, enabling models to understand the syntactic structure of the text.

This combination of model saving/loading and feature extraction adds a layer of sophistication to our text processing pipeline, enhancing our model's capabilities for understanding and classifying French text.


In [18]:
trainer.save_model("/content/cambert_french_finetuned/")
from transformers import AutoModelForSequenceClassification

bmodel = AutoModelForSequenceClassification.from_pretrained("/content/cambert_french_finetuned/", num_labels=6)
def get_pos_tags(text):
    doc = nlp(text)
    return [token.pos_ for token in doc]

# Apply the function to create a new column with POS tags
data['pos_tags'] = data['text'].apply(get_pos_tags)

# Get unique POS tags in the entire DataFrame
all_pos_tags = set(tag for tags in data['pos_tags'] for tag in tags)

# Create a one-hot encoded matrix for POS tags
one_hot_vectors = []
for tags in data['pos_tags']:
    vector = [1 if pos_tag in tags else 0 for pos_tag in all_pos_tags]
    one_hot_vectors.append(vector)


In [19]:

# Convert the list of one-hot vectors into a DataFrame
pos_tags_df = pd.DataFrame(one_hot_vectors, columns=list(all_pos_tags))

# Concatenate the POS tags DataFrame with your original DataFrame
df = pd.concat([data, pos_tags_df], axis=1)


In [20]:
bmodel.cuda()

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tru

# Generating BERT Embeddings for Text Analysis

This section of the notebook demonstrates how to generate BERT embeddings for text data, an essential step in leveraging the power of pre-trained language models for advanced text analysis.

## Function to Get BERT Embeddings
- We define a function `get_bert_embeddings` which takes a text input and returns its BERT embeddings.
- The function uses the `tokenizer` to tokenize and encode the text input.
    - `return_tensors='pt'` specifies that the output will be PyTorch tensors.
    - Padding and truncation are applied to handle texts of varying lengths.
- The encoded input is then moved to a CUDA device (GPU) for faster processing.
- The model (`bmodel`) processes the encoded text without gradient calculations (`torch.no_grad()`) for efficiency.
- The function extracts the last hidden states from the model's output and applies mean pooling.
- The pooled output is then moved back to the CPU and converted to a NumPy array for compatibility with other Python libraries like scikit-learn.

## Batch Processing of Text Data
- We divide our DataFrame's text data into batches of size 8 using list comprehension.
- For each batch, the `get_bert_embeddings` function is called to generate embeddings.
- These embeddings are collected in a list.

## Concatenating Embeddings
- After processing all batches, we concatenate the embeddings into a single NumPy array.
- This array can be used as input for various machine learning tasks, providing a rich representation of the text data.

## Assigning Embeddings to DataFrame
- The final embeddings can be assigned to the DataFrame for further analysis or modeling.
- This step is currently commented out but can be activated by removing the comment mark.

By converting text data into BERT embeddings, we can capture the contextual nuances of language, which enhances the performance of machine learning models on tasks like classification, sentiment analysis, or feature extraction.


In [22]:
import torch
def get_bert_embeddings(text):
    # Tokenize and encode the text
    encoded_input =tokenizer(text, return_tensors='pt',padding="max_length", max_length=512,truncation=True)

    # Move encoded input to the device
    encoded_input = {key: value.to('cuda') for key, value in encoded_input.items()}

    # Get model output and extract the last hidden states
    with torch.no_grad():
        output = bmodel(**encoded_input,output_hidden_states=True)
        # print(output.keys())
    # Mean pooling
    return output.hidden_states[-1].mean(dim=1).squeeze().cpu().numpy()  # Move to CPU for compatibility with scikit-learn

batch_size=8
text_batches = [df['text'][i:i + batch_size] for i in range(0, len(df), batch_size)]

# Process batches and concatenate the results
embeddings = []
for batch in text_batches:

    embeddings.append(get_bert_embeddings(list(batch)))

# Concatenate the embeddings
embeddings = np.concatenate(embeddings)

# Assign the embeddings to the DataFrame
# df['embed'] = embeddings

In [23]:
flattened_embeddings = embeddings.reshape((len(df), -1))

# Assign the embeddings to the DataFrame
for i in range(flattened_embeddings.shape[1]):
    df[f'embed_{i}'] = flattened_embeddings[:, i]

  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embedding

In [24]:
df.head()

Unnamed: 0,text,labels,pos_tags,SCONJ,DET,ADJ,AUX,SPACE,ADP,NUM,...,embed_758,embed_759,embed_760,embed_761,embed_762,embed_763,embed_764,embed_765,embed_766,embed_767
0,les coûts kilométriques réels peuvent diverger...,4,"[DET, NOUN, ADJ, ADJ, VERB, VERB, ADV, ADP, NO...",0,1,1,0,1,1,0,...,0.005726,0.039322,0.181342,-0.044234,-0.055682,0.059646,-0.015783,0.009477,0.088292,-0.067678
1,le bleu c est ma couleur préférée mais je n a...,0,"[DET, NOUN, SPACE, NOUN, AUX, DET, NOUN, VERB,...",0,1,0,1,1,0,0,...,0.02994,0.082247,-0.003615,0.164349,-0.091955,0.004219,0.170935,-0.051858,-0.001645,-0.059087
2,le test de niveau en français est sur le site ...,0,"[DET, NOUN, ADP, NOUN, ADP, NOUN, VERB, ADP, D...",0,1,0,0,0,1,0,...,0.043831,0.080496,-0.01424,0.158142,-0.03404,-0.093645,0.092579,-0.008584,0.054116,-0.055213
3,est ce que ton mari est aussi de boston,0,"[AUX, PRON, SCONJ, NOUN, NOUN, VERB, ADV, ADP,...",1,0,0,1,0,1,0,...,0.022245,0.095056,0.03593,0.153097,-0.100794,0.005093,0.158731,-0.061195,-0.000543,-0.077103
4,dans les écoles de commerce dans les couloirs...,2,"[ADP, DET, NOUN, ADP, NOUN, SPACE, ADP, DET, N...",0,1,1,0,1,1,0,...,-0.099563,-0.117106,-0.050194,0.019643,0.200286,0.01051,-0.151685,-0.022228,-0.099599,0.15301


In [25]:
# Sentence complexity features
def sentence_features(text):
    words = text.split()
    return len(words), sum(len(word) for word in words) / len(words) if words else 0

df['num_words'], df['avg_word_length'] = zip(*df['text'].apply(sentence_features))
# Convert numerical features to string

  df['num_words'], df['avg_word_length'] = zip(*df['text'].apply(sentence_features))
  df['num_words'], df['avg_word_length'] = zip(*df['text'].apply(sentence_features))


In [26]:
df.head()

Unnamed: 0,text,labels,pos_tags,SCONJ,DET,ADJ,AUX,SPACE,ADP,NUM,...,embed_760,embed_761,embed_762,embed_763,embed_764,embed_765,embed_766,embed_767,num_words,avg_word_length
0,les coûts kilométriques réels peuvent diverger...,4,"[DET, NOUN, ADJ, ADJ, VERB, VERB, ADV, ADP, NO...",0,1,1,0,1,1,0,...,0.181342,-0.044234,-0.055682,0.059646,-0.015783,0.009477,0.088292,-0.067678,40,5.25
1,le bleu c est ma couleur préférée mais je n a...,0,"[DET, NOUN, SPACE, NOUN, AUX, DET, NOUN, VERB,...",0,1,0,1,1,0,0,...,-0.003615,0.164349,-0.091955,0.004219,0.170935,-0.051858,-0.001645,-0.059087,14,3.357143
2,le test de niveau en français est sur le site ...,0,"[DET, NOUN, ADP, NOUN, ADP, NOUN, VERB, ADP, D...",0,1,0,0,0,1,0,...,-0.01424,0.158142,-0.03404,-0.093645,0.092579,-0.008584,0.054116,-0.055213,14,3.714286
3,est ce que ton mari est aussi de boston,0,"[AUX, PRON, SCONJ, NOUN, NOUN, VERB, ADV, ADP,...",1,0,0,1,0,1,0,...,0.03593,0.153097,-0.100794,0.005093,0.158731,-0.061195,-0.000543,-0.077103,9,3.444444
4,dans les écoles de commerce dans les couloirs...,2,"[ADP, DET, NOUN, ADP, NOUN, SPACE, ADP, DET, N...",0,1,1,0,1,1,0,...,-0.050194,0.019643,0.200286,0.01051,-0.151685,-0.022228,-0.099599,0.15301,36,4.527778


In [27]:
X=df.drop(['text','pos_tags','labels'],axis=1)
y=df['labels']

In [None]:
X.head()

Unnamed: 0,PRON,NUM,ADV,NOUN,ADP,PUNCT,CCONJ,ADJ,PROPN,SCONJ,...,embed_760,embed_761,embed_762,embed_763,embed_764,embed_765,embed_766,embed_767,num_words,avg_word_length
0,0,0,1,1,1,1,1,1,0,0,...,-0.045885,-0.047388,-0.173737,-0.153386,-0.046336,0.100206,-0.222565,0.11575,38,5.736842
1,1,0,1,1,0,1,1,0,1,0,...,-0.215935,0.330025,0.270201,-0.007197,0.046525,-0.050677,-0.007611,0.120119,12,4.25
2,0,0,0,1,1,1,0,0,0,0,...,-0.24028,0.313979,0.246006,-0.045984,0.03535,-0.048163,-0.028361,0.09254,13,4.153846
3,1,0,1,1,1,1,0,0,1,1,...,-0.210785,0.302628,0.280659,-0.038862,0.044026,-0.051162,-0.008161,0.122888,8,4.125
4,1,1,1,1,1,1,1,1,0,0,...,0.169169,0.033796,0.200843,0.081495,0.103755,-0.025925,0.183891,-0.027395,34,5.176471


In [28]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# prompt: min max scaling of X_train

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [30]:
import pandas as pd
import numpy as np
import re
import torch
import spacy
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.svm import SVC


In [31]:
X_train_r = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_r = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Building and Training a Convolutional Neural Network with Keras

This section demonstrates the process of building a convolutional neural network (CNN) for text classification using the Keras library. We also set up callbacks for effective training.

## Importing Keras Modules
- We import various classes from Keras, including layers for building the model and tools for compiling and training it.

## Setting Up Callbacks
- `EarlyStopping`: Stops training when the accuracy metric (`acc`) stops improving, with a patience of 20 epochs.
- `ModelCheckpoint`: Saves the best model based on the validation loss (`val_loss`) to the file `movie_sentiment_m1.h5`.
- `ReduceLROnPlateau`: Reduces the learning rate when a metric has stopped improving, with a factor of 0.1 and a patience of 1 epoch.

## Building the Model
- The model is built using the Keras Functional API.
- `Input` layer: Specifies the shape of the input data.
- `Conv1D` layers: 1D convolutions with 256 filters and a kernel size of 3, using ReLU activation.
- `MaxPooling1D` layers: Reduce the dimensionality of the output from the Conv1D layers.
- `GlobalMaxPooling1D`: Applies global max pooling to the final convolutional layer.
- `Dense` layers: Fully connected layers for classification. The last Dense layer uses softmax activation for multi-class classification.
- The model architecture is summarized using `model.summary()`.

## Compiling the Model
- The model is compiled with the RMSprop optimizer, a learning rate of 0.001, categorical crossentropy loss, and accuracy as the metric.

## Training Configuration
- The training process will utilize the callbacks defined earlier for early stopping, model checkpointing, and learning rate reduction.

The model constructed here is specifically designed for text classification tasks, leveraging the strengths of convolutional layers for feature extraction from text data. This setup is suitable for multi-class classification problems, like sentiment analysis.


In [32]:
import keras
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras import Model, layers
from keras import Input

from keras.optimizers import RMSprop

callback_list = [
    keras.callbacks.EarlyStopping(
        patience=20,
        monitor='acc',
    ),

    keras.callbacks.ModelCheckpoint(
        monitor='val_loss',
        save_best_only=True,
        filepath='model/movie_sentiment_m1.h5',
    ),

    keras.callbacks.ReduceLROnPlateau(
        patience=1,
        factor=0.1,
    )
]

# layer developing
text_input_layer = Input(shape=(X_train_r.shape[1],X_train_r.shape[2],))
# embedding_layer = Embedding(X_train.shape[1], )(text_input_layer)
text_layer = Conv1D(256, 3, activation='relu')(text_input_layer)
text_layer = MaxPooling1D(3)(text_layer)
text_layer = Conv1D(256, 3, activation='relu')(text_layer)
text_layer = MaxPooling1D(3)(text_layer)
text_layer = Conv1D(256, 3, activation='relu')(text_layer)
text_layer = MaxPooling1D(3)(text_layer)
text_layer = Conv1D(256, 3, activation='relu')(text_layer)
text_layer = MaxPooling1D(3)(text_layer)
text_layer = Conv1D(256, 3, activation='relu')(text_layer)
text_layer = MaxPooling1D(3)(text_layer)
text_layer = GlobalMaxPooling1D()(text_layer)
text_layer = Dense(256, activation='relu')(text_layer)
output_layer = Dense(6, activation='softmax')(text_layer)
model = Model(text_input_layer, output_layer)
model.summary()
model.compile(optimizer=RMSprop(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['acc'])

# multi-input test

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 787, 1)]          0         
                                                                 
 conv1d (Conv1D)             (None, 785, 256)          1024      
                                                                 
 max_pooling1d (MaxPooling1  (None, 261, 256)          0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 259, 256)          196864    
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 86, 256)           0         
 g1D)                                                            
                                                                 
 conv1d_2 (Conv1D)           (None, 84, 256)           196864

In [33]:
from sklearn.preprocessing import OneHotEncoder

# Assuming pos_tags is a list of POS tags for multiple sentences
encoder = OneHotEncoder(sparse=False)
y_train=encoder.fit_transform(y_train.values.reshape(-1,1))

y_test=encoder.transform(y_test.values.reshape(-1,1))



In [34]:

history = model.fit(X_train_r, y_train, epochs=50, batch_size=32, callbacks=callback_list,
                    validation_data=(X_test_r, y_test))


Epoch 1/5
Epoch 2/5
  1/120 [..............................] - ETA: 1s - loss: 1.3848 - acc: 0.2812

  saving_api.save_model(


Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
# Model evaluation
y_pred = model.predict(X_test_r)
print(classification_report(encoder.inverse_transform(y_test),encoder.inverse_transform( y_pred)))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       166
           1       0.64      0.55      0.59       158
           2       0.64      0.72      0.68       166
           3       0.71      0.39      0.50       153
           4       0.50      0.69      0.58       152
           5       0.75      0.77      0.76       165

    accuracy                           0.67       960
   macro avg       0.68      0.66      0.66       960
weighted avg       0.68      0.67      0.66       960



In [36]:
model.save('my_model.keras')

In [37]:
df=pd.read_csv('https://raw.githubusercontent.com/DalipiDenis/assign/main/unlabelled_test_data.csv')

In [38]:
df = df.rename(columns={'sentence': 'text', 'difficulty': 'labels'})
df['text'] = df['text'].apply(initial_clean)


# Extracting and Encoding Part-of-Speech Tags

This section of the notebook demonstrates the process of extracting Part-of-Speech (POS) tags from text data and encoding these tags in a one-hot encoded format, which is a common approach in natural language processing (NLP).

## Defining the POS Tag Extraction Function
- A function `get_pos_tags` is defined to extract POS tags from a given text.
- The function utilizes the `spacy` NLP library to process the text.
- For each token in the text, the function retrieves its POS tag.
- The output is a list of POS tags corresponding to each token in the input text.

## Applying the Function to DataFrame
- The `get_pos_tags` function is applied to each row of the 'text' column in the DataFrame `df`.
- This creates a new column, `pos_tags`, in `df`, where each row contains the POS tags for the corresponding text.

## Extracting Unique POS Tags
- We extract all unique POS tags present across the entire DataFrame.
- This is achieved by creating a set of tags, aggregating tags from all rows in the `pos_tags` column.

## Creating One-Hot Encoded Vectors
- A one-hot encoding approach is used to numerically represent the POS tags.
- For each set of tags in `df['pos_tags']`, a vector is created.
- In each vector, '1' indicates the presence and '0' indicates the absence of a particular POS tag from the set of all unique POS tags.
- These vectors are appended to the list `one_hot_vectors`.

This process of extracting and encoding POS tags is valuable in NLP applications as it converts textual data into a numerical format that can be used for various machine learning models, providing syntactic information about the text.


In [39]:
def get_pos_tags(text):
    doc = nlp(text)
    return [token.pos_ for token in doc]

# Apply the function to create a new column with POS tags
df['pos_tags'] = df['text'].apply(get_pos_tags)

# Get unique POS tags in the entire DataFrame
all_pos_tags = set(tag for tags in data['pos_tags'] for tag in tags)

# Create a one-hot encoded matrix for POS tags
one_hot_vectors = []
for tags in df['pos_tags']:
    vector = [1 if pos_tag in tags else 0 for pos_tag in all_pos_tags]
    one_hot_vectors.append(vector)


In [None]:
all_pos_tags

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'SPACE',
 'SYM',
 'VERB',
 'X'}

# Integrating POS Tags and BERT Embeddings into DataFrame

In this part of the notebook, we focus on incorporating the one-hot encoded Part-of-Speech (POS) tags and BERT embeddings into our main DataFrame for enhanced text analysis and feature representation.

## Converting One-Hot Vectors to DataFrame
- We convert the list of one-hot vectors (`one_hot_vectors`) into a pandas DataFrame, `pos_tags_df`.
- The columns of this DataFrame are named after the unique POS tags.
- This conversion facilitates easier manipulation and integration of the POS tag data.

## Concatenating POS Tags DataFrame with Original DataFrame
- The `pos_tags_df` is concatenated with the original DataFrame `df`.
- The concatenation is done along the columns (`axis=1`), adding the POS tag data as new columns.

## Batch Processing for BERT Embeddings
- We divide the text data from `df` into batches of size 8.
- For each batch, we generate BERT embeddings using the previously defined `get_bert_embeddings` function.
- These embeddings are collected in a list.

## Concatenating and Flattening Embeddings
- After processing all text batches, the embeddings are concatenated into a single array.
- The concatenated embeddings are then reshaped (flattened) to match the number of rows in `df`.

## Assigning Flattened Embeddings to DataFrame
- Each element of the flattened embeddings array is assigned to a new column in `df`.
- These new columns are named `embed_0`, `embed_1`, etc., representing each dimension of the embeddings.

By integrating both the POS tags and BERT embeddings into the DataFrame, we enrich our dataset with both syntactic and semantic features. This comprehensive feature set is crucial for advanced text analysis and machine learning tasks, enabling more nuanced and accurate modeling.


In [42]:

# Convert the list of one-hot vectors into a DataFrame
pos_tags_df = pd.DataFrame(one_hot_vectors, columns=list(all_pos_tags))

# Concatenate the POS tags DataFrame with your original DataFrame
df = pd.concat([df, pos_tags_df], axis=1)



batch_size=8

text_batches = [df['text'][i:i + batch_size] for i in range(0, len(df), batch_size)]

# Process batches and concatenate the results
embeddings = []
for batch in text_batches:

    embeddings.append(get_bert_embeddings(list(batch)))

# Concatenate the embeddings
embeddings = np.concatenate(embeddings)

# Assign the embeddings to the DataFrame
# df['embed'] = embeddings

flattened_embeddings = embeddings.reshape((len(df), -1))

# Assign the embeddings to the DataFrame
for i in range(flattened_embeddings.shape[1]):
    df[f'embed_{i}'] = flattened_embeddings[:, i]

  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embeddings[:, i]
  df[f'embed_{i}'] = flattened_embedding

In [43]:
# Sentence complexity features
def sentence_features(text):
    words = text.split()
    return len(words), sum(len(word) for word in words) / len(words) if words else 0

df['num_words'], df['avg_word_length'] = zip(*df['text'].apply(sentence_features))



  df['num_words'], df['avg_word_length'] = zip(*df['text'].apply(sentence_features))
  df['num_words'], df['avg_word_length'] = zip(*df['text'].apply(sentence_features))


In [None]:
df.columns

Index(['id', 'text', 'embed_0', 'embed_1', 'embed_2', 'embed_3', 'embed_4',
       'embed_5', 'embed_6', 'embed_7',
       ...
       'embed_762', 'embed_763', 'embed_764', 'embed_765', 'embed_766',
       'embed_767', 'num_words', 'avg_word_length', 'num_words_str',
       'avg_word_length_str'],
      dtype='object', length=774)

In [45]:
X=df.drop(['text','pos_tags','id'],axis=1)


In [46]:
X.columns


Index(['SCONJ', 'DET', 'ADJ', 'AUX', 'SPACE', 'ADP', 'NUM', 'NOUN', 'SYM',
       'ADV',
       ...
       'embed_760', 'embed_761', 'embed_762', 'embed_763', 'embed_764',
       'embed_765', 'embed_766', 'embed_767', 'num_words', 'avg_word_length'],
      dtype='object', length=787)

In [47]:
X_val = scaler.transform(X)
X_val_r = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))

In [48]:
y_val = model.predict(X_val_r)



In [None]:
y_val

array([5, 2, 2, ..., 5, 3, 3])

In [49]:
import joblib
joblib.dump(scaler, 'minmax_scaler.pkl')

['minmax_scaler.pkl']

In [50]:
submit_df=pd.read_csv('https://raw.githubusercontent.com/DalipiDenis/assign/main/unlabelled_test_data.csv')

In [51]:
submit_df['difficulty']=LE.inverse_transform(encoder.inverse_transform(y_val))

  y = column_or_1d(y, warn=True)


In [52]:
joblib.dump(encoder, 'one_hot_encoder.pkl')
joblib.dump(LE, 'label_encoder.pkl')


['label_encoder.pkl']

In [53]:
encoder.inverse_transform(y_val)

array([[5],
       [2],
       [2],
       ...,
       [5],
       [4],
       [3]])

In [54]:
submit_df

Unnamed: 0,id,sentence,difficulty
0,0,Nous dûmes nous excuser des propos que nous eû...,C2
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...,B1
2,2,"Et, paradoxalement, boire froid n'est pas la b...",B1
3,3,"Ce n'est pas étonnant, car c'est une saison my...",B1
4,4,"Le corps de Golo lui-même, d'une essence aussi...",C2
...,...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...,B1
1196,1196,Je vais parler au serveur et voir si on peut d...,A2
1197,1197,Il n'était pas comme tant de gens qui par pare...,C2
1198,1198,Ils deviennent dangereux pour notre économie.,C1


In [55]:
submit_df.drop(['sentence'],axis=1,inplace=True)
submit_df.to_csv('submission_3.csv',index=False)