In [54]:
!pip install transformers
!pip install spacy
!pip install --upgrade scikit-learn
!pip install nltk



**GET THE LIBRAIRIES**

In [22]:
# Data processing
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

from tensorflow.keras import preprocessing as kprocessing
from tensorflow.keras import models, layers, optimizers

import transformers

# Visualization
from matplotlib import pyplot as plt

# Text processing
import re
import nltk
import gensim.downloader as api

# Various
from datetime import datetime

**GET THE DATA**

In [23]:
# download the data from the given URLs and store them in dataframes
sample_submission_df = pd.read_csv('sample_submission.csv')
training_data_df = pd.read_csv('training_data.csv')
unlabelled_test_data_df = pd.read_csv('unlabelled_test_data.csv')

training_data_df.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [24]:
print('Total number of news: {}'.format(len(training_data_df)))
print(40*'-')
print('Split by category:')
print(training_data_df["difficulty"].value_counts())
print(40*'-')
nr_categories = len(training_data_df["difficulty"].unique())
print("Number of categories: {n}".format(n=nr_categories))

Total number of news: 4800
----------------------------------------
Split by category:
difficulty
A1    813
C2    807
C1    798
B1    795
A2    795
B2    792
Name: count, dtype: int64
----------------------------------------
Number of categories: 6


In [25]:
# You can adjust n:
n=100
print('Category: ',training_data_df["difficulty"][n])
print(100*'-')
print('Text:')
print(training_data_df['sentence'][n])

Category:  B2
----------------------------------------------------------------------------------------------------
Text:
Que vous cherchiez une simple annonce ou un recruteur, vous êtes sûr de ne rater aucune offre d'emploi grâce à notre système d'alerte personnalisé.


In [27]:
import spacy


# Download French stopwords from NLTK
nltk.download('stopwords')
lst_stopwords = nltk.corpus.stopwords.words("french")

# Load French tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("fr_core_news_sm")

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    # Clean text
    text = str(text).lower()
    text = text.strip()
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize
    lst_text = text.split()

    # Remove stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]

    # Lemmatization with spaCy (if flag is true)
    if flg_lemm == True:
        lst_text = [token.lemma_ for token in nlp(" ".join(lst_text))]

    # Back to string from list
    text = " ".join(lst_text)
    return text

# Apply the function to your DataFrame
training_data_df["text_clean"] = training_data_df["sentence"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))

# Display the DataFrame
training_data_df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dimitriroulin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,sentence,difficulty,text_clean
0,0,Les coûts kilométriques réels peuvent diverger...,C1,coût kilométrique réel pouvoir diverger sensib...
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,bleu cest couleur préférer naim vert
2,2,Le test de niveau en français est sur le site ...,A1,test niveau français site internet lécole
3,3,Est-ce que ton mari est aussi de Boston?,A1,estce mari aussi boston
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,école commerc couloir place financier arriver ...


In [44]:
# Renaming, Input -> X, Output -> y
X = training_data_df['text_clean']
y = training_data_df['difficulty']
# Split into Training and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=training_data_df['difficulty'])

In [45]:
corpus = X_train
# Initizalize the vectorizer with max nr words and ngrams (1: single words, 2: two words in a row)
vectorizer_tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1,2))
# Fit the vectorizer to the training data
vectorizer_tfidf.fit(corpus)
TfidfVectorizer(max_features=15000, ngram_range=(1, 2))

In [49]:
classifier_tfidf = LogisticRegression()
model_tfidf = Pipeline([("vectorizer", vectorizer_tfidf), ("classifier", classifier_tfidf)])

start_time = datetime.now()
model_tfidf.fit(X_train, y_train)
end_time = datetime.now()

training_time_tfidf = (end_time - start_time).total_seconds()

In [50]:
predicted_train_tfidf = model_tfidf.predict(X_train)
accuracy_train_tfidf = accuracy_score(y_train, predicted_train_tfidf)
print('Accuracy Training data: {:.1%}'.format(accuracy_train_tfidf))

predicted_test_tfidf = model_tfidf.predict(X_test)
accuracy_test_tfidf = accuracy_score(y_test, predicted_test_tfidf)
accuracy_tfidf = accuracy_test_tfidf
print('Accuracy Test data: {:.1%}'.format(accuracy_test_tfidf))

print('Training time: {:.1f}s'.format(training_time_tfidf))

Accuracy Training data: 95.1%
Accuracy Test data: 40.8%
Training time: 0.6s


In [51]:
print('Classes of the model: ',classifier_tfidf.classes_)
print(80*'-')
print('Shape of the coefficients of the model (categories x vocabulary size): ',classifier_tfidf.coef_.shape)
print(80*'-')
NN = 10
# Get the 10 (here: NN, which you can adjust yourself) ids of the words with highest weights per category
top_words = np.argsort(classifier_tfidf.coef_,axis=1)[:,-NN:]

# Get the vocabulary of the model (mapping of words to ids):
voc = vectorizer_tfidf.vocabulary_
# Get the inverse vocabulary to map the ids of the words to the words:
inv_voc = {v: k for k, v in voc.items()}

# Get for each category (=class) the top ten words
for n, w in enumerate(classifier_tfidf.classes_):
    t = w + ': '
    for i in range(NN):
        t += inv_voc[top_words[n,i]]
        if i!=NN:
            t+=', '
    print(t)
    print(80*'-')

Classes of the model:  ['A1' 'A2' 'B1' 'B2' 'C1' 'C2']
--------------------------------------------------------------------------------
Shape of the coefficients of the model (categories x vocabulary size):  (6, 15000)
--------------------------------------------------------------------------------
A1: mappelle, jaim, mange, estce, jaime, aller, cest, vai, bonjour, merci, 
--------------------------------------------------------------------------------
A2: avant, serveur, fai, venir, lécole, faire, dire, aller, jai, cest, 
--------------------------------------------------------------------------------
B1: trop, france, quoi, ville, exemple, faire, prince, savoir, avoir, quil, 
--------------------------------------------------------------------------------
B2: communication, vers, journaliste, rendre, chaussure, capacité, électronique, forêt, portable, déchet, 
--------------------------------------------------------------------------------
C1: édition, fabrice, vie, an, population, 0

In [56]:
import nltk
nltk.download('stopwords')

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Get French stopwords from NLTK
from nltk.corpus import stopwords
french_stopwords = stopwords.words('french')

# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=french_stopwords)),  # Use the custom stopwords list
    ('clf', MultinomialNB()),
])

# Define the parameter space for grid search
parameters = {
    'tfidf__max_df': (0.5, 0.75, 1.0),
    'tfidf__max_features': (None, 5000, 10000),
    'clf__alpha': (0.1, 1, 10),
}

# Perform grid search on the classifier using the defined parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    # Get the best model from the grid search
    best_model = grid_search.best_estimator_

    # Predict the labels of the test set using the best model
    y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy of the model: {:.1%}".format(accuracy))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dimitriroulin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters set:
	clf__alpha: 0.1
Accuracy of the model: 43.7%
	tfidf__max_df: 0.5
Accuracy of the model: 43.7%
	tfidf__max_features: None
Accuracy of the model: 43.7%


In [58]:
from sklearn.svm import SVC

# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=french_stopwords)),  # Use the custom stopwords list
    ('clf', SVC()),  # Use SVC instead of MultinomialNB
])

# Define the parameter space for grid search
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__max_features': (None, 5000, 10000, 20000),
    'clf__C': (0.1, 1, 10),  # Add parameters for SVC
    'clf__kernel': ('linear', 'rbf'),  # Add parameters for SVC
}

# Perform grid search on the classifier using the defined parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

 # Calculate the accuracy of the model on the training set
y_train_pred = best_model.predict(X_train)
accuracy_train = accuracy_score(y_train, y_train_pred)

# Calculate the accuracy of the model on the test set
y_test_pred = best_model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)

print("Accuracy on the training set: {:.1%}".format(accuracy_train))
print("Accuracy on the test set: {:.1%}".format(accuracy_test))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters set:
	clf__C: 10
	clf__kernel: 'rbf'
	tfidf__max_df: 0.25
	tfidf__max_features: None
Accuracy on the training set: 96.0%
Accuracy on the test set: 43.7%


In [62]:
# Renaming, Input -> X, Output -> y
X_tf = training_data_df['sentence']
y_tf_class = training_data_df['difficulty']

# Convert labels into a one-hot vector of size 5 (the number of distinct labels)
lab = LabelBinarizer()
lab.fit(y_tf_class)
y_tf = lab.transform(y_tf_class)

# Example (you can modify n)
n=100
print('Coding of labels into a one-hot vector: ' + y_tf_class[n] + ' is ', y_tf[n])

# Split into training and test data
X_tf_train, X_tf_test, y_tf_train, y_tf_test = train_test_split(X_tf, y_tf, test_size=0.3, random_state=42, stratify=training_data_df['difficulty'])


Coding of labels into a one-hot vector: B2 is  [0 0 0 1 0 0]


In [63]:
corpus = X_tf_train
max_words = 15000
tokenizer = kprocessing.text.Tokenizer(lower=True, split=' ', num_words=max_words, oov_token="<pad>", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(corpus)
voc = tokenizer.word_index
reverse_voc = dict([(value, key) for (key, value) in voc.items()])

In [64]:
max_len = 200

sequences = tokenizer.texts_to_sequences(X_tf_train)
X_tf_train_seq = kprocessing.sequence.pad_sequences(sequences, maxlen=max_len)

# Apply the same to test data
X_tf_test_seq = kprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(X_tf_test), maxlen=max_len)

n=10 # You can adjust n
print('Shape: ',X_tf_train_seq.shape)
print(100*'-')
print('Example: ',X_tf_train_seq[n,:])

Shape:  (3360, 200)
----------------------------------------------------------------------------------------------------
Example:  [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  