### Context

For text classification with difficulty, we are going to train our model with the logistic regression classification. But, we are going to set different parameters to define which model give the best accuracy.

In [2]:
#Import the packages we need
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns
import spacy
from spacy import displacy
sns.set_style("whitegrid")

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn. preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

In [3]:
#Load the dataset using google drive mount (inspired by the challenge notebook)
# mount your Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# install Kaggle
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/Coding_Challenge/kaggle.json ~/.kaggle/kaggle.json

cp: cannot create regular file '/root/.kaggle/kaggle.json': No such file or directory


In [6]:
# download the dataset from the competition page
! kaggle competitions download -c detecting-french-texts-difficulty-level-2022

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.8/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.8/dist-packages/kaggle/api/kaggle_api_extended.py", line 164, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [7]:
#let's load the dataset

df = pd.read_csv('drive/MyDrive/Coding_Challenge/training_data.csv')

In [8]:
df_pred = pd.read_csv('drive/MyDrive/Coding_Challenge/unlabelled_test_data.csv')

In [9]:
#splitting the data
X = df['sentence']
y = df['difficulty']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [10]:
#Define a model AND apply vectorizer
LR = LogisticRegression(penalty='l2', dual=False, tol=0.001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='newton-cg', max_iter=1000, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)

In [11]:
# Using default tokenizer in TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,1))
# Create a pipeline
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', LR)])

# Fit model on training set
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier',
                 LogisticRegression(max_iter=1000, solver='newton-cg',
                                    tol=0.001))])

In [12]:
#function to calculate metrics of the models
def models_metrics (true, pred):
  precision = precision_score(true, pred, average='weighted')
  recall = recall_score(true, pred, average='weighted')
  f1 = f1_score(true, pred, average='weighted')
  print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred,)}")
  print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
  print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")

In [13]:
#predict on test split
y_pred_lr = pipe.predict(X_test)

In [14]:
#get the metrics
models_metrics(y_test,y_pred_lr)

CONFUSION MATRIX:
[[93 31 21 10  4  2]
 [54 60 30  6  6  8]
 [12 38 64 17  9 20]
 [ 6  6 15 66 27 24]
 [ 4  4 10 37 73 45]
 [ 7  8  8 19 24 92]]
ACCURACY SCORE:
0.4667
CLASSIFICATION REPORT:
	Precision: 0.4656
	Recall: 0.4667
	F1_Score: 0.4640


Now, we are going to try different tokenization method, with different models

In [15]:
import nltk

In [16]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Import from NLTK

In [17]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import re

French language

In [18]:
french_stopwords = set(stopwords.words('french'))
filtre_stopfr =  lambda text: [token for token in text if token.lower() not in french_stopwords]

In [19]:
filtre_stopfr( word_tokenize(X.loc[0], language="french") )

['coûts',
 'kilométriques',
 'réels',
 'peuvent',
 'diverger',
 'sensiblement',
 'valeurs',
 'moyennes',
 'fonction',
 'moyen',
 'transport',
 'utilisé',
 ',',
 'taux',
 "d'occupation",
 'taux',
 'remplissage',
 ',',
 "l'infrastructure",
 'utilisée',
 ',',
 'topographie',
 'lignes',
 ',',
 'flux',
 'trafic',
 ',',
 'etc',
 '.']

In [20]:
sp_pattern = re.compile( """[\.\!\"\s\?\-\,\']+""", re.M).split
sp_pattern(X.loc[25])

['Qu',
 'est',
 'ce',
 'qui',
 'peut',
 'bien',
 'poser',
 'problème',
 'dans',
 'ces',
 'petits',
 'livrets',
 'pourtant',
 'plutôt',
 'attractifs',
 'pour',
 'que',
 'seuls',
 '23',
 '4',
 '%',
 'de',
 'leurs',
 'détenteurs',
 'arrivent',
 'au',
 'bout',
 '']

In [58]:
def last_test (data):
  data = re.sub(" \d+", " ", data)
  data = sp_pattern(data)
  return data
last_test(X.loc[25])

['Qu',
 'est',
 'ce',
 'qui',
 'peut',
 'bien',
 'poser',
 'problème',
 'dans',
 'ces',
 'petits',
 'livrets',
 'pourtant',
 'plutôt',
 'attractifs',
 'pour',
 'que',
 'seuls',
 '%',
 'de',
 'leurs',
 'détenteurs',
 'arrivent',
 'au',
 'bout',
 '']

In [42]:
def clean_data (data): #stopwords and re.compile combined, less effective with stopwords
  cleaned_data = sp_pattern (data)
  cleaned_data = filtre_stopfr(cleaned_data)
  return cleaned_data

In [23]:
clean_data(X.loc[0])

['coûts',
 'kilométriques',
 'réels',
 'peuvent',
 'diverger',
 'sensiblement',
 'valeurs',
 'moyennes',
 'fonction',
 'moyen',
 'transport',
 'utilisé',
 'taux',
 'occupation',
 'taux',
 'remplissage',
 'infrastructure',
 'utilisée',
 'topographie',
 'lignes',
 'flux',
 'trafic',
 'etc',
 '']

We're going to try this tokenization method with the different models

In [259]:
tfidf = TfidfVectorizer(ngram_range=(1,1))

In [83]:
tfidf_vector = TfidfVectorizer(tokenizer= sp_pattern, ngram_range=(1,1))

In [24]:
from sklearn.ensemble import RandomForestClassifier
Randfor=RandomForestClassifier(n_estimators=500)

In [25]:
# Create pipeline
pipe = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', Randfor)])

# Fit model on training set
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<built-in method split of re.Pattern object at 0x7f36d6970d30>)),
                ('classifier', RandomForestClassifier(n_estimators=500))])

In [26]:
y_pred_randfor=pipe.predict(X_test)

In [27]:
#function to calculate metrics of the models
def models_metrics (true, pred):
  precision = precision_score(true, pred, average='weighted')
  recall = recall_score(true, pred, average='weighted')
  f1 = f1_score(true, pred, average='weighted')
  print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred,)}")
  print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
  print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")

In [28]:
models_metrics(y_test, y_pred_randfor)

CONFUSION MATRIX:
[[122  17  14   7   1   0]
 [ 68  57  27   8   3   1]
 [ 24  35  65  26   6   4]
 [  8  11  20  65  29  11]
 [ 16   8  22  53  46  28]
 [ 12  10  12  26  26  72]]
ACCURACY SCORE:
0.4448
CLASSIFICATION REPORT:
	Precision: 0.4497
	Recall: 0.4448
	F1_Score: 0.4359


Now with the Logistic Regression

In [29]:
pipe_lr = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', LR)])

# Fit model on training set
pipe_lr.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<built-in method split of re.Pattern object at 0x7f36d6970d30>)),
                ('classifier',
                 LogisticRegression(max_iter=1000, solver='newton-cg',
                                    tol=0.001))])

In [30]:
y_pred_lr2=pipe_lr.predict(X_test)

In [64]:
models_metrics(y_test,y_pred_lr2)

CONFUSION MATRIX:
[[101  32  15   7   3   3]
 [ 55  60  32   5   7   5]
 [ 10  39  71  16   9  15]
 [  7   5  17  69  26  20]
 [  3   3  13  37  70  47]
 [  5   3   9  19  20 102]]
ACCURACY SCORE:
0.4927
CLASSIFICATION REPORT:
	Precision: 0.4897
	Recall: 0.4927
	F1_Score: 0.4882


##Now with the SVC classifier

In [261]:
from sklearn.svm import SVC
svc = SVC(gamma="scale", random_state=42,kernel = 'poly', coef0 = 0.5)

In [264]:
pipe_svc = Pipeline([('vectorizer', tfidf_vector),
                 ('classifier', svc)])

# Fit model on training set
pipe_svc.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<built-in method split of re.Pattern object at 0x7f36d6970d30>)),
                ('classifier', SVC(coef0=0.5, kernel='poly', random_state=42))])

In [265]:
y_pred_svc=pipe_svc.predict(X_test)

In [266]:
models_metrics(y_test,y_pred_svc)

CONFUSION MATRIX:
[[99 32 19  6  3  2]
 [43 71 32  7  6  5]
 [ 6 38 79 17  9 11]
 [ 3  9 18 70 24 20]
 [ 2  2 13 45 65 46]
 [ 3  7  9 19 21 99]]
ACCURACY SCORE:
0.5031
CLASSIFICATION REPORT:
	Precision: 0.5047
	Recall: 0.5031
	F1_Score: 0.5012


#<font color = 'red'>Code to submit

In [270]:
df_pred_submit_svc = df_pred.copy()

In [271]:
unlabelled_svc=pipe_svc.predict(df_pred_submit_svc['sentence'])
unlabelled_svc

array(['C2', 'B1', 'A1', ..., 'C2', 'B2', 'B1'], dtype=object)

In [272]:
df_pred_submit_svc['difficulty']=unlabelled_svc #to modify when using another classification model
df_pred_submit_svc =df_pred_submit_svc.drop('sentence',axis=1)

In [273]:
#for the submission part: 
from google.colab import files
df_pred_submit_svc.to_csv('teamzoom_submit_svc.csv', encoding ='utf-8-sig',index=False) 
files.download('teamzoom_submit_svc.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>