In [3]:
!pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git &> /dev/null

# Imports

In [4]:
import os
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk

# sklearn
from sklearn.utils import shuffle
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

In [5]:
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

In [6]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Custom classes

In [7]:
#Path : '../dictionnaire/dictionnaire.txt'
class CamembertInputProcessor():
    def __init__(self,path:str):
        self.mots = set(line.strip() for line in open(path))
        self.lemmatizer = FrenchLefffLemmatizer()
        self.french_stopwords = nltk.corpus.stopwords.words('french')

    def call(self,inputs,labels=None):
        df_pre_proc = self.French_Preprocess_listofSentence(inputs['Caption'])
        if labels is not None:
            return pd.concat([df_pre_proc,labels],axis=1).drop(columns='Id')
        else:
            return df_pre_proc

    def French_Preprocess_listofSentence(self, listofSentence):
        preprocess_list = []
        for sentence in listofSentence:
            sentence_w_punct = "".join([i.lower() for i in sentence if i not in string.punctuation])
            sentence_w_num = ''.join(i for i in sentence_w_punct if not i.isdigit())
            tokenize_sentence = nltk.tokenize.word_tokenize(sentence_w_num)
            words_w_stopwords = [i for i in tokenize_sentence if i not in self.french_stopwords]
            words_lemmatize = (self.lemmatizer.lemmatize(w) for w in words_w_stopwords)
            sentence_clean = ' '.join(w for w in words_lemmatize if w.lower() in self.mots or not w.isalpha())
            preprocess_list.append(sentence_clean)

        df_test = pd.DataFrame(preprocess_list,columns = {'text'})
        df_test.index.rename('id',inplace=True)
        return df_test

# Load csv

In [8]:
X_test = pd.read_csv('/kaggle/input/mlftc/X_test.csv', sep=';')
X_train = pd.read_csv('/kaggle/input/mlftc/X_train.csv', sep=';')
X_nonlabeled = pd.read_csv('/kaggle/input/mlftc/nonlabeled_data.csv', sep=';')
y_train = pd.read_csv('/kaggle/input/mlftc/y_train.csv', sep=';')

# Text preproc

In [9]:
path = '/kaggle/input/dict-fr/dictionnaire.txt'
CIP = CamembertInputProcessor(path)
df = CIP.call(X_train, y_train)
df.head(3)

Unnamed: 0,text,category_1,category_2,category_3,category_4
0,mourir avant plus revoir petit enfant,1,1,0,0
1,maladie autres conséquence jeune génération vi...,0,1,0,0
2,comment vont sen sortir ceux mal tout logées,0,1,0,0


In [13]:
tmp = CIP.call(X_test)
tmp.head(10)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
0,quand tout cela va terminer
1,futur demeure incertain puisse plus faire projet
2,plus pouvoir vivre comme auparavant
3,proche
4,honte pays plus riche monde niveau pays sou dé...
5,directement peur mourir plus réanimation enfan...
6,jamais voir fin cette pandémie
7,fin crise cela va donner a population alors ri...
8,attraper sans men douter malgré masque personn...
9,contracter maladie


# Local train test split

In [11]:
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    df['text'], df[df.columns[1:]], test_size=0.3, random_state=42)

# TF-IDF

In [12]:
X_test

Unnamed: 0,Id,Caption
0,599,quand tout cela va t il se terminer ?
1,600,Que le futur demeure incertain... qu'on ne pui...
2,602,De ne plus pouvoir vivre comme auparavant.
3,603,pour mes proches
4,604,la honte !!! d'être un des pays les plus rich...
...,...,...
152,798,"- De rester enfermée,\n - loin des amies ou a..."
153,799,"L'atteinte à nos libertés (appli stop-covi, et..."
154,800,risque mortel
155,801,ne pas pouvoir sortir\n ne pas voir mes petits...


In [15]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train_)
X_test_counts = count_vect.transform(X_test_)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

X_final_counts = count_vect.transform(tmp['text'])
X_final_tfidf = tfidf_transformer.transform(X_final_counts)

# Models and training

In [16]:
def get_fit_model(clf, X, y):
    multi_target_clf = MultiOutputClassifier(clf, n_jobs=2)
    multi_target_clf.fit(X, y)
    return multi_target_clf

def get_probas(multi_target_clf, X):
    probas = multi_target_clf.predict_proba(X)
    probas_ = [p[:, 1] for p in probas]
    probas__ = np.zeros((len(probas_[0]), 4))
    for i in range(4):
        probas__[:, i] = probas_[i]
    
    return probas__

def score(y_true, probas):
    return roc_auc_score(y_true, probas)

In [17]:
# random forest
clf = RandomForestClassifier(random_state=1)
multi_target_clf = get_fit_model(clf, X_train_tfidf, y_train_)
probas = get_probas(multi_target_clf, X_test_tfidf)
print("score:", score(y_test_, probas))

score: 0.8229500105694959


In [54]:
# naive bayes
clf = MultinomialNB()
multi_target_clf = get_fit_model(clf, X_train_tfidf, y_train_)
probas = get_probas(multi_target_clf, X_test_tfidf)
print("score:", score(y_test_, probas))

score: 0.8268481688058895


In [41]:
# SVM
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto', probability=True))
multi_target_clf = get_fit_model(clf, X_train_tfidf, y_train_)
probas = get_probas(multi_target_clf, X_test_tfidf)
print("score:", score(y_test_, probas))

score: 0.6956515320210173


In [56]:
# LR
clf = LogisticRegression(random_state=0)
multi_target_clf = get_fit_model(clf, X_train_tfidf, y_train_)
probas = get_probas(multi_target_clf, X_test_tfidf)
print("score:", score(y_test_, probas))

score: 0.8323896186396187


In [74]:
# KNN
clf = KNeighborsClassifier(n_neighbors=20)
multi_target_clf = get_fit_model(clf, X_train_tfidf, y_train_)
probas = get_probas(multi_target_clf, X_test_tfidf)
print("score:", score(y_test_, probas))

score: 0.7902128576990709


In [22]:
# SGD
clf = make_pipeline(StandardScaler(with_mean=False), SGDClassifier(loss='log', max_iter=1000, tol=1e-3))
multi_target_clf = get_fit_model(clf, X_train_tfidf, y_train_)
probas = get_probas(multi_target_clf, X_test_tfidf)
print("score:", score(y_test_, probas))

score: 0.6593467773614832


In [59]:
def probas_to_submission(probas, title="test.csv"):
    z = np.zeros((len(X_test), 5))
    z[:, 0] = X_test['Id ']
    z[:, 1:] = probas
    df = pd.DataFrame(z)
    df[0] = df[0].astype(int)
    df.to_csv(title, header=['Id','category_1','category_2','category_3','category_4'], index=None)


probas = get_probas(multi_target_clf, X_final_tfidf)
probas_to_submission(probas, 'logistic_reg.csv')

# Other

In [26]:
z = np.ones((len(X_test_), 4), dtype='int')
roc_auc_score(y_test_, z)

0.5