In [239]:
import pandas as pd

dataset_path = {'pent':'ppentreprise_data.csv', 'pproduit':'presentation_produit_data.csv', 'oemploie':'offre_emploie_data.csv', 'pmetier':'presentation_metier_data.csv', 'temoignage':'temoignage_clients_data.csv', 'tuto':'tuto_data.csv'}

dfs = {}

for key, path in dataset_path.items():
    print(f'_______________________ Analyzing dataset {path} ___________________________________')
    dfs[key] = pd.read_csv('./cleaned_dataset/' + path, delimiter=';')
    print(dfs[key].head())

# we are going to concatenate all the data 
concat_df = pd.concat([dfs['pent'], dfs['pproduit'],  dfs['pmetier'], dfs['temoignage'], dfs['tuto'], dfs['oemploie']], axis=0, sort=True)


_______________________ Analyzing dataset ppentreprise_data.csv ___________________________________
   Unnamed: 0         link                                              title  \
0           0    PAeQD_64x                   Présentation Entreprise Rousseau   
1           1   QfCHZX1PnP              Présentation de l'entreprise Boréalie   
2           2  MWsdG52R9uQ   Brasserie Lambelin: Présentation de l'entreprise   
3           3   D264CLOFKE                   Présentation de l'entreprise DBC   
4           4  gVmZc9hx7X8  Film institutionnel de présentation du groupe ...   

                                         description                 category  \
0  Fondée en 1976, Rousseau est une entreprise de...  présentation entreprise   
1  Présentation de l'entreprise Boréalie cosmétiq...  présentation entreprise   
2  Présentation de "Brasserie Lambelin" à l'occas...  présentation entreprise   
3  Visite de la société DBC, spécialiste de l'usi...  présentation entreprise   
4  Décou

In [240]:
concat_df.shape


(2218, 8)

In [241]:
concat_df = concat_df[concat_df['transcriptions'] != ' ']
concat_df.shape

(1599, 8)

In [242]:
x_train = concat_df['transcriptions'].values

In [243]:
concat_df['category'].value_counts()

presentation metier        619
temoignage                 280
offre emploie              223
tutoriel                   183
présentation entreprise    150
presentation produit       144
Name: category, dtype: int64

In [244]:
concat_df['category'].describe()


count                    1599
unique                      6
top       presentation metier
freq                      619
Name: category, dtype: object

In [260]:
concat_df.to_csv('cleaned_dataset/easyMovieCategory_dataset.csv', sep=';', header=True,  mode='w')

We are going to train a TF-IDF+LinearSVC model on it. But first we need to clean the data.
We use the previously build pipline to clean and train the model. 

The dataset is not well balanced.

# Clean the data 

The first step we are going to use spacy to clean the data by :

* lower all the word to avoid that word like : Hello HELLO or hello are 
considered as 3 differents words.
* removing stop words 
* removing punctuation 
* lemming 


Note : In the second part we will do the same without lemming to see the performance.

In [245]:
import spacy
from spacy.lang.fr import French
spacy.cli.download("fr_core_news_sm")

spacy_nlp = spacy.load('fr_core_news_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')


In [246]:
from spacy.lang.fr.stop_words import STOP_WORDS
stop_words = STOP_WORDS
punctuations = spacy.lang.punctuation.PUNCT

In [247]:
# create a copy of the dataset 
x = concat_df['transcriptions'].values
target = concat_df['category'].values

In [248]:
import numpy as np

category = np.unique(concat_df['category'].values)

In [249]:
from sklearn.preprocessing import LabelEncoder

encodage = LabelEncoder()
target = encodage.fit_transform(target)

In [250]:
category = encodage.fit_transform(category)

In [251]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, target, random_state=42, test_size=0.3, shuffle=True)

We will implement the same function but without lemming. We could add a boolean variable to indicate if we do lemming or not but for performance reason I prefer to implement an other function.

Let's test this one 


In [252]:
x_train.shape


(1119,)

In [253]:
from sklearn.svm import LinearSVC

In [254]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline

linear_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', LinearSVC())])

In [255]:
linear_clf.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [256]:
predicted = linear_clf.predict(x_test)

In [257]:
from sklearn.metrics import precision_recall_fscore_support

precision_recall_fscore_support(y_test, predicted, average='weighted')

(0.8432075345619748, 0.84375, 0.840136126556823, None)

In [258]:
from sklearn.metrics import classification_report

score = classification_report(y_pred=predicted, y_true=y_test, labels=category)

In [259]:
print(score)

              precision    recall  f1-score   support

           0       0.96      0.77      0.86        62
           1       0.85      0.95      0.90       186
           2       0.70      0.60      0.64        47
           3       0.76      0.63      0.69        41
           4       0.87      0.87      0.87        76
           5       0.82      0.90      0.86        68

   micro avg       0.84      0.84      0.84       480
   macro avg       0.83      0.79      0.80       480
weighted avg       0.84      0.84      0.84       480

