In [4]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from assnat.clean import complete_preproc
from assnat.models import simple_logistic_regression
from assnat.params import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle as pk

def simple_logistic_regression(legislation_number, merge_data_sets=True, drop_col = DROP_COLS, drop_fam = DROP_FAM, na_col = NA_COLS , drop_names = DROP_NAMES , min_words= MIN_WORDS, punct_opt=PUNCT_OPT):
    if merge_data_sets:
        data1 = pd.read_csv('data/leg15.csv')
        data2 = pd.read_csv('data/leg16.csv')
        df = pd.concat([data1, data2], ignore_index=True, axis=0)
    else:
        df = pd.read_csv(f'data/leg{legislation_number}.csv')

    df = complete_preproc(df, drop_col= drop_col, drop_fam= drop_fam, na_col= na_col, drop_names= drop_names, min_words = min_words, punct_opt= punct_opt)

    X_train, X_test, y_train, y_test = train_test_split(
        df[['Texte', 'Thème Séance']],
        df['famille'],
        test_size=0.2,
        random_state=42
    )

    tfidf_vectorizer = TfidfVectorizer(max_features=100000)

    preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf_texte', tfidf_vectorizer, 'Texte'),
            ('tfidf_theme', tfidf_vectorizer, 'Thème Séance')
        ],
        remainder='passthrough'
    )

    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(multi_class='multinomial', max_iter=100000))
    ])

    model_pipeline.fit(X_train, y_train)

    y_pred = model_pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, y_pred))
    pk.dump(model_pipeline, open('/Users/aaronviviani/code/aaronviviani/08\ -\ Project ', 'wb'))
    return accuracy

simple_logistic_regression(15, merge_data_sets= True, drop_fam= ['Variable'], na_col = ['Texte','famille','Nom Orateur'], drop_names = ['Mme la présidente', 'M. le président'], min_words = 75, punct_opt= True)


Columns dropped
family dropped
NaN dropped
Names dropped
Short sentences removed
Preprocessing done!




Accuracy: 0.4939667217071002
                precision    recall  f1-score   support

        Centre       0.52      0.81      0.63      6056
  Centre-droit       0.37      0.11      0.17      1452
 Centre-gauche       0.55      0.05      0.10       433
        Droite       0.47      0.37      0.41      2700
Extrême droite       0.52      0.17      0.25       486
Extrême gauche       0.50      0.39      0.44      1870
        Gauche       0.43      0.33      0.37      2749

      accuracy                           0.49     15746
     macro avg       0.48      0.32      0.34     15746
  weighted avg       0.48      0.49      0.46     15746



0.4939667217071002

In [3]:
simple_logistic_regression(15, merge_data_sets= True, drop_fam= ['Variable'], na_col = ['Texte','famille'], drop_names = ['Mme la présidente', 'M. le président'], min_words = 30, punct_opt= True)


Columns dropped
family dropped
NaN dropped
Names dropped
Short sentences removed
Preprocessing done!




Accuracy: 0.498352324611903
                precision    recall  f1-score   support

        Centre       0.53      0.81      0.64     20539
  Centre-droit       0.38      0.11      0.17      4589
 Centre-gauche       0.50      0.10      0.17      1567
        Droite       0.46      0.36      0.40      8665
Extrême droite       0.46      0.21      0.29      1483
Extrême gauche       0.48      0.38      0.42      5548
        Gauche       0.41      0.29      0.34      7983

      accuracy                           0.50     50374
     macro avg       0.46      0.32      0.35     50374
  weighted avg       0.48      0.50      0.46     50374



0.498352324611903

In [4]:
from transformers import AutoTokenizer, FlaubertModel
import torch

tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = FlaubertModel.from_pretrained("flaubert/flaubert_base_cased")

In [5]:
def simple_logistic_regression(legislation_number, merge_data_sets=True, drop_col = DROP_COLS, drop_fam = DROP_FAM, na_col = NA_COLS , drop_names = DROP_NAMES , min_words= MIN_WORDS, punct_opt=PUNCT_OPT):
    if merge_data_sets:
        data1 = pd.read_csv('data/leg15.csv')
        data2 = pd.read_csv('data/leg16.csv')
        df = pd.concat([data1, data2], ignore_index=True, axis=0)
    else:
        df = pd.read_csv(f'data/leg{legislation_number}.csv')

    df = complete_preproc(df, drop_col= drop_col, drop_fam= drop_fam, na_col= na_col, drop_names= drop_names, min_words = min_words, punct_opt= punct_opt)

    X_train, X_test, y_train, y_test = train_test_split(
        df['Texte'],
        df['famille'],
        test_size=0.2,
        random_state=42
    )

    tfidf_vectorizer = TfidfVectorizer(max_features=100000)

    preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf_texte', tfidf_vectorizer, 'Texte'),
            ('tfidf_theme', tfidf_vectorizer, 'Thème Séance')
        ],
        remainder='passthrough'
    )

    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(multi_class='multinomial', max_iter=100000))
    ])

    model_pipeline.fit(X_train, y_train)

    y_pred = model_pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, y_pred))
    pk.dump(model_pipeline, open('/Users/aaronviviani/code/aaronviviani/08\ -\ Project ', 'wb'))
    return df, accuracy

In [6]:
simple_logistic_regression(15, merge_data_sets= True, drop_fam= ['Variable'], na_col = ['Texte','famille'], drop_names = ['Mme la présidente', 'M. le président'], min_words = 20, punct_opt= True)

Columns dropped
family dropped
NaN dropped
Names dropped
Short sentences removed
Preprocessing done!


IndexError: tuple index out of range

In [7]:
!pip install transformers plotly==5.8.0 pyyaml==5.4.1 datasets pytorch-lightning > /dev/null 2>&1

In [8]:
from pprint import pprint
import functools

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, CamembertForMaskedLM, AutoTokenizer, AutoConfig
from sklearn.metrics import confusion_matrix, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm.notebook import tqdm

In [9]:
camembert = CamembertForMaskedLM.from_pretrained('camembert-base')

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

In [11]:
tokenizer('My tokenizers and model must match')

{'input_ids': [5, 4646, 1200, 6840, 23038, 10, 1168, 16320, 19046, 1136, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
data1 = pd.read_csv('data/leg15.csv')
data2 = pd.read_csv('data/leg16.csv')
df = pd.concat([data1, data2], ignore_index=True, axis=0)
df = complete_preproc(df, drop_fam= ['Variable'], na_col = ['Texte','famille'], drop_names = ['Mme la présidente', 'M. le président'], min_words = 20, punct_opt= True)

Columns dropped
family dropped
NaN dropped
Names dropped
Short sentences removed


KeyboardInterrupt: 

In [17]:
df['Texte'] = df['Texte'].apply(lambda x: x.split(' '))

AttributeError: 'float' object has no attribute 'split'

In [37]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
y = le.fit_transform(df.famille)
X_train,X_test,Y_train,Y_test = train_test_split(list(df['Texte']),y,test_size=0.2,random_state=28)


In [None]:
import string
from tensorflow.keras.preprocessing.text import text_to_word_sequence

df['Texte'] = df['Texte'].values.astype('str')

ValueError: setting an array element with a sequence

In [None]:
train_input = tokenizer(X_train, tokenizer)
test_input = tokenizer(X_test, tokenizer)

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
tokenized_X['input_ids', ]

[[5,
  17,
  12,
  802,
  618,
  1307,
  19,
  4305,
  616,
  17035,
  14,
  97,
  2413,
  51,
  2750,
  28,
  304,
  6178,
  24,
  19,
  14725,
  10,
  1148,
  15,
  1733,
  25,
  124,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [5,
  24,
  78,
  3726,
  32,
  17,
  12,
  802,
  50,
  8737,
  36,
  942,
  12238,
  129,
  8714,
  6437,
  20373,
  13,
  6657,
  5787,
  31,
  33,
  217,
  3247,
  22,
  2585,
  32,
  44,
  584,
  12283,
  2669,
  693,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,

In [None]:
'''from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, Dropout, LSTM, SpatialDropout1D
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np




#X_train, X_test, y_train, y_test = train_test_split(tokenized_X, y, test_size=0.20)

embedding_dimension = 256
MAX_NB_WORDS = 512

model = Sequential()
model.add(Embedding(10,output_dim=64, input_length=105))
#model.add(SpatialDropout1D(.2))
model.add(LSTM(64, dropout=.2, recurrent_dropout=.2))
model.add(Dense(7, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.0001)
history = model.fit(np.array(tokenized_X['input_ids']), y[:3], epochs=100, batch_size=64,  callbacks=es)
'''

Epoch 1/100


InvalidArgumentError: Graph execution error:

Detected at node 'sequential_20/embedding_15/embedding_lookup' defined at (most recent call last):
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/traitlets/config/application.py", line 982, in launch_instance
      app.start()
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
      self._run_once()
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once
      handle._run()
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2885, in run_cell
      result = self._run_cell(
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3139, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3318, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/mp/99nc3g2s3t192ft55tyrlzdh0000gn/T/ipykernel_90367/2582761964.py", line 29, in <module>
      history = model.fit(np.array(tokenized_X['input_ids']), y[:3], epochs=100, batch_size=64,  callbacks=es)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/sequential.py", line 410, in call
      return super().call(inputs, training=training, mask=mask)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/Users/aaronviviani/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/keras/layers/core/embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'sequential_20/embedding_15/embedding_lookup'
indices[0,1] = 63 is not in [0, 10)
	 [[{{node sequential_20/embedding_15/embedding_lookup}}]] [Op:__inference_train_function_42696]

In [None]:
import fasttext


In [4]:
simple_logistic_regression(16, merge_data_sets= True, simplify_family= True, drop_fam= ['Variable'], na_col=['Texte', 'famille', 'Nom Orateur'], min_words= 20)

TypeError: simple_logistic_regression() got an unexpected keyword argument 'simplify_family'

In [None]:
simple_logistic_regression(16, merge_data_sets= True, simplify_family= True, drop_fam= ['Variable'], na_col=['Texte', 'famille', 'Nom Orateur'], min_words= 35)

Columns dropped
family dropped
NaN dropped
Names dropped
Short sentences removed
Preprocessing done!




Accuracy: 0.6348522435478957
              precision    recall  f1-score   support

      Centre       0.67      0.83      0.74     24172
      Droite       0.52      0.29      0.38      9241
      Gauche       0.59      0.50      0.54     12541

    accuracy                           0.63     45954
   macro avg       0.59      0.54      0.55     45954
weighted avg       0.62      0.63      0.61     45954



FileNotFoundError: [Errno 2] No such file or directory: '/Users/aaronviviani/code/aaronviviani/08/Project'

In [1]:
simple_logistic_regression(16, merge_data_sets= True, simplify_fam= True, drop_fam= ['Variable'], na_col=['Texte', 'famille', 'Nom Orateur'], min_words= 50)

NameError: name 'simple_logistic_regression' is not defined

In [10]:
import pickle

In [11]:
model = pickle.load(open('modelml.pkl', 'rb'))

In [None]:
model.evaluate(X_test,Y_test)

AttributeError: 'Pipeline' object has no attribute 'evaluate'

In [5]:
simple_logistic_regression(16, merge_data_sets= True, simplify_fam= True, drop_fam= ['Variable'], na_col=['Texte', 'famille', 'Nom Orateur'], min_words= 60)

Columns dropped
families simplified
family dropped
NaN dropped
Names dropped
Short sentences removed
Preprocessing done!




Accuracy: 0.6352926739477236
              precision    recall  f1-score   support

      Centre       0.67      0.83      0.74     12762
      Droite       0.52      0.30      0.38      4878
      Gauche       0.60      0.52      0.56      6807

    accuracy                           0.64     24447
   macro avg       0.60      0.55      0.56     24447
weighted avg       0.62      0.64      0.62     24447



0.6352926739477236

In [167]:
X_test = pd.DataFrame({
    'Texte': [" pauvres "],
    'Thème Séance': ["Pouvoir d'achat"]
})


In [168]:
complete_preproc(X_test)

Columns dropped
family dropped
NaN dropped
Names dropped
Short sentences removed
Preprocessing done!


Unnamed: 0,Texte,Thème Séance


In [172]:
model.predict_proba(X_test)

array([[0.14693605, 0.01542942, 0.83763453]])

In [173]:
model.predict(X_test)

array(['Gauche'], dtype=object)