In [1]:
import json
import pandas as pd

from google.oauth2 import service_account

In [2]:
with open("ensai-2024-d39d18623847.json") as source:
  info = json.load(source)


  credentials = service_account.Credentials.from_service_account_info(info)

  prenoms = pd.read_gbq("ml.prenoms", project_id="ensai-2024", credentials=credentials)

In [3]:
prenoms.head()

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
0,1,_PRENOMS_RARES,1900,13,37
1,1,_PRENOMS_RARES,1900,57,37
2,1,_PRENOMS_RARES,1900,59,44
3,1,_PRENOMS_RARES,1900,62,34
4,1,_PRENOMS_RARES,1900,75,45


## Preprocessing
1. Exclure les "_PRENOMS_RARES"
2. Aggréger les "annais" et les "dpt" pour avoir la fréquence par preusuel
3. Déterminger le genre d'un prenom (optionnel)

Exclure les "_PRENOMS_RARES"

In [4]:
preprocessing = prenoms[prenoms["preusuel"] != '_PRENOMS_RARES']
preprocessing["nombre"] = preprocessing["nombre"].astype(int)
preprocessing = preprocessing.groupby(["preusuel", "sexe"]).agg({"nombre": sum}).reset_index()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessing["nombre"] = preprocessing["nombre"].astype(int)
  preprocessing = preprocessing.groupby(["preusuel", "sexe"]).agg({"nombre": sum}).reset_index()


In [5]:
preprocessing.head()

Unnamed: 0,preusuel,sexe,nombre
0,A,1,27
1,AADAM,1,32
2,AADEL,1,57
3,AADHIRA,2,22
4,AADIL,1,190


In [6]:


data = (
    preprocessing
        .iloc[preprocessing.groupby("preusuel")["nombre"].idxmax()]
        .drop(["nombre"], axis=1)
)



In [7]:
data.head()

Unnamed: 0,preusuel,sexe
0,A,1
1,AADAM,1
2,AADEL,1
3,AADHIRA,2
4,AADIL,1


Aggréger les "annais" et les "dpt" pour avoir la fréquence par preusuel

## Feature engineering
1. Encoder les prénoms dans un vecteur d'apparition des lettres (appelé X)
2. a->1, b->2, c->3 etc.
3. on fait quoi avec les - , et les accents ? et les majuscules ?

In [8]:
def encode_prenom(prenom: str) -> pd.Series:
    """
        This function encode a given name into a pd.Series.

        For instance alain is encoded [1, 0, 0, 0, 0 ... 1, 0 ...].
    """
    alphabet = "abcdefghijklmnopqrstuvwxyzé-'"
    prenom = prenom.lower()

    return pd.Series([letter in prenom for letter in alphabet]).astype(int)

In [9]:
X = data["preusuel"].apply(encode_prenom)
#encode_prenom("Christophe")

In [10]:
Y = (data["sexe"].astype(int) - 1)

In [11]:
# Exemple d'utilisation
prenom_exemple = "Émilie"
resultat = encode_prenom(prenom_exemple)
print(resultat)
print(X.shape)
print(Y.shape)

0     0
1     0
2     0
3     0
4     1
5     0
6     0
7     0
8     1
9     0
10    0
11    1
12    1
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    1
27    0
28    0
dtype: int64
(36170, 29)
(36170,)


In [12]:
X.shape, Y.shape

((36170, 29), (36170,))

## Modélisation

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
regr = LogisticRegression()

In [15]:
regr.fit(X, Y)


In [16]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, Y)

In [17]:
!pip list | grep scikit


scikit-learn                             1.3.2


In [18]:
y_pred_RF = model.predict(X)

In [19]:
accuracy_score(Y, y_pred_RF)

0.8407520044235555

In [20]:
regr.predict([encode_prenom("Jean")])

array([1])

In [21]:
alphabet = "abcdefghijklmnopqrstuvwxyzé-'"
list(zip(list(alphabet), regr.coef_[0]))

[('a', 0.8511474092816597),
 ('b', -0.7700426999401857),
 ('c', -0.2941825168152412),
 ('d', -0.7088209902485293),
 ('e', 0.5599947834024898),
 ('f', -0.412718693845997),
 ('g', -0.4218996855569829),
 ('h', -0.2168070085028354),
 ('i', 0.46374933025979126),
 ('j', -0.610511366786139),
 ('k', -0.7162712264280716),
 ('l', 0.16318994219242136),
 ('m', -0.3794675513947608),
 ('n', -0.12607543685643177),
 ('o', -0.43977100305263794),
 ('p', -0.5258446036852028),
 ('q', -0.8886168381630548),
 ('r', -0.0962170976724953),
 ('s', -0.18865237711509275),
 ('t', -0.09351726500292913),
 ('u', -0.050409833494722285),
 ('v', -0.6852055195144322),
 ('w', -0.9654352365000729),
 ('x', -0.9311467899475074),
 ('y', 0.18153966421932777),
 ('z', -0.26195398191979474),
 ('é', 0.4861578013795151),
 ('-', -0.27078710324559363),
 ("'", -0.6963526445203498)]

In [22]:
y_pred = regr.predict(X)

In [23]:
accuracy_score(Y, y_pred)

0.6459773292784076

# Dump du modèle

In [24]:
import joblib

In [25]:
joblib.dump(regr, "model.v1.bin")

['model.v1.bin']

In [26]:
joblib.dump(model, "model.v2.bin")

['model.v2.bin']

## Reload du modèle (exemple pour l'API)

In [27]:
regr_loaded = joblib.load("model.v1.bin")

In [28]:
regr_loaded.predict([encode_prenom("Toto")])

array([0])

## Train
générer un fichier binaire .bin

## Faire un test qui sort un .metrics

## Partie HTTP