# Imports and package installation

In [1]:
import pandas as pd
import time
from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM , TFAutoModelForTokenClassification


### Test if transformers is installed

In [None]:
# Smoke test: spawn a separate `python` process that imports `pipeline` from
# transformers and prints the sentiment-analysis result for 'we love you'.
# NOTE(review): `!python` is resolved by the shell, so it may not be the same
# interpreter/environment as this kernel — confirm before trusting the result.
!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

# Create functions for feature engineering

In [3]:
# Checkpoint shared by the tokenizer and the model; kept in one place so the
# two can never be loaded from different checkpoints by accident.
MODEL_CHECKPOINT = "t5-base"

# Tokenizer that turns the "question: ... context: ..." prompts into input ids.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# TensorFlow seq2seq model used to generate the answers.
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


### Function to get the name

In [4]:
def get_name(text: str, tokenizer, model) -> str:
    """Ask the model for the author's name found in ``text``.

    The text is embedded as the context of a fixed "What is my name ?" prompt,
    tokenized, passed to ``model.generate``, and the first generated sequence
    is decoded back into a string.

    Args:
        text: Document used as the context of the question.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Object exposing ``generate(input_ids)``.

    Returns:
        The first generated sequence, decoded with ``skip_special_tokens=True``.
    """
    prompt = f"question: What is my name ? context: {text}"
    encoding = tokenizer(
        prompt,
        padding="longest",
        return_tensors="tf",
        max_length=4096,
        truncation=True,
    )
    generated = model.generate(encoding.input_ids)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

### Function to get the skills

In [5]:
def get_skills(text: str, tokenizer, model) -> str:
    """Ask the model for the skills mentioned in ``text``.

    The text is embedded as the context of a fixed "What are my skills ?"
    prompt, tokenized, passed to ``model.generate``, and the first generated
    sequence is decoded back into a string.

    Args:
        text: Document used as the context of the question.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Object exposing ``generate(input_ids)``.

    Returns:
        The first generated sequence, decoded with ``skip_special_tokens=True``.
    """
    prompt = f"question: What are my skills ? context: {text}"
    encoding = tokenizer(
        prompt,
        padding="longest",
        return_tensors="tf",
        max_length=4096,
        truncation=True,
    )
    generated = model.generate(encoding.input_ids)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

### Function to get the experiences

In [6]:
def get_experiences(text: str, tokenizer, model) -> str:
    """Ask the model for the experiences described in ``text``.

    The text is embedded as the context of a fixed "What are my experiences ?"
    prompt, tokenized, passed to ``model.generate``, and the first generated
    sequence is decoded back into a string.

    Args:
        text: Document used as the context of the question.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Object exposing ``generate(input_ids)``.

    Returns:
        The first generated sequence, decoded with ``skip_special_tokens=True``.
    """
    prompt = f"question: What are my experiences ? context: {text}"
    encoding = tokenizer(
        prompt,
        padding="longest",
        return_tensors="tf",
        max_length=4096,
        truncation=True,
    )
    generated = model.generate(encoding.input_ids)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

### Function to get the job 

In [7]:
def get_job(text: str, tokenizer, model) -> str:
    """Ask the model which job the author of ``text`` wants to apply for.

    The text is embedded as the context of a fixed
    "What is the job he wants to apply?" prompt, tokenized, passed to
    ``model.generate``, and the first generated sequence is decoded back into
    a string.

    Args:
        text: Document used as the context of the question.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Object exposing ``generate(input_ids)``.

    Returns:
        The first generated sequence, decoded with ``skip_special_tokens=True``.
    """
    prompt = f"question: What is the job he wants to apply? context: {text}"
    encoding = tokenizer(
        prompt,
        padding="longest",
        return_tensors="tf",
        max_length=4096,
        truncation=True,
    )
    generated = model.generate(encoding.input_ids)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Import my data

In [22]:
# Load the dataset from data/lettres_motivation.csv (the path is relative to
# the notebook's working directory).
df = pd.read_csv("data/lettres_motivation.csv")
# Last expression of the cell: show the (rows, columns) shape of the frame.
df.shape

(1599, 2)

We take only the "lettre" column

In [23]:
# Keep only the "lettre" column; selecting a single column yields a Series.
df_lettre = df["lettre"]

In [24]:
def get_features(df: pd.Series) -> pd.DataFrame:
    """Extract name, skills, experiences and job from every letter.

    Each element of ``df`` is passed, one at a time, to ``get_name``,
    ``get_skills``, ``get_experiences`` and ``get_job`` together with the
    module-level ``tokenizer`` and ``model``. A progress message is printed
    before each of the four passes.

    Args:
        df: Series of letter texts. Despite the historical parameter name it
            must be a Series: a DataFrame's ``apply`` would hand whole columns
            to the extractors instead of individual texts.

    Returns:
        A DataFrame sharing the index of ``df`` with the columns ``name``,
        ``skills``, ``exp`` and ``job``.
    """
    # (progress message, extractor, output column), listed in output order.
    steps = (
        ("Get names", get_name, "name"),
        ("Get skills", get_skills, "skills"),
        ("get experiences", get_experiences, "exp"),
        ("get job", get_job, "job"),
    )

    columns = []
    for message, extractor, column in steps:
        print(message)
        # The lambda is consumed immediately by `apply`, so capturing the loop
        # variable `extractor` here is safe.
        answers = df.apply(
            lambda text: extractor(text, tokenizer=tokenizer, model=model)
        )
        columns.append(answers.rename(column))

    return pd.concat(columns, axis=1)


In [25]:
# Build the feature table for every letter. get_features applies four
# extractors, each of which calls model.generate once per letter, so this cell
# issues four generate calls per letter.
features = get_features(df_lettre)
display(features)

Get names




Get skills
get experiences


In [20]:
# Attach the original letter text to the extracted features by aligning both
# objects on their row index.
merged = pd.merge(features, df_lettre, left_index=True, right_index=True)

# Save the new DataFrame locally as a CSV file

In [21]:
# Write the merged table to a semicolon-separated CSV; index=False leaves the
# row index out of the file.
merged.to_csv("data/LM_Dataset.csv", sep=';', index=False)