# Usage examples

First, we import the module:

In [1]:
# Announce the import. The [nltk_data] lines in this cell's output appear
# when the module is imported (presumably NLTK checking that its `punkt`
# and `stopwords` resources are available).
print("loading the module")

import twitter_profile_predictor as tpp

loading the module


[nltk_data] Downloading package punkt to /home/tim0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tim0/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Method 1 : ``bios_analyzer`` class

We can load a bio when initializing the ``bios_analyzer`` class, then call methods that apply directly to the loaded bio.

In [2]:
# Sample bio (French) analyzed throughout Method 1.
example_bios_1 = "je suis president du monde et journaliste chez sciencespo"

print('starting class bios analyzer...')
# The bio is handed to the constructor, so the method calls in the following
# cells are made without passing any text.
extractor = tpp.bios_analyzer(example_bios_1)

starting class bios analyzer...


In [3]:
# Example for the full_tokenize method: it is called first, then its result
# is read back from the `full_tokens` attribute. As the output shows, the
# list holds the single-word tokens followed by pairs of consecutive tokens.
print('tokenization')
extractor.full_tokenize()
print(extractor.full_tokens)

tokenization
['president', 'monde', 'journaliste', 'chez', 'sciencespo', ('president', 'monde'), ('monde', 'journaliste'), ('journaliste', 'chez'), ('chez', 'sciencespo')]


In [4]:
# Each get_* method makes its estimate available in two ways.
print("profession extraction")

# 1) directly, as the value returned by the call
print(extractor.get_professions())

# 2) as an attribute of the analyzer, read here after the call above
print(extractor.professions)

profession extraction
['journalist']
['journalist']


In [5]:
# Each get_* method makes its estimate available in two ways.
print("statuses extraction")

# 1) directly, as the value returned by the call
print(extractor.get_allstatuses())

# 2) as an attribute of the analyzer, read here after the call above
print(extractor.allstatuses)

statuses extraction
['sciencespo', 'president']
['sciencespo', 'president']


In [6]:
# Each get_* method makes its estimate available in two ways.
print("language extraction")

# 1) directly, as the value returned by the call
print(extractor.get_lang())

# 2) as an attribute of the analyzer, read here after the call above
# (note the attribute is named `language`, not `lang`)
print(extractor.language)

language extraction
fr
fr


## List of the methods

In [7]:
# Tour of the analyzer's methods, applied to the bio loaded at construction.
# Each result is kept in a variable and echoed with its label.

# tokenize: single-word tokens
tokens = extractor.tokenize()
print(f"tokens: {tokens}")

# bi_tokenize: pairs of consecutive tokens
bi_tokens = extractor.bi_tokenize()
print(f"bi_tokens: {bi_tokens}")

# full_tokenize: single-word tokens followed by the pairs
full_tokens = extractor.full_tokenize()
print(f"full_tokens: {full_tokens}")

# get_actorstatuses
actorstatuses = extractor.get_actorstatuses()
print(f"actorstatuses: {actorstatuses}")

# get_ages
ages = extractor.get_ages()
print(f"ages: {ages}")

# get_allstatuses
allstatuses = extractor.get_allstatuses()
print(f"allstatuses: {allstatuses}")

# get_gender
gender = extractor.get_gender()
print(f"gender: {gender}")

# get_groupstatuses
groupstatuses = extractor.get_groupstatuses()
print(f"groupstatuses: {groupstatuses}")

# get_lang
lang = extractor.get_lang()
print(f"lang: {lang}")

# get_professions
professions = extractor.get_professions()
print(f"professions: {professions}")

# get_prostatus
prostatus = extractor.get_prostatus()
print(f"prostatus: {prostatus}")

# get_topics
topics = extractor.get_topics()
print(f"topics: {topics}")

# get_universitystatuses
universitystatuses = extractor.get_universitystatuses()
print(f"universitystatuses: {universitystatuses}")

tokens: ['president', 'monde', 'journaliste', 'chez', 'sciencespo']
bi_tokens: [('president', 'monde'), ('monde', 'journaliste'), ('journaliste', 'chez'), ('chez', 'sciencespo')]
full_tokens: ['president', 'monde', 'journaliste', 'chez', 'sciencespo', ('president', 'monde'), ('monde', 'journaliste'), ('journaliste', 'chez'), ('chez', 'sciencespo')]
actorstatuses: []
ages: []
allstatuses: ['sciencespo', 'president']
gender: ['Man']
groupstatuses: []
lang: fr
professions: ['journalist']
prostatus: ['president']
topics: []
universitystatuses: ['sciencespo']


## Method 2 : direct function

In this method we want to analyze bios other than the one loaded when the class was initialized. To do so, we simply pass the text through the `bios` argument of each method.

In [8]:
# Two sample bios (one French, one English) that will be passed explicitly
# to the methods instead of being loaded into the analyzer.
example_bios_1 = "je suis president du monde et journaliste chez sciencespo"
example_bios_2 = "Software Developer at Microsoft, passionate about AI and cloud computing"

print('starting class')
# No bio is given to the constructor this time.
extractor = tpp.bios_analyzer()

starting class


In [9]:
# The same analyzer is reused for both texts: each one is supplied through
# the `bios` keyword of the call.
print(extractor.get_professions(bios=example_bios_1))
print(extractor.get_professions(bios=example_bios_2))

['journalist']
['developer']


## Method 3 : automatically process dataframe

The ``df_bios_analyzer`` class is meant to process your dataframe directly.

In [10]:
import pandas as pd

# Small demo frame: one row per account, with the bio text stored in the
# `description` column (one French bio and two English ones).
df = pd.DataFrame(
    dict(
        id=[1, 2, 3],
        description=[
            'je suis president du monde et journaliste chez sciencespo',
            'software engineer at Google, love to travel',
            'CEO at startup, passionate about AI and machine learning',
        ],
    )
)
df

Unnamed: 0,id,description
0,1,je suis president du monde et journaliste chez...
1,2,"software engineer at Google, love to travel"
2,3,"CEO at startup, passionate about AI and machin..."


In [11]:
# Wrap the dataframe and name the column that holds the bio text.
df_extractor = tpp.df_bios_analyzer(df=df, description_column='description')

In [12]:
# Tokenize every description. The frame displayed from `df_extractor.df`
# then shows an additional `full_tokens` column (see output).
df_extractor.full_tokenize()
df_extractor.df

Unnamed: 0,id,description,full_tokens
0,1,je suis president du monde et journaliste chez...,"[president, monde, journaliste, chez, sciences..."
1,2,"software engineer at Google, love to travel","[software, engineer, google, love, travel, (so..."
2,3,"CEO at startup, passionate about AI and machin...","[ceo, startup, passionate, machine, learning, ..."


In [13]:
# Extract professions, pointing the method at the `full_tokens` column
# computed in the previous cell. The displayed frame then shows an
# additional `professions` column (see output).
df_extractor.get_professions(tokens_column='full_tokens')	
df_extractor.df

Unnamed: 0,id,description,full_tokens,professions
0,1,je suis president du monde et journaliste chez...,"[president, monde, journaliste, chez, sciences...",[journalist]
1,2,"software engineer at Google, love to travel","[software, engineer, google, love, travel, (so...","[engineer, developer]"
2,3,"CEO at startup, passionate about AI and machin...","[ceo, startup, passionate, machine, learning, ...",[ceo]


We can also compute all the columns at once using ``get_all``:

In [14]:
# Run every extraction in one call. The frame displayed as the cell output
# has one column per estimate (tokens, statuses, age, gender, topic, lang, ...).
df_extractor.get_all()

Unnamed: 0,id,description,full_tokens,professions,tokens,bi_tokens,prostatus,actorstatus,groupstatus,universitystatus,allstatus,age,gender,topic,lang
0,1,je suis president du monde et journaliste chez...,"[president, monde, journaliste, chez, sciences...",[journalist],"[president, monde, journaliste, chez, sciencespo]","[(president, monde), (monde, journaliste), (jo...",[president],[],[],[sciencespo],"[sciencespo, president]",[],[Man],[],fr
1,2,"software engineer at Google, love to travel","[software, engineer, google, love, travel, (so...","[engineer, developer]","[software, engineer, google, love, travel]","[(software, engineer), (engineer, google), (go...",[],[],[],[],[],[],[],[],en
2,3,"CEO at startup, passionate about AI and machin...","[ceo, startup, passionate, machine, learning, ...",[ceo],"[ceo, startup, passionate, machine, learning]","[(ceo, startup), (startup, passionate), (passi...",[],[startup],[],[],[startup],[],[],[entrepreneurship],en


# Get keywords

If you want to access the full list of keywords, you can do this:

In [15]:
# Keyword table for professions (indexed by profession, with Sum, Keywords
# and Group columns in the output).
# NOTE: the function name is spelled `kewords`; that is the name that
# produced the output below, so do not "correct" it to `keywords` here.
tpp.get_pro_kewords()

Unnamed: 0_level_0,Sum,Keywords,Group
Professions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
journalist,1362,"journaliste, reporter, journalist, pigiste, ex...","Information, arts and entertainment professions"
author,456,"auteur, créateur, writer, chroniqueur, author,...","Information, arts and entertainment professions"
professor,425,"prof, professeur, enseignant, formateur, profe...",Professors and higher scientific professions
city ​​councilor,335,"('conseiller', 'municipal'), ('conseillère', '...",Elected officers and political representatives
ceo,333,"ceo, dg, pdg, ('directeur', 'général'), ('chef...","Business, IT and administration professionals"
mayor,311,maire,Elected officers and political representatives
consultant,284,"consultant, consultante, consulting, ('cabinet...","Business, IT and administration professionals"
editor,226,"rédacteur, rédactrice, editor, éditeur, ('red'...","Information, arts and entertainment professions"
attorney,225,"avocat, barreau, avocate, lawyer, advocate",Legal professions
entrepreneur,220,"entrepreneur, entrepreneure","Business, IT and administration professionals"


In [16]:
# Keyword table for statuses (indexed by status, with Sum, Keywords, Type
# and Age columns in the output).
# NOTE: the function name is spelled `kewords`; that is the name that
# produced the output below, so do not "correct" it to `keywords` here.
tpp.get_status_kewords()

Unnamed: 0_level_0,Sum,Keywords,Type,Age
Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
former,904,"ex, ancien, ancienne, exjournaliste, exprésident",professional,
director,829,"directeur, directrice, director, head, directi...",professional,
advisor,734,"conseiller, conseillère, advisor, ('ancien', '...",professional,
president,695,"président, présidente, president, presidente",professional,
executive,539,"responsable, manager, cadre",professional,
activist,503,"militant, engagé, engagée, militante, engageme...",groupe,
founder,454,"fondateur, founder, cofondateur, cofounder, fo...",professional,
enthusiast,447,"passionné, passionnée, amateur, amatrice, enth...",professional,
member,405,"membre, member, adhérent",professional,
chief,370,"chef, cheffe",professional,


In [17]:
# Keyword table for gender (indexed by gender, with Sum and Keywords
# columns in the output).
# NOTE: the function name is spelled `kewords`; that is the name that
# produced the output below, so do not "correct" it to `keywords` here.
tpp.get_gender_kewords()

Unnamed: 0_level_0,Sum,Keywords
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Man,5514,"président, conseiller, directeur, chef, passio..."
Woman,2053,"conseillère, directrice, ingénieure, ingenieur..."


In [18]:
# Keyword table for topics (indexed by topic, with Sum and Keywords
# columns in the output).
# NOTE: the function name is spelled `kewords`; that is the name that
# produced the output below, so do not "correct" it to `keywords` here.
tpp.get_topic_kewords()

Unnamed: 0_level_0,Sum,Keywords
Topics,Unnamed: 1_level_1,Unnamed: 2_level_1
politics,1144,"politique, politiques, politics, policy, polit..."
digital,810,"digital, numérique, internet, informatique, di..."
communication,616,"communication, communications, compublique"
culture,616,"culture, cinéma, patrimoine, arts, théâtre, cu..."
public,464,"public, publique, publiques, servicepublic, af..."
innovation,451,"innovation, linnovation, ('digital', 'innovati..."
law,432,"droit, droits, juridique, ('droit', 'public')"
education,375,"formation, education, éducation, études"
technology,362,"tech, technologies, frenchtech, technology, fi..."
sciences,301,"sciences, science, scientifique, maths, mathém..."
