In [1]:
# This notebook demonstrates how to train a word2vec model and get similar words for a list of seed words
# Install necessary Python libraries
!pip install nltk gensim pandas



In [2]:
import nltk, string
# Download NLTK components
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\018850882\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\018850882\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\018850882\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\018850882\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\018850882\AppData\Roaming\nltk_data.

True

In [3]:
# Import nltk objects necessary for pre-processing
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk import sent_tokenize, word_tokenize
# Shared WordNet lemmatizer instance; get_tokenized_lemmatized_sents calls wnl.lemmatize on every token
wnl = WordNetLemmatizer()

# Get NLTK stopwords
# Kept as a set because is_valid_word performs a membership test against it for each token
stop_words = set(stopwords.words('english'))
# Generate a set of punctuations, excluding "-" and "_"
def gen_puncs():
    """Build the set of punctuation characters used to screen tokens.

    Returns:
        A set containing every character of ``string.punctuation`` except
        the hyphen and the underscore.
    """
    excluded = {"-", "_"}
    return {ch for ch in string.punctuation if ch not in excluded}


# Module-level punctuation set consulted by is_valid_word
puncs_set = gen_puncs()

In [4]:
# Import the phraser model and word2vec model
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import pandas as pd

In [5]:
# Define a function to tokenize a doc into sentences and lemmatize them
# Return a list of tokenized sentences, where each sentence is a list of lemmatized tokens
def get_tokenized_lemmatized_sents(text):
    """Split a document into sentences and lemmatize every token.

    The text is split into sentences, each sentence is word-tokenized and
    lower-cased, the tokens are POS-tagged, and each token is lemmatized
    with the WordNet part of speech derived from its tag.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of sentences, where each sentence is a list of lemmatized,
        lower-cased tokens.
    """
    def to_wordnet_pos(treebank_tag):
        # Translate the leading letter of the tag into a WordNet POS constant;
        # any tag that matches none of the prefixes is treated as a noun.
        prefix_to_pos = (
            ('J', wordnet.ADJ),
            ('V', wordnet.VERB),
            ('N', wordnet.NOUN),
            ('R', wordnet.ADV),
        )
        for prefix, pos in prefix_to_pos:
            if treebank_tag.startswith(prefix):
                return pos
        return wordnet.NOUN

    def lemmatize_sentence(tokens):
        # Tag the whole sentence at once, then lemmatize token by token
        return [wnl.lemmatize(token, to_wordnet_pos(tag)) for token, tag in pos_tag(tokens)]

    lemm_sents = []
    for sentence in sent_tokenize(text):
        # Tokens are lower-cased before tagging and lemmatization
        lowered_tokens = [token.lower() for token in word_tokenize(sentence)]
        lemm_sents.append(lemmatize_sentence(lowered_tokens))
    return lemm_sents

In [6]:
# Define a function to convert all docs into a list of tokenized sentences, where each sentence is a list of lemmatized tokens 
def get_unigram_sents(docs):
    """Flatten a collection of documents into one list of lemmatized sentences.

    Args:
        docs: An iterable of raw document strings.

    Returns:
        One list holding the tokenized, lemmatized sentences of every
        document, in document order.
    """
    return [
        sent
        for doc in docs
        for sent in get_tokenized_lemmatized_sents(doc)
    ]

In [7]:
# Define a function to identify and join two-word phrases by using the Bigram model
# Take a list of tokenized sentences and output a list of tokenized sentences where some words are now joined into phrases
def get_bigram_sents(tokenized_sents_unigram, bigram_model):
    """Apply a trained bigram phrase model to tokenized sentences.

    Args:
        tokenized_sents_unigram: Tokenized sentences, each a list of tokens.
        bigram_model: A phrase model that is applied by indexing it with
            the sentences.

    Returns:
        Whatever ``bigram_model[...]`` yields for the given sentences; the
        caller treats it as an iterable of tokenized sentences in which some
        adjacent words have been joined into phrases.
    """
    return bigram_model[tokenized_sents_unigram]

In [8]:
# Define a function to identify and join three-word phrases by using the Trigram model
# Take a list of tokenized sentences with two-word phrases and output a list of tokenized sentences 
# where some words are now joined into three-word phrases
def get_trigram_sents(tokenized_sents_bigram, trigram_mod):
    """Apply a trained trigram phrase model to bigram-phrased sentences.

    Args:
        tokenized_sents_bigram: Tokenized sentences that may already contain
            two-word phrases.
        trigram_mod: A phrase model that is applied by indexing it with the
            sentences.

    Returns:
        Whatever ``trigram_mod[...]`` yields for the given sentences; the
        caller treats it as an iterable of tokenized sentences in which some
        tokens have been joined into longer phrases.
    """
    return trigram_mod[tokenized_sents_bigram]

In [9]:
# Define a function to screen words
def is_valid_word(w):
    """Decide whether a token should be kept.

    A token is rejected when it is shorter than two characters, is a
    stopword, contains a digit, or contains any punctuation character from
    ``puncs_set`` (which excludes "_" and "-").

    Args:
        w: The token to screen.

    Returns:
        True if the token passes every check, otherwise False.
    """
    # Single-letter (or empty) tokens and stopwords are dropped outright
    if len(w) < 2 or w in stop_words:
        return False
    # Any digit anywhere in the token disqualifies it
    if any(ch.isdigit() for ch in w):
        return False
    # Valid only when the token shares no character with the punctuation set
    return puncs_set.isdisjoint(w)

# Make sure that the ngram is a valid phrase
def is_valid_ngram(ngram):
    """Check that every component of a "_"-joined phrase is a valid word.

    Args:
        ngram: A phrase token whose words are joined by "_".

    Returns:
        True if each "_"-separated part passes ``is_valid_word``; False as
        soon as one part (for example a stopword) fails.
    """
    return all(is_valid_word(part) for part in ngram.split("_"))

# Break invalid phrases back into their component words, e.g., converting "the_people" to "the" and "people"
def break_invalid_phrases(tokenized_sent):
    """Split phrase tokens that contain an invalid word back into single words.

    Args:
        tokenized_sent: A list of tokens, some of which may be "_"-joined
            phrases.

    Returns:
        A new list of tokens in the original order. Tokens without "_" and
        phrases accepted by ``is_valid_ngram`` are kept whole; every other
        phrase is replaced by its "_"-separated parts.
    """
    flattened = []
    for token in tokenized_sent:
        # is_valid_ngram is only consulted for tokens that actually contain "_"
        keep_whole = ("_" not in token) or is_valid_ngram(token)
        pieces = [token] if keep_whole else token.split("_")
        flattened.extend(pieces)
    return flattened
    

In [10]:
# Define a function to generate the final output for training the word2vec model
# The output is a list of tokenized sentences, where the tokens are lemmatized and phrased, 
# and certain tokens (e.g., punctuations, numbers, and stop words) are removed    
def get_phrased_sents(docs, min_count=1, threshold=5):
    """Turn raw documents into cleaned, phrased sentences for word2vec training.

    The documents are lemmatized into unigram sentences, a phrase model is
    trained and applied to join two-word phrases, a second phrase model is
    trained on that output and applied to join longer phrases, phrases that
    contain an invalid word are broken apart again, and finally every token
    rejected by ``is_valid_word`` is removed.

    Args:
        docs: An iterable of raw document strings.
        min_count: Forwarded unchanged to both ``Phrases`` models.
        threshold: Forwarded unchanged to both ``Phrases`` models.

    Returns:
        A list of sentences, each a list of the tokens that survived
        screening.
    """
    unigram_sents = get_unigram_sents(docs)

    # First pass: learn phrases over single words and apply them
    bigram_mod = Phraser(Phrases(unigram_sents, min_count=min_count, threshold=threshold))
    bigram_sents = get_bigram_sents(unigram_sents, bigram_mod)

    # Second pass: learn phrases over the already-phrased sentences and apply them
    trigram_mod = Phraser(Phrases(bigram_sents, min_count=min_count, threshold=threshold))
    trigram_sents = get_trigram_sents(bigram_sents, trigram_mod)

    cleaned_sents = []
    for sent in trigram_sents:
        # Undo phrases containing an invalid word, then drop the invalid tokens
        unbroken = break_invalid_phrases(sent)
        cleaned_sents.append([tok for tok in unbroken if is_valid_word(tok)])
    return cleaned_sents

In [11]:
# Define a function to train a word2vec model
def train_w2v_model(sents, vector_size=100, window=5, min_count=5, workers=3, epochs=3, seed=1):
    """Train a gensim Word2Vec model on tokenized sentences.

    Args:
        sents: Tokenized sentences, each a list of string tokens.
        vector_size: Forwarded unchanged to ``Word2Vec``.
        window: Forwarded unchanged to ``Word2Vec``.
        min_count: Forwarded unchanged to ``Word2Vec``.
        workers: Forwarded unchanged to ``Word2Vec``.
        epochs: Forwarded unchanged to ``Word2Vec``.
        seed: Random seed forwarded to ``Word2Vec``. The default of 1 is
            gensim's own default, so callers that do not pass it get the
            same configuration as before this parameter existed.

    Returns:
        The trained ``Word2Vec`` model.

    Note:
        Per the gensim documentation, a fully reproducible run additionally
        requires ``workers=1`` and a fixed ``PYTHONHASHSEED``; setting
        ``seed`` alone does not remove thread-scheduling jitter.
    """
    return Word2Vec(
        sents,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
        seed=seed,
    )

In [12]:
# Define a function to get the most similar words for a given word
# Return a dataframe of all words whose absolute cosine similarity with the lookup word is at least min_score.
# The columns of the dataframe are similar word ("SimWord"), its cosine similarity score with the lookup word ("SimScore"),
# and how many times the similar word appears in the corpus ("SimWordFreq").
def get_most_similar_words(w2v_model, lookup_word, min_score=0.5):
    """List the vocabulary words most similar to a lookup word.

    Args:
        w2v_model: A trained model exposing its word vectors as ``.wv``.
        lookup_word: The word whose neighbours are requested.
        min_score: Minimum absolute similarity score a word needs in order
            to be included.

    Returns:
        A DataFrame with the columns "SimWord" (the similar word),
        "SimScore" (its similarity score with the lookup word) and
        "SimWordFreq" (the word's "count" attribute in the model), in the
        order returned by ``most_similar``.
    """
    vectors = w2v_model.wv
    # Request as many neighbours as there are vocabulary entries so that topn never cuts the list short
    ranked = vectors.most_similar(positive=[lookup_word], topn=len(vectors))

    rows = []
    for word, score in ranked:
        # Screen on the absolute score, so strongly negative similarities are kept too
        if abs(score) >= min_score:
            rows.append([word, score, vectors.get_vecattr(word, "count")])
    return pd.DataFrame(rows, columns=["SimWord", "SimScore", "SimWordFreq"])

In [13]:
# A small sample of human capital (HC) disclosures
docs = [
    "The sustainability of Abbotts business depends on attracting, engaging and developing talented people with diverse backgrounds who share Abbotts mission to help people live their healthiest possible lives. Abbott provides its employees opportunities to grow and develop their careers, market competitive compensation and benefit programs, and the satisfaction of being part of a global company dedicated to improving health in more than 160 countries. As of December 31, 2020, Abbott employed approximately 109,000 people, 70% of whom were employed outside of the U.S. Women represented 47% of Abbotts U.S. workforce, 45% of its global workforce, and 39% of its managers.The health, safety and wellness of its employees is an Abbott priority embedded at every level of its business.  Abbotts integrated Environmental, Health and Safety organization governs health, safety and wellness at Abbotts facilities. Abbott also maintains global policies and standards for managing employee health and safety. Abbott takes a holistic approach to employee well-being. Abbotts global wellness programs are designed to meet the unique needs of employees across businesses and geographies and offer a wide range of programs, including supporting the mental, financial and physical health of employees and their families. For example, for over 20 years, Abbott has annually offered Exercise Across Abbott, which is a four-week physical wellness program that encourages employees to team up with colleagues and track how many minutes they exercise each day. Over 22,000 Abbott employees across 72 countries took part in 2020. During the COVID-19 pandemic, Abbott has taken aggressive steps to limit exposure and enhance the safety of facilities for its employees, including implementing mandatory temperature screening and social distancing, providing and requiring the use of personal protective equipment, and at most U.S. facilities, onsite COVID-19 testing. 
Abbott has an integrated global talent management process that is designed to identify and assess talent across the organization and provide equal and consistent opportunities for employees to develop their skills. All levels of employees participate in Abbotts annual performance management process to create development plans that support their particular career objectives, and Abbott provides a broad range of training, mentoring and other development opportunities to help its employees meet these objectives. The board of directors conducts an annual Talent Management Review, focusing on development of talent, diversity, and succession planning for critical positions. Similar reviews take place at every level of Abbott to develop talent and diversity across the organization. Abbott is committed to developing a workplace that is inclusive for all. Abbott ties executive compensation to human capital management, including diversity outcomes, to sustain an inclusive culture and the fair and balanced treatment of Abbotts employees. Abbotts employee networks play an important role in building an inclusive culture across all Abbott operations.  A member of Abbotts senior management serves as a sponsor for each of these networks, helping to align their objectives with Abbotts business strategies. Abbott has ten such networks, which are: Advancing Professionals Network (supporting early career employees), Asian Leadership and Cultural Network, Black Business Network, Flex Network (employees with part-time and flexible schedules), LA VOICE Network (supporting Hispanic and Latino employees), People with Disabilities Network, PRIDE (supporting LGBTQ employees), Veterans Network, Women Leaders of Abbott, and Women in STEM. Abbott offers professional development programs, which provide recent college graduates the opportunity to rotate through different areas of Abbott, often with the chance to work outside their home country. In 2020, 52% of the participants were women. 
Also, Abbott hosts hundreds of college students for paid internships. In 2020, 55% of the U.S. interns were women and 39% were minorities. Further, Abbott has operated a STEM internship program for high school students in the U.S. since 2012. The programs objective is to increase the number of students pursuing STEM-related careers and contribute to a more diverse talent pipeline for Abbott. In 2020, 58% of the STEM interns were women and 71% were minorities. Abbott is committed to building, retaining, and motivating a diverse talent pipeline that can meet the current and future needs of its businesses. To that end, Abbott provides market competitive compensation, healthcare benefits, pension and/or retirement savings plans, and several programs to facilitate employees building an ownership stake in Abbott, including a global long-term incentive program for employees generally beginning at the manager level. Abbott also has procedures and processes focused on providing employees equitable compensation, regardless of race or gender or other personal characteristics.",
    "We have a global and varied workforce, with major employee centers in the U.S., Canada, U.K. and Romania. As of the end of 2020, we employed approximately 17,000 employees within our business globally with approximately 10,000 within our North America segment and 7,000 within our Europe segment. Of the 17,000 employees, approximately 660 of our employees are in our Global Business Centers based in Milwaukee, Wisconsin and Bucharest, Romania. As of the end of 2020, approximately 33% and 29% of our North America and Europe workforces, respectively, are represented by trade unions or councils, which are subject to collective bargaining agreements, which come due for renegotiation from time to time. The Company strives to be a provider of meaningful experiences for its employees and a safe and healthy workplace for all employees. We believe that building a strong and diverse workforce is a significant contributor to our success as a business and to deliver on our purpose, and that we value and respect our differences. We believe that diversity with inclusion is the key to collaboration and a winning team culture. A significant component of the revitalization plan announced in October 2019 was the launch of a refreshed purpose (uniting people to celebrate all life’s moments), ambition (first choice for our people, consumers and customers) and shared company values (the first of which is Putting People First), all designed with a purpose of shifting the culture of the organization to drive stronger employee engagement and business engagement. With the overarching goals described in the preceding paragraph as guides, the leadership team and the chief people and diversity officers for the North America and Europe business units are tasked with managing all employment-related matters including recruitment, retention, leadership and development, compensation planning, succession planning, performance management, and diversity and inclusion. 
The Compensation Human Resource Committee of the Board of Directors is responsible for establishing and reviewing the overall compensation philosophy of the Company and providing oversight on certain human capital matters, including the Company’s talent retention and development, leadership development, talent pipeline, programs and systems for performance management and diversity and inclusion initiatives. The Audit Committee oversees the Company’s risk management program to identify and mitigate potential risks, including human capital issues. The Board of Directors then receives regular reports and recommendations from management and the board committees to help guide the Company’s strategy on retaining and developing a diverse and talented workforce. In North America, we promote and maintain employee resource groups for a number of different communities in our employee population - by race/ethnicity, by gender, LGBTQ+, early professionals, young families, and veterans, amongst others. We encourage participation in these groups as we believe it provides an open forum for individual employees who may share similar concerns or experiences. We also promote and emphasize leadership and development opportunities for our employees which includes our First Choice Learning Center, in-person and online training programs, and experiential training opportunities to encourage and promote employee health and safety, assist in building core competencies, learning best practices and developing leadership capabilities. As we work to a more diverse workforce and management team, the Company has developed programs to encourage the recruitment, retention and training of diverse leaders and working to ensure we have a highly skilled and diverse workforce. We track and monitor our progress on metrics of gender and race, particularly in the U.S., though data on race is not tracked in all jurisdictions. We aim to ensure that our employees have a healthy and safe work environment. 
Our supply chain has adopted and implemented a framework we call world class supply chain 2.0 at many of our brewery and other locations. As part of that framework, the Company's environment, health and safety policy guides the Company’s efforts in maintaining safe and healthy workplaces where we take a proactive approach to the identification and control of environment, health and safety risks. We work to improve our Environment, Health and Safety (EHS) performance through methodologies that aim to prevent workplace injuries and illness, and reduce environmental impacts of our operations. Our safety focus was evident during our response to the coronavirus pandemic where we implemented additional health and safety measures in the breweries and our distribution centers, ensuring these federally designated essential operations could continue to operate and we could protect our employees. We enhanced our cleaning protocols at the majority of our facilities including enhanced sanitization, social distancing, temperature screenings, cloth facemasks and hand sanitizers, instituted a paid leave coronavirus policy and program, adopted a voluntary unpaid leave program, and expanded access to virtual healthcare, remote fitness and wellness support and to our employee assistance program.",
    "At RPM, we understand that our company is only as strong as the team behind it. With the consistent support and dedication of leadership at all levels, we foster an environment that supports our associates as individuals and helps them thrive. Incorporating sustainable best practices in professional development, benefits, health and safety, and community involvement ensures that we can continue to hire the best associates and retain them throughout the course of their careers. It is critical to our long-term success to develop our internal talent.   Our Global Organizational Leadership Development (GOLD) Team is charged with creating a leadership-led learning culture across RPM.  The GOLD Team has developed several training programs to support development which include Leaders of the Future, RPM University, Strategic Leader Staff Rides, and partnering with the Center for Creative Leadership.  Since the inception of these programs the Company has seen many participants advance their careers, and the retention of participants has been greater than 90%. Our leadership has long understood that to attract and retain top talent, and to share the benefits of a successful business, we must maintain a premium benefits program for its associates. For U.S. associates, we offer an attractive benefits package, including defined benefit pension plans, medical, telehealth, tuition reimbursement and an employer-matched 401(k).  We also offer an Employee Assistance Program (EAP) which focuses on behavioral health and also provides resources for financial and legal matters. Mental health support has been key to employees during the Covid pandemic.  Employees can get this support through the EAP as well as through telehealth and we have seen an increase in the use of such services. 
Similar ancillary benefits are offered to our Canadian associates, and employees of our other foreign subsidiaries receive benefits coverage, to the extent deemed appropriate, through plans that meet local requirements. At RPM, we have built our workforce, in part, through our commitment to create a diverse and inclusive culture. While there are many examples in our corporate practices, policies, and internal and external programs, we are particularly proud of our Tremco/WTI partnership with the Department of Corrections.  This program provides roof training to designated inmates while still incarcerated.  Upon release, WTI guarantees employment opportunities to qualified participants within a pay range of $16 to $23 per hour, plus benefits, depending on roof competency. We follow many best practices to ensure our associates come to work feeling empowered to safely do their jobs.   As part of our environmental management system, we continuously educate and train to institutionalize our health and safety values, set and monitor health and safety objectives, conduct regular risk assessments and process hazard and root cause analysis, and actively enforce accident prevention and reporting policies. As of May 31, 2021, we employed 15,490 persons, of whom approximately 648 were represented by unions under contracts which expire at varying times in the future.  We believe that all relations with employees and their unions are good.",
    "We believe that, beyond being essential to our operations, our people have inestimable worth independent of our business. As outlined in our Human Rights Policy (see, www.american-vanguard.com under ESG tab), we believe that it is fundamental to our corporate responsibility and, indeed, to our humanity, that we recognize, respect and nurture the freedom and dignity of all persons. Accordingly, we have insinuated that belief throughout the fabric of our operations in our approach toward our employees. Indeed, the first two core values underlying our commitment to sustainability (see, Update to Corporate Sustainability Report, www.american-vanguard.com under ESG tab) are “Safety First” – which is a culture that begins with highly-regulated manufacturing plants, continues into the design of science-backed products and extends into market-leading delivery systems – and “Making a Difference” – under which, by rewarding achievement and giving our employees a voice, we attract diverse employees who want to make a difference in their careers, in the company and in the communities that we serve. Our Human Capital program consists of the following elements: Board Oversight – through our Nominating and Corporate Governance Committee (“N&CG”), our board of directors oversees human capital-related risks and opportunities. At least annually, the N&CG Committee requires that management update succession planning for key executives, including with respect to planning for the future with a commitment toward diversity, equity and inclusion. Strategy – the Company’s human capital strategy has two primary elements: giving our employees a voice and providing them with generous benefits (including an unrivalled health benefits plan and awards of common stock to the entire workforce). As we have covered in our Update to Corporate Sustainability Report, our company is a destination for highly qualified employees who are drawn to a workplace where they can make a difference. 
Our managerial approach is that our functions work in a collaborative manner – cutting across departmental lines to arrive at better solutions with a high level of efficiency. This strategy has enabled the Company to maximize retention, even in an increasingly competitive employment market. Compensation – as mentioned in our Strategy above, compensation is an essential element of our human capital approach. During the pandemic in the midst of the so-called “Great Resignation” that affected many industries, we took measures to incentivize our workforce to remain with us, including across-the-board wage increases in certain of our manufacturing facilities. To the extent that our highly skilled personnel are being recruited by other companies, we endeavor to keep an open conversation on their needs and, where appropriate, have increased their total compensation (through a combination of wage, stock and/or vacation) to retain them. Voice – our management style is to solicit good ideas from employees, involve them in implementation and give them recognition for ideas that succeed. For example, personnel from virtually any department (be it sales, technology, product development or otherwise) can submit ideas to our Innovation Review Committee (“IRC”) for consideration and potential funding. The IRC continues to be a source of new product ideas that has enabled us to launch several new formulations and other solutions on an annual basis. Similarly, our Beekeeper platform is a company-only social media channel on which employees anywhere in the world can report on their accomplishments, commendations of others and local developments. Diversity, Equity and Inclusion (DEI) – the Company continues to expand its DEI program. In 2021, with the retirement of Lawrence Clark from our board of directors, the Company called upon the Latino Corporate Directors Association to help recruit Marisol Angelini as a new director. With Ms. 
Angelini’s addition to the board, three of nine members (33%) of our board are female and three of nine (again, 33%) are from underrepresented groups (LGBTQ, Middle Eastern and Latinx). Based upon the Company’s most current EEO-1 (“Equal Employment Opportunity”) Report, representation of African Americans in our domestic workforce exceeds the prevalence of that group in the national population, while representation of Hispanic personnel is slightly below the national average. Nevertheless, during 2022, the Company is working on a plan to advance its commitment to DEI throughout the workforce. The Company employed 804 employees as of December 31, 2021, and 771 employees as of December 31, 2020. From time to time, due to the seasonality of its business, AVD uses temporary contract personnel to perform certain duties primarily related to packaging of its products. None of the Company’s employees are subject to a collective bargaining agreement. The Company believes it maintains positive relations with its employees."
    ]

In [14]:
# Generate phrased and tokenized sentences
# min_count and threshold are passed on to both phrase models inside get_phrased_sents
phrased_sents = get_phrased_sents(docs, min_count=2, threshold=10)
# Train a w2v model (using a small vector_size due to the small sample size)
# NOTE(review): workers=5 trains with several threads; gensim documents that exact
# reproducibility needs workers=1 - confirm whether the scores shown below must be reproducible
w2v_mod = train_w2v_model(phrased_sents, vector_size=30, window=5, min_count=3, workers=5, epochs=5)

In [15]:
# Get the most similar words for each seed word
# NOTE(review): presumably a seed word missing from the trained vocabulary makes
# most_similar raise and stops the loop - confirm against the gensim version in use
seed_words = ["employee", "diversity", "talent", "career", "gender"]
for seed_word in seed_words:
    # Keep only words whose absolute similarity score with the seed word is at least 0.3
    df = get_most_similar_words(w2v_mod, seed_word, min_score=0.3)
    print(seed_word)
    # Print only the first rows of the result for each seed word
    print(df.head())

employee
        SimWord  SimScore  SimWordFreq
0         level  0.533215            6
1     diversity  0.472661            9
2  compensation  0.455126           10
3        global  0.423858            9
4     committee  0.415127            5
diversity
    SimWord  SimScore  SimWordFreq
0    people  0.589679            9
1    global  0.497653            9
2  employee  0.472661           52
3    report  0.462358            5
4  pandemic  0.405816            4
talent
             SimWord  SimScore  SimWordFreq
0      approximately  0.492184            6
1        competitive  0.490279            3
2              track  0.480771            3
3  inclusive_culture  0.391258            3
4             strong  0.315198            3
career
   SimWord  SimScore  SimWordFreq
0    first  0.532908            6
1    voice  0.400650            4
2   leader  0.379823            4
3   matter  0.377745            3
4  country  0.372866            3
gender
      SimWord  SimScore  SimWordFreq
0  leadersh