<a href="https://colab.research.google.com/github/Rishita32/RTX_Case_Study/blob/main/RTX_Case_Study_BERTopic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Packages

In [1]:
! pip install bertopic



In [2]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 600)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.6)
import spacy

from google.colab import drive

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load Dataset

In [3]:
drive.mount('/content/drive', force_remount=True)
df=pd.read_excel('drive/MyDrive/rtx_dataset.xlsx')

Mounted at /content/drive


In [4]:
nlp = spacy.load('en_core_web_sm')

# Load Sentence Transformer

In [5]:
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
data=list(df['Text Comment'])
data

['You are very much just a number at this company. It does not matter how long you have been there or what your skill base is, you are disposable. Management is very crooked and everyone is out for themselves.',
 'Promises, promises, promises that never came to fruition. Buyer beware if you have a book of business. They will bring you over, throw teams on your largest clients, and then try to smoke you out.',
 'The company is so focused on sales, that they tape scripts to your computer and you get reprimanded for not using them with every customer. That includes the daily regulars that are well aware of the company products available to them. Management is more sales focused, than proper management. Maintaining a balanced an efficient work environment is not a priority.',
 'Having an open office with out any sound barriers makes it difficult to focus, pay increases are non existent - you are lucky to get a % every year and the culture is generally passive aggressive. Mid - level leader

# Data Preprocessing

In [7]:
def preprocess_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

df['clean_comment'] = df['Text Comment'].apply(preprocess_text)

In [8]:
comments=df['clean_comment'].to_list()
classes=df['Text Comment Sentiment'].to_list()

# Load BERTopic Model

In [9]:
topic_model = BERTopic(
    embedding_model=sentence_model,
    top_n_words=10,
    min_topic_size=5,
    n_gram_range=(1, 2)
    )

In [10]:
topics, probabilities = topic_model.fit_transform(comments)

In [11]:
# get_topics returns a dictionary with topic id and the topi words
topic_model.get_topics()

{-1: [('work', 0.003926849542720528),
  ('company', 0.003722196338739058),
  ('people', 0.0034620333270683093),
  ('good', 0.003389313505420446),
  ('great', 0.003288895869810239),
  ('opportunity', 0.003084697920853085),
  ('employee', 0.003079128598616184),
  ('culture', 0.0030497648561790144),
  ('management', 0.002976411458420122),
  ('lot', 0.0029445485217270504)],
 0: [('culture', 0.01703943522957279),
  ('company', 0.007539186327018542),
  ('amazing', 0.007422812268799951),
  ('product', 0.007380829737025279),
  ('company culture', 0.006871569348238532),
  ('work culture', 0.006779259025846712),
  ('customer', 0.006390643499797942),
  ('great', 0.005449825252823711),
  ('business unit', 0.005337700258263819),
  ('people', 0.005294229513008088)],
 1: [('promotion', 0.03170829774142554),
  ('raise', 0.014265017566813884),
  ('promote', 0.011353308327577683),
  ('promotion raise', 0.008354491958246937),
  ('appraisal', 0.007336780173225967),
  ('merit', 0.007251576049723527),
  ('l

Please Note: I tried evaluating the BERTopic model using coherence score, however ran into few issues! Apologies for that.

# Topic Visualization

In [12]:
# visualize topics per class: negative and positive sentiment
topics_per_class = topic_model.topics_per_class(comments,
    classes=classes)

fig=topic_model.visualize_topics_per_class(topics_per_class,
    top_n_topics=10, normalize_frequency = True)

fig.show()

In [13]:
fig = topic_model.visualize_topics()
fig.show()

In [14]:
# barchat to plot the key topics and their words in descending order of frequency
fig = topic_model.visualize_barchart()
fig.show()

In [15]:
#probability distribution of probabilities of each topics
fig = topic_model.visualize_distribution(probabilities)
fig.show()

#  Negative Comments Analysis

In [16]:
negative_comments = df[df['Text Comment Sentiment'] == 'Negative']['clean_comment'].tolist()

In [17]:
topic_model_neg = BERTopic(
    embedding_model=sentence_model,
    top_n_words=10,
    min_topic_size=5,
    n_gram_range=(1, 2)
    )

negative_topics, probabilities = topic_model_neg.fit_transform(negative_comments)

In [18]:
fig = topic_model_neg.visualize_barchart()
fig.show()

# Positive Comments

In [19]:
positive_comments = df[df['Text Comment Sentiment'] == 'Positive']['clean_comment'].tolist()

In [20]:
topic_model_pos = BERTopic(
    embedding_model=sentence_model,
    top_n_words=10,
    min_topic_size=5,
    n_gram_range=(1, 2)
)

positive_topics, probabilities = topic_model_pos.fit_transform(positive_comments)

In [21]:
fig = topic_model_pos.visualize_barchart()
fig.show()

# Summarize Topics- Negative Comments

In [22]:
negative_representatives = topic_model_neg.get_representative_docs()

In [23]:
from transformers import pipeline

summarizer = pipeline("summarization")

# Summarize the representative comments for each negative topic
negative_summaries = {}
for topic_id, comments in negative_representatives.items():
    comments_text = " ".join(comments) # collect comments with negative sentiment
    summary = summarizer(comments_text, max_length=150, min_length=30, do_sample=False)  # collect summaries of each topics derived from negative comments
    negative_summaries[topic_id] = summary[0]['summary_text']

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Your max_length is set to 150, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 150, but your input_length is only 81. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)
Your max_length is set to 150, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_len

In [24]:
# Create data frame to aggregate the keywords derived from negative comments, and summarize it

negative_topic_keywords = topic_model.get_topic_info()
negative_combined_summaries = []
for topic_id, summary in negative_summaries.items():
    keywords = negative_topic_keywords[negative_topic_keywords['Topic'] == topic_id]['Name'].values[0]
    negative_combined_summaries.append({
        "topic_id": topic_id,
        "keywords": keywords,
        "summary": summary,
        "sentiment": "negative"
    })

In [25]:
negative_combined_summaries_df=pd.DataFrame(negative_combined_summaries)

In [26]:
negative_combined_summaries_df.head()

Unnamed: 0,topic_id,keywords,summary,sentiment
0,-1,-1_work_company_people_good,Pay bank feel like future pay raise job dependent senior view business year time good cut cost time bad cut cost expect ridiculous bonus certain function kid fresh college year experience get automatic promotion senior leadership function people year work experience great college work role promote senior leadership .,negative
1,0,0_culture_company_amazing_product,place run ego manage director rely rely title ego tend overlook dismiss dismiss idea beneath md line business functional team director try manage director coast senior leadership leader division suppose cultivate meaningful relationship business functional leader choose throw group chaos involve low level decision despite senior leadership desire push decision .,negative
2,1,1_promotion_raise_promote_promotion raise,salary compensation model erode base salary allow partner flex compensation word lean time experience hire underpaid start low low low compare peer good bright leave hit management lack true vision upper level mean little leave learn stick partner firm firm high demand Managers staff level especially level desire free time marginal raise partner sm significant promise significant promise forget writing proof intention learn move .,negative
3,2,2_vacation_sick_benefit_insurance,Layoffs appear mainly talented lifer hire appear young inexperienced resource time tell change work week layoff company change severance policy week pay week pay year service week severance matter long year week pay great concern interested company new company change stock artificially inflate recent layoff thousand likely concern .,negative
4,3,3_benefit_salary_compensation_salary benefit,long hour especially group intern new hire ware work busy season actually end ve hear people pull nighter filing deadline bonus staff senior year mediocre public accounting bonus prepare base salary year matching year employment outdated slow company technology laptop not handle large excel file deal single day time expense reporting system .,negative


# Summarize Topics- Positive Comments

In [None]:
positive_representatives = topic_model.get_representative_docs()

positive_summaries = {}
for topic_id, comments in positive_representatives.items():
    comments_text = " ".join(comments)
    summary = summarizer(comments_text, max_length=150, min_length=30, do_sample=False)
    positive_summaries[topic_id] = summary[0]['summary_text']

Your max_length is set to 150, but your input_length is only 129. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 150, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 150, but your input_length is only 70. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 150, but your input_length is only 138. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
Y

In [31]:
# Create data frame to aggregate the keywords derived from positive comments, and summarize it
positive_topic_keywords = topic_model.get_topic_info()
positive_combined_summaries=[]
for topic_id, summary in positive_summaries.items():
    keywords = positive_topic_keywords[positive_topic_keywords['Topic'] == topic_id]['Name'].values[0]
    positive_combined_summaries.append({
        "topic_id": topic_id,
        "keywords": keywords,
        "summary": summary,
        "sentiment": "positive"
  })

In [32]:
positive_combined_summaries_df=pd.DataFrame(positive_combined_summaries)

In [33]:
positive_combined_summaries_df.head()

Unnamed: 0,topic_id,keywords,summary,sentiment
0,-1,-1_work_company_people_good,great place start career provide good experience exposure upper management client work challenge learn lot quickly people doubt smart hard work people know know fun build collaborative team environment skill learn learn learn fast big thing take away company number question see know solve figure company .,positive
1,0,0_culture_company_amazing_product,amazing culture company truly unique culture drive outstanding people work company work company people bright humble approachable passion drive result have fun work challenge stretch everyday company .,positive
2,1,1_promotion_raise_promote_promotion raise,Workforce department understaffed create volatile environment management equip handle schedule schedule vacation month advance near impossible staffing issue management intend alleviate pressure hire people fear eventual erosion .,positive
3,2,2_vacation_sick_benefit_insurance,office company great good benefit vacation day unlimited sick day stress free couple time year work life balance unlimited sick time time time sick day time . year senior leadership level week medical benefit recently well health industry initiative .,positive
4,3,3_benefit_salary_compensation_salary benefit,Base salary benefit package bit underwhelming base salary benefit pretty outstanding good see advertise total compensation package generous true consider multiply salary drop pension account let grow mutual fund .,positive
