In [1]:
#libraries
#!pip install pandas scikit-learn matplotlib

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#dataset
df = pd.read_csv('/content/glassdoor_reviews.csv')

In [4]:
#pros and cons into single review column
df['review'] = df['pros'].fillna('') + " " + df['cons'].fillna('')

In [5]:
#overall_rating to sentiment
def map_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

df['sentiment'] = df['overall_rating'].apply(map_sentiment)

In [6]:
#preprocessing
df = df[['review', 'sentiment']].dropna()

#train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42
)

In [7]:
#vectorization with expanded vocabulary + n-grams
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [8]:
#Naive Bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

#predictions
y_pred = model.predict(X_test_tfidf)

In [9]:
#accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6926908974972102
              precision    recall  f1-score   support

    negative       0.66      0.51      0.58      4364
     neutral       0.42      0.10      0.16      5375
    positive       0.72      0.95      0.82     15353

    accuracy                           0.69     25092
   macro avg       0.60      0.52      0.52     25092
weighted avg       0.64      0.69      0.63     25092



In [10]:
#stress-related words
stress_words = {
    'stress', 'stressed', 'anxious', 'overwhelmed', 'overworked', 'burned out', 'burnout',
    'pressure', 'exhausted', 'drained', 'draining', 'fatigued', 'depressed', 'tense', 'frustrated',
    'irritable', 'panicked', 'nervous', 'worried', 'uneasy', 'distraught', 'agitated',
    'restless', 'hopeless', 'disheartened', 'discouraged', 'frazzled', 'unsettled',
    'resentful', 'helpless', 'defeated', 'angsty', 'melancholic', 'disillusioned',
    'tired', 'jaded', 'beaten down', 'flustered', 'overburdened', 'fatigue', 'dismayed',
    'mentally drained', 'emotionally drained', 'physically drained', 'crushed', 'shaken',
    'lost', 'broken', 'despondent', 'distressed', 'worn out', 'shattered', 'grief-stricken'
}

In [11]:
def predict_sentiment(review):
    #Naive Bayes prediction
    review_tfidf = vectorizer.transform([review])
    prediction = model.predict(review_tfidf)[0]

    #stress-related words
    review_lower = review.lower()
    if any(word in review_lower for word in stress_words):
        print(f"Review: {review}\n Predicted Sentiment: Negative\n")
    else:
        print(f"Review: {review}\n Predicted Sentiment: {prediction}\n")

In [12]:
#test
predict_sentiment("I feel stressed and overworked all the time.")
predict_sentiment("I love the work environment and the management is supportive.")
predict_sentiment("The job is draining me emotionally and physically.")
predict_sentiment("Amazing benefits and positive work culture!")
predict_sentiment("I’m constantly on edge due to the high pressure.")
predict_sentiment("I feel burned out and exhausted.")

Review: I feel stressed and overworked all the time.
 Predicted Sentiment: Negative

Review: I love the work environment and the management is supportive.
 Predicted Sentiment: positive

Review: The job is draining me emotionally and physically.
 Predicted Sentiment: Negative

Review: Amazing benefits and positive work culture!
 Predicted Sentiment: positive

Review: I’m constantly on edge due to the high pressure.
 Predicted Sentiment: Negative

Review: I feel burned out and exhausted.
 Predicted Sentiment: Negative



In [13]:
#save
import joblib
joblib.dump(model, '/content/sentiment_model.pkl')
joblib.dump(vectorizer, '/content/tfidf_vectorizer.pkl')

['/content/tfidf_vectorizer.pkl']