In [4]:
import pandas as pd

def classify_stress_level(row):
    """Turn one record of lexical feature values into a stress-level label.

    A weighted sum of the stress-related columns is combined with a
    (negatively weighted) sum of the relief-related columns, and the total is
    bucketed into one of five labels.

    Args:
        row: Any object supporting lookup by column name (e.g. a pandas row
            or a dict) for the ten ``lex_*`` columns listed below.

    Returns:
        'Very High Stress', 'High Stress', 'Moderate Stress', 'Low Stress'
        or 'Very Low Stress'.
    """
    # Columns that raise the score; pleasantness has a negative weight, so
    # lower pleasantness raises the score.
    stress_weights = (
        ('lex_liwc_anx', 5.0),
        ('lex_liwc_anger', 4.5),
        ('lex_liwc_sad', 4.0),
        ('lex_liwc_negemo', 3.5),
        ('lex_liwc_cogproc', 2.5),
        ('lex_liwc_health', 2.0),
        ('lex_liwc_social', 1.8),
        ('lex_dal_avg_pleasantness', -2.0),
    )
    # Columns that lower the score (their weights are already negative).
    relief_weights = (
        ('lex_liwc_posemo', -1.0),
        ('lex_liwc_Authentic', -0.8),
    )

    # Accumulate strictly left to right so rounding matches a plain
    # a + b + c + ... expression.
    stress_total = 0.0
    for column, weight in stress_weights:
        stress_total += row[column] * weight

    relief_total = 0.0
    for column, weight in relief_weights:
        relief_total += row[column] * weight

    # The relief weights are negative, so adding them subtracts relief.
    combined = stress_total + relief_total

    # Bands are checked from highest to lowest; the first strict match wins.
    bands = (
        (30, 'Very High Stress'),
        (18, 'High Stress'),
        (8, 'Moderate Stress'),
        (-2, 'Low Stress'),
    )
    return next(
        (label for cutoff, label in bands if combined > cutoff),
        'Very Low Stress',
    )

# Load data
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
# The CSV must provide every lex_* column that classify_stress_level reads.
file_path = "C:/Users/WinX/Downloads/train_data.csv"
df = pd.read_csv(file_path)

# Apply classification
# axis=1 passes each row to classify_stress_level; the returned label is stored
# in a new 'stress_level' column.
df['stress_level'] = df.apply(classify_stress_level, axis=1)

# Save results
# The output path is relative to the working directory; index=False leaves the
# DataFrame index out of the file.
output_file_path = 'updated_dataset_with_stress_levels.csv'
df.to_csv(output_file_path, index=False)
print(f"Updated dataset with stress levels saved to: {output_file_path}")

Updated dataset with stress levels saved to: updated_dataset_with_stress_levels.csv


In [2]:
!pip install empath

Collecting empath
  Using cached empath-0.89.tar.gz (57 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: empath
  Building wheel for empath (pyproject.toml): started
  Building wheel for empath (pyproject.toml): finished with status 'done'
  Created wheel for empath: filename=empath-0.89-py3-none-any.whl size=57849 sha256=a7a83b7c652e8ab0e3b57736ccad4d0ed88746655f324bb26b73143f77b2b200
  Stored in directory: c:\users\winx\appdata\local\pip\cache\wheels\92\b3\83\9eb2c6199881e2385a59d99bd911363475060ebeb4bdb27242
Successfully built empath
Installing collected packages: empath
Successfully installed empath-0.89




In [3]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, nltk
Successfully installed joblib-1.4.2 nltk-3.9.1




In [5]:
from empath import Empath
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [6]:
# Download necessary NLTK data
# 'vader_lexicon' is the resource used by NLTK's VADER SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
# NOTE(review): 'punkt' is presumably fetched for word_tokenize, which no visible
# cell actually calls — confirm it is still needed.
nltk.download('punkt')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\WinX\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\WinX\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Initialize tools
# Both objects are notebook-level globals; calculate_liwc_features reads them
# by name, so this cell must run before that function is called.
lexicon = Empath()
vader = SentimentIntensityAnalyzer()

In [8]:
def calculate_liwc_features(text):
    """Build stand-ins for the LIWC/DAL feature columns from raw text.

    Category scores come from the global Empath ``lexicon`` and sentiment
    scores from the global VADER analyzer ``vader``; both must be initialised
    before this function is called.

    Args:
        text: The text to analyse.

    Returns:
        dict keyed by the ``lex_*`` column names used by the stress scoring
        code. Empath categories missing from the analysis count as 0.0.
    """
    # With normalize=True, Empath can return None instead of a dict when the
    # text yields no tokens (e.g. an empty string). Fall back to an empty
    # mapping so the lookups below return their 0.0 default instead of failing.
    analysis = lexicon.analyze(text, normalize=True) or {}
    sentiment = vader.polarity_scores(text)

    # NOTE(review): confirm that every category name used here exists in
    # lexicon.cats; a name Empath does not know is silently scored as 0.0.
    features = {
        'lex_liwc_anx': analysis.get('nervousness', 0.0),
        'lex_liwc_anger': analysis.get('anger', 0.0),
        'lex_liwc_sad': analysis.get('sadness', 0.0),
        'lex_liwc_negemo': sentiment['neg'],
        'lex_liwc_posemo': sentiment['pos'],
        'lex_liwc_cogproc': analysis.get('cognitive_mechanics', 0.0),
        'lex_liwc_health': analysis.get('health', 0.0),
        'lex_liwc_social': analysis.get('social', 0.0),
        'lex_liwc_Authentic': analysis.get('honesty', 0.0),
        'lex_dal_avg_pleasantness': sentiment['compound']
    }
    return features


In [9]:
def classify_stress(text):
    """Classify raw text into one of five stress levels.

    Features are derived with calculate_liwc_features and combined with fixed
    weights; the total is then bucketed into a label.

    Args:
        text: The text to classify.

    Returns:
        'Very High Stress', 'High Stress', 'Moderate Stress', 'Low Stress'
        or 'Very Low Stress'.
    """
    feats = calculate_liwc_features(text)

    # Pull every feature into a local first so the formulas below read cleanly.
    anx = feats['lex_liwc_anx']
    anger = feats['lex_liwc_anger']
    sad = feats['lex_liwc_sad']
    negemo = feats['lex_liwc_negemo']
    cogproc = feats['lex_liwc_cogproc']
    health = feats['lex_liwc_health']
    social = feats['lex_liwc_social']
    pleasantness = feats['lex_dal_avg_pleasantness']
    posemo = feats['lex_liwc_posemo']
    authentic = feats['lex_liwc_Authentic']

    # Stress component; pleasantness is weighted negatively.
    stress_part = (
        anx * 5.0
        + anger * 4.5
        + sad * 4.0
        + negemo * 3.5
        + cogproc * 2.5
        + health * 2.0
        + social * 1.8
        + pleasantness * (-2.0)
    )
    # Relief component; the weights are negative, so adding it lowers the total.
    relief_part = posemo * (-1.0) + authentic * (-0.8)
    total = stress_part + relief_part

    # Guard clauses from the highest band down; each bound is exclusive.
    if total > 30:
        return 'Very High Stress'
    if total > 18:
        return 'High Stress'
    if total > 8:
        return 'Moderate Stress'
    if total > -2:
        return 'Low Stress'
    return 'Very Low Stress'


In [None]:
# Test with a sample sentence
# Smoke test: classify one hand-written sentence and print the resulting label.
text = "I am really anxious about my health and feeling overwhelmed by work."
print(f"Stress Level: {classify_stress(text)}")


In [11]:
from empath import Empath
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Fetch the NLTK resources; 'vader_lexicon' is the one used by the VADER analyzer.
# NOTE(review): 'punkt' is presumably for word_tokenize, which is imported but
# not called in the visible cells — confirm it is still needed.
nltk.download('vader_lexicon')
nltk.download('punkt')

# Notebook-level globals read by calculate_liwc_features.
lexicon = Empath()
vader = SentimentIntensityAnalyzer()

def calculate_liwc_features(text):
    """Build stand-ins for the LIWC/DAL feature columns from raw text.

    Category scores come from the global Empath ``lexicon`` and sentiment
    scores from the global VADER analyzer ``vader``.

    Args:
        text: The text to analyse.

    Returns:
        dict keyed by the ``lex_*`` column names used by classify_stress.
        Any score missing from the underlying analysis defaults to 0.0.
    """
    # With normalize=True, Empath can return None instead of a dict when the
    # text yields no tokens (e.g. an empty string). Fall back to an empty
    # mapping so the .get() defaults apply instead of raising AttributeError.
    analysis = lexicon.analyze(text, normalize=True) or {}
    sentiment = vader.polarity_scores(text)

    # NOTE(review): confirm that every category name used here exists in
    # lexicon.cats; a name Empath does not know is silently scored as 0.0.
    features = {
        'lex_liwc_anx': analysis.get('nervousness', 0.0),
        'lex_liwc_anger': analysis.get('anger', 0.0),
        'lex_liwc_sad': analysis.get('sadness', 0.0),
        'lex_liwc_negemo': sentiment.get('neg', 0.0),
        'lex_liwc_posemo': sentiment.get('pos', 0.0),
        'lex_liwc_cogproc': analysis.get('thinking', 0.0),
        'lex_liwc_health': analysis.get('health', 0.0),
        'lex_liwc_social': analysis.get('social', 0.0),
        'lex_liwc_Authentic': analysis.get('trust', 0.0),
        'lex_dal_avg_pleasantness': sentiment.get('compound', 0.0)
    }
    return features

def classify_stress(text):
    """Classify raw text into one of five stress levels.

    Uses the features from calculate_liwc_features with this cell's weights
    and thresholds.

    Args:
        text: The text to classify.

    Returns:
        'Very High Stress', 'High Stress', 'Moderate Stress', 'Low Stress'
        or 'Very Low Stress'.
    """
    feats = calculate_liwc_features(text)

    # (feature, weight) pairs for the stress component, in accumulation order.
    stress_terms = (
        ('lex_liwc_anx', 6.0),
        ('lex_liwc_anger', 5.5),
        ('lex_liwc_sad', 5.0),
        ('lex_liwc_negemo', 4.5),
        ('lex_liwc_cogproc', 3.5),
        ('lex_liwc_health', 3.0),
        ('lex_liwc_social', 2.5),
        ('lex_dal_avg_pleasantness', -3.0),
    )
    # (feature, weight) pairs for the relief component; weights are negative.
    relief_terms = (
        ('lex_liwc_posemo', -1.5),
        ('lex_liwc_Authentic', -1.2),
    )

    # Plain left-to-right accumulation keeps rounding identical to a chained
    # a + b + c + ... expression.
    stress_total = 0.0
    for name, weight in stress_terms:
        stress_total += feats[name] * weight

    relief_total = 0.0
    for name, weight in relief_terms:
        relief_total += feats[name] * weight

    total = stress_total + relief_total

    # Highest band first; every lower bound is exclusive.
    for lower_bound, label in (
        (20, 'Very High Stress'),
        (12, 'High Stress'),
        (4, 'Moderate Stress'),
        (-5, 'Low Stress'),
    ):
        if total > lower_bound:
            return label
    return 'Very Low Stress'

# Smoke test on one hand-written sentence; the recorded run printed
# "Stress Level: Low Stress".
# NOTE(review): an explicitly anxious sentence landing in "Low Stress" suggests
# the thresholds and the feature value ranges may not match — worth checking.
text = "I am really anxious about my health and feeling overwhelmed by work."
print(f"Stress Level: {classify_stress(text)}")


Stress Level: Low Stress


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\WinX\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\WinX\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
!pip install scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
# NOTE(review): "d1.csv" is read from the working directory and must contain
# 'text' and 'stress_level' columns; its relation to the CSV written earlier in
# this notebook is not visible here — confirm.
df = pd.read_csv("d1.csv")

# Extract text and labels
texts = df['text'].tolist()
# Encode the five label strings as integers 0-4, ordered from least to most stress.
# NOTE(review): Series.map leaves NaN for any value missing from the dict, so an
# unexpected label string would surface later as a NaN label — consider validating.
labels = df['stress_level'].map({
    "Very Low Stress": 0,
    "Low Stress": 1,
    "Moderate Stress": 2,
    "High Stress": 3,
    "Very High Stress": 4
}).tolist()

# Split data (stratified)
# 80/20 split with a fixed random_state; stratify=labels keeps the class
# proportions the same in both parts.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)



Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.1 MB 2.2 MB/s eta 0:00:05
   ---- ----------------------------------- 1.3/11.1 MB 2.2 MB/s eta 0:00:05
   ------ --------------------------------- 1.8/11.1 MB 2.3 MB/s eta 0:00:04
   -------- ------------------------------- 2.4/11.1 MB 2.4 MB/s eta 0:00:04
   ---------- ----------------------------- 2.9/11.1 MB 2.5 MB/s eta 0:00:04
   ------------- -------------------------- 3.7/11.1 MB 2.5 MB/s eta 0:00

In [2]:
from transformers import AutoTokenizer

# Use DistilBERT (smaller than BERT/RoBERTa)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize with shorter max_length (reduces GPU memory)
# truncation=True cuts every text to at most max_length tokens; padding=True pads
# the shorter sequences so all encodings from one call share the same length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
from torch.utils.data import Dataset

class StressDataset(Dataset):
    """Torch dataset that pairs tokenizer encodings with their labels.

    Each item is a dict holding one tensor per encoding field plus a
    "labels" tensor for the same example.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example values
        # labels: one label per example, in the same order as the encodings
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert the idx-th entry of every encoding field to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized train/test splits so they can be indexed example by example.
train_dataset = StressDataset(train_encodings, train_labels)
test_dataset = StressDataset(test_encodings, test_labels)

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation prediction pair.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        dict with a single "accuracy" entry: the accuracy of the argmax over
        the last logits axis against ``labels``.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest score for each example.
    chosen = np.argmax(raw_scores, axis=-1)
    acc = accuracy_score(gold, chosen)
    return {"accuracy": acc}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load DistilBERT model with 5 classes
# num_labels=5 matches the five stress-level labels encoded as 0-4.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=5
)

# Training arguments (optimized for low memory)
# Evaluation and checkpointing both happen once per epoch; with
# load_best_model_at_end=True the checkpoint with the highest "accuracy" is
# restored when training finishes.
# NOTE(review): newer transformers releases appear to rename
# evaluation_strategy to eval_strategy — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Key returned by compute_metrics
    greater_is_better=True,
    report_to="none",
)

# Initialize Trainer
# NOTE(review): eval_dataset is the test split, so the "best" checkpoint is
# selected on the same data that is reported as test performance later; a
# separate validation split would avoid that optimistic bias.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # Supplies the "accuracy" metric at each evaluation
)

# Train
trainer.train()

In [24]:
# Evaluate on test data
# Called without arguments, so the Trainer's own eval_dataset is used (the
# test split passed when the Trainer was created).
results = trainer.evaluate()
print("Evaluation Results:", results)

100%|██████████| 57/57 [00:02<00:00, 19.17it/s]

Evaluation Results: {'eval_loss': 1.0518702268600464, 'eval_accuracy': 0.6189427312775331, 'eval_runtime': 3.3101, 'eval_samples_per_second': 137.158, 'eval_steps_per_second': 17.22, 'epoch': 2.0}





In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

In [None]:
# Predict on test dataset
test_predictions = trainer.predict(test_dataset)
# NOTE: this rebinds the notebook-level name `labels` (previously the full label
# list built when the data was loaded) to the test-set label ids.
logits, labels = test_predictions.predictions, test_predictions.label_ids
# The predicted class is the index of the largest logit for each example.
predicted_labels = np.argmax(logits, axis=-1)

# Generate classification report
# target_names are listed in label-id order 0-4.
# NOTE(review): if a class is absent from both labels and predictions, the five
# names may no longer line up with the classes found — consider also passing
# labels=[0, 1, 2, 3, 4]; confirm against the scikit-learn docs.
report = classification_report(
    labels, predicted_labels, 
    target_names=[
        "Very Low Stress", "Low Stress", "Moderate Stress",
        "High Stress", "Very High Stress"
    ]
)
print("Classification Report:\n", report)

# Calculate accuracy
accuracy = accuracy_score(labels, predicted_labels)
print(f"Test Accuracy: {accuracy:.4f}")

In [25]:
# Save model and tokenizer
# Both go to the same directory so they can be reloaded together with
# from_pretrained("./saved_bert_model").
model.save_pretrained("./saved_bert_model")
tokenizer.save_pretrained("./saved_bert_model")

('./saved_bert_model\\tokenizer_config.json',
 './saved_bert_model\\special_tokens_map.json',
 './saved_bert_model\\vocab.txt',
 './saved_bert_model\\added_tokens.json',
 './saved_bert_model\\tokenizer.json')

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load saved model and tokenizer
# Rebinds the notebook-level `model` and `tokenizer` names to the copies stored
# in ./saved_bert_model; predict_stress reads these globals.
model = AutoModelForSequenceClassification.from_pretrained("./saved_bert_model")
tokenizer = AutoTokenizer.from_pretrained("./saved_bert_model")

def predict_stress(text):
    """Predict the stress-level label for a text with the reloaded model.

    Relies on the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        text: The text to classify. The prediction is read with ``.item()``,
            so one text per call is expected.

    Returns:
        One of "Very Low Stress", "Low Stress", "Moderate Stress",
        "High Stress" or "Very High Stress", chosen by the largest logit.
    """
    # Imported locally so the function also works when only this cell was run.
    import torch

    # Tokenize input text (max_length matches the training encodings)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Inference only: disable autograd so no computation graph is built or kept.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_label = outputs.logits.argmax(-1).item()
    # Map label index to stress level
    label_map = {
        0: "Very Low Stress",
        1: "Low Stress",
        2: "Moderate Stress",
        3: "High Stress",
        4: "Very High Stress"
    }
    return label_map[predicted_label]

# Example usage
new_text = "The sun is shining, the breeze is gentle, and everything is moving at its own pace."
print(predict_stress(new_text))  # Recorded output for this run: "Very Low Stress"

Very Low Stress
