# Employee Sentiment Analysis Project

## Project Overview
This notebook implements a comprehensive sentiment analysis system for employee messages. The project includes:
1. **Sentiment Labeling**: Automatically label messages as Positive, Negative, or Neutral
2. **Exploratory Data Analysis (EDA)**: Analyze and visualize data patterns
3. **Employee Score Calculation**: Compute monthly sentiment scores
4. **Employee Ranking**: Identify top positive and negative employees
5. **Flight Risk Identification**: Detect employees at risk of leaving
6. **Predictive Modeling**: Build a linear regression model to predict sentiment scores

---


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os
from datetime import datetime

# Add src directory to path
# The notebook normally runs from notebooks/ (src is at ../src), but the kernel
# may also be started from the project root (src is at ./src), so try both.
notebook_dir = os.getcwd()
# Parent of the working directory; this is the project root when the notebook
# is run from notebooks/.
project_root = os.path.dirname(notebook_dir)
candidate_src_paths = [
    os.path.join(project_root, 'src'),   # kernel started in notebooks/
    os.path.join(notebook_dir, 'src'),   # kernel started in the project root
]

# Fall back to the first candidate so the diagnostics below always have a
# concrete path to report, even when no candidate exists.
src_path = next(
    (path for path in candidate_src_paths if os.path.isdir(path)),
    candidate_src_paths[0],
)

# Insert the absolute path only once: re-running this cell must not stack
# duplicate entries on sys.path.
if os.path.isdir(src_path) and src_path not in sys.path:
    sys.path.insert(0, src_path)

# Import custom modules
try:
    from sentiment_labeling import SentimentLabeler
    from eda import EDAAnalyzer
    from scoring import EmployeeScorer
    from flight_risk import FlightRiskAnalyzer
    from modeling import SentimentPredictor
    print("✓ Custom modules imported successfully")
except ImportError as e:
    # Print enough context to diagnose a path problem, then fail loudly:
    # nothing below can run without these modules.
    print(f"✗ Error importing custom modules: {e}")
    print(f"  Current directory: {os.getcwd()}")
    print(f"  Tried src path: {src_path}")
    print(f"  Candidate src paths: {candidate_src_paths}")
    print(f"  Python path: {sys.path}")
    raise

# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✓ All libraries imported successfully!")
print(f"  Working directory: {os.getcwd()}")
print(f"  Source path: {src_path if os.path.exists(src_path) else 'Not found'}")


## Task 1: Data Loading and Initial Exploration

First, let's load the dataset and examine its structure.


In [None]:
# Load the dataset
data_path = os.path.join('..', 'data', 'test.csv')

# Only the read itself is guarded. On failure we print a hint and re-raise,
# because every later cell needs `df`; swallowing the error here would only
# surface later as an unrelated NameError.
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: File not found at {data_path}")
    print("Please ensure test.csv is in the data/ directory")
    raise
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise
else:
    # Quick structural overview of what was loaded.
    print("Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print("\nFirst few rows:")
    print(df.head())
    print(f"\nColumn names: {df.columns.tolist()}")
    print("\nData types:")
    print(df.dtypes)
    print("\nMissing values:")
    print(df.isnull().sum())


## Task 2: Sentiment Labeling

**Objective**: Label each employee message with one of three sentiment categories: Positive, Negative, or Neutral.

**Approach**: We'll use VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analyzer, which is well-suited for social media text and short messages. VADER is fast, doesn't require training data, and works well with informal text.

**Alternative**: We could also use transformer-based models (like RoBERTa) for potentially better accuracy, but VADER is faster and sufficient for this task.


In [None]:
# Build the labeler with the VADER backend on CPU.
# (Per the project notes, method='transformer' may be more accurate but
# requires a GPU.)
labeler = SentimentLabeler(method='vader', use_gpu=False)

# Run the labeler over the 'message' column of the loaded data
df_labeled = labeler.label_dataframe(df, text_column='message')

rule = "=" * 60
sentiment_column = df_labeled['sentiment']

# Report label counts and their share of the dataset
print(f"\n{rule}")
print("SENTIMENT LABELING RESULTS")
print(rule)
print(f"\nTotal messages labeled: {len(df_labeled)}")
print("\nSentiment distribution:")
print(sentiment_column.value_counts())
print("\nSentiment percentages:")
print(sentiment_column.value_counts(normalize=True) * 100)

# Preview the first ten labeled rows
print(f"\n{rule}")
print("SAMPLE LABELED MESSAGES")
print(rule)
preview_columns = ['employee', 'date', 'message', 'sentiment']
sample_df = df_labeled[preview_columns].head(10)
print(sample_df.to_string(index=False))


## Task 3: Exploratory Data Analysis (EDA)

**Objective**: Understand the structure, distribution, and trends in the dataset through thorough exploration.

**Key Areas of Analysis**:
- Overall data structure and quality
- Sentiment distribution across the dataset
- Temporal trends (over time)
- Employee activity patterns
- Message characteristics


In [None]:
# Wrap the labeled data in the EDA helper
eda = EDAAnalyzer(df_labeled)

# Section banner, then the analyzer's basic dataset overview
rule = "=" * 60
print(rule)
print("EXPLORATORY DATA ANALYSIS")
print(rule)
basic_info = eda.basic_info()


In [None]:
# Sentiment distribution: print the banner, then ask the analyzer for the
# distribution, passing the PNG location as save_path
distribution_plot_path = os.path.join(
    '..', 'visualizations', 'sentiment_distribution.png'
)
rule = "=" * 60
print(f"\n{rule}")
print("SENTIMENT DISTRIBUTION ANALYSIS")
print(rule)
sentiment_counts = eda.sentiment_distribution(save_path=distribution_plot_path)


In [None]:
# Temporal trends: print the banner, then run the analyzer's trend analysis,
# passing the PNG location as save_path
trends_plot_path = os.path.join('..', 'visualizations', 'sentiment_trends.png')
rule = "=" * 60
print(f"\n{rule}")
print("TEMPORAL TRENDS ANALYSIS")
print(rule)
eda.temporal_trends(save_path=trends_plot_path)


In [None]:
# Employee activity: print the banner, then run the analyzer's activity
# analysis with top_n=10, passing the PNG location as save_path
activity_plot_path = os.path.join('..', 'visualizations', 'employee_activity.png')
rule = "=" * 60
print(f"\n{rule}")
print("EMPLOYEE ACTIVITY ANALYSIS")
print(rule)
eda.employee_activity(top_n=10, save_path=activity_plot_path)


In [None]:
# Message length: print the banner, then run the analyzer's length analysis,
# passing the PNG location as save_path
length_plot_path = os.path.join(
    '..', 'visualizations', 'message_length_analysis.png'
)
rule = "=" * 60
print(f"\n{rule}")
print("MESSAGE LENGTH ANALYSIS")
print(rule)
eda.message_length_analysis(save_path=length_plot_path)


In [None]:
# Print the analyzer's summary report, one "key: value" entry per paragraph
rule = "=" * 60
print(f"\n{rule}")
print("EDA SUMMARY REPORT")
print(rule)
summary = eda.generate_summary_report()
for key, value in summary.items():
    entry = f"\n{key}: {value}"
    print(entry)


## Task 4: Employee Score Calculation

**Objective**: Compute a monthly sentiment score for each employee based on their messages.

**Scoring System**:
- Positive Message: +1
- Negative Message: -1
- Neutral Message: 0 (no effect)

**Method**: Scores are aggregated on a monthly basis for each employee, resetting at the beginning of each new month.


In [None]:
# Build the scorer from the labeled messages
scorer = EmployeeScorer(df_labeled)

# Banner first, then compute the per-employee monthly scores
rule = "=" * 60
print(rule)
print("EMPLOYEE SCORE CALCULATION")
print(rule)
monthly_scores = scorer.calculate_scores()

# One row per employee-month combination
combination_count = len(monthly_scores)
print(f"\nMonthly scores calculated for {combination_count} employee-month combinations")

# Preview of the first 20 rows
print("\nSample monthly scores:")
score_preview = monthly_scores.head(20)
print(score_preview.to_string(index=False))

# Descriptive statistics of the score column
print("\nScore statistics:")
print(monthly_scores['monthly_score'].describe())


In [None]:
# Print every employee's monthly scores, grouped under the employee's name
rule = "=" * 60
print(f"\n{rule}")
print("MONTHLY SCORES BY EMPLOYEE")
print(rule)
employee_col = monthly_scores.columns[0]  # first column is taken as the employee identifier

for employee in monthly_scores[employee_col].unique():
    belongs_to_employee = monthly_scores[employee_col] == employee
    emp_scores = monthly_scores[belongs_to_employee]
    print(f"\n{employee}:")
    for _, row in emp_scores.iterrows():
        # The count columns are optional; show 'N/A' when one is absent
        messages = row.get('message_count', 'N/A')
        positives = row.get('positive_count', 'N/A')
        negatives = row.get('negative_count', 'N/A')
        print(
            f"  {row['year_month']}: Score = {row['monthly_score']} "
            f"(Messages: {messages}, "
            f"Positive: {positives}, "
            f"Negative: {negatives})"
        )


## Task 5: Employee Ranking

**Objective**: Generate ranked lists of employees based on their monthly sentiment scores.

**Requirements**:
- Top Three Positive Employees: Highest positive scores per month
- Top Three Negative Employees: Lowest (most negative) scores per month
- Sorted in descending order by score, then alphabetically


In [None]:
# Function to get top employees per month
def get_top_employees(monthly_scores, top_n=3, positive=True):
    """
    Get the top N employees per month based on sentiment scores.

    Employees are selected per month by score, with ties at the cut-off
    broken alphabetically by employee name (not by row order). Each month's
    rows are then listed in descending score order, then alphabetically.
    Months appear in order of first appearance in ``monthly_scores``.

    Parameters:
    -----------
    monthly_scores : pd.DataFrame
        DataFrame with monthly scores. The first column is taken as the
        employee identifier; 'year_month' and 'monthly_score' are required.
        'message_count', 'positive_count' and 'negative_count' are optional.
    top_n : int
        Number of top employees to return per month
    positive : bool
        If True, return top positive (highest scores); if False, return top
        negative (lowest scores)

    Returns:
    --------
    pd.DataFrame
        Columns: 'year_month', the employee column, 'monthly_score',
        'message_count', 'positive_count', 'negative_count'. Optional
        columns missing from the input are filled with 'N/A'. The columns
        are present even when the result has no rows.
    """
    employee_col = monthly_scores.columns[0]
    optional_cols = ['message_count', 'positive_count', 'negative_count']
    out_cols = ['year_month', employee_col, 'monthly_score'] + optional_cols

    if monthly_scores.empty or top_n <= 0:
        return pd.DataFrame(columns=out_cols)

    # Selection order: best score first (highest for positive, lowest for
    # negative), alphabetical within equal scores, so head(top_n) resolves
    # ties at the cut-off deterministically.
    ranked = monthly_scores.sort_values(
        ['monthly_score', employee_col],
        ascending=[not positive, True],
    )

    frames = []
    for year_month in monthly_scores['year_month'].unique():
        selected = ranked[ranked['year_month'] == year_month].head(top_n)
        if selected.empty:
            continue
        # Presentation order: score descending, then alphabetically.
        frames.append(
            selected.sort_values(
                ['monthly_score', employee_col],
                ascending=[False, True],
            )
        )

    if not frames:
        return pd.DataFrame(columns=out_cols)

    result = pd.concat(frames, ignore_index=True)
    for col in optional_cols:
        if col not in result.columns:
            result[col] = 'N/A'

    return result[out_cols]

rule = "=" * 60

# Three highest-scoring employees for each month
print(rule)
print("TOP THREE POSITIVE EMPLOYEES BY MONTH")
print(rule)
top_positive = get_top_employees(monthly_scores, top_n=3, positive=True)
positive_table = top_positive.to_string(index=False)
print(positive_table)

# Three lowest-scoring employees for each month
print(f"\n{rule}")
print("TOP THREE NEGATIVE EMPLOYEES BY MONTH")
print(rule)
top_negative = get_top_employees(monthly_scores, top_n=3, positive=False)
negative_table = top_negative.to_string(index=False)
print(negative_table)


In [None]:
# Visualize employee rankings: one panel for the top positive scores and one
# for the top negative scores, with one bar per (rank, month).
TOP_N = 3
employee_col = monthly_scores.columns[0]
months = list(monthly_scores['year_month'].unique())

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# All months share the same rank groups on the y axis. Give every month its
# own slot inside a group so later months do not paint over earlier ones.
bar_height = 0.8 / max(len(months), 1)
group_center_offset = bar_height * max(len(months) - 1, 0) / 2
rank_positions = np.arange(TOP_N)

panels = [
    # (axes, title, sort scores ascending?)
    (axes[0], 'Top 3 Positive Employees by Month', False),
    (axes[1], 'Top 3 Negative Employees by Month', True),
]

for ax, title, lowest_first in panels:
    for slot, year_month in enumerate(months):
        month_data = monthly_scores[monthly_scores['year_month'] == year_month]
        # Rank by score, breaking ties alphabetically by employee, then keep
        # the first TOP_N rows.
        ranked = month_data.sort_values(
            ['monthly_score', employee_col],
            ascending=[lowest_first, True],
        ).head(TOP_N)

        if ranked.empty:
            continue

        ax.barh(
            np.arange(len(ranked)) + slot * bar_height,
            ranked['monthly_score'].values,
            height=bar_height,
            label=str(year_month),
        )

    # Label each rank group once, centred on its bars, with rank 1 on top.
    ax.set_yticks(rank_positions + group_center_offset)
    ax.set_yticklabels([f"#{rank}" for rank in range(1, TOP_N + 1)])
    ax.invert_yaxis()
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Monthly Score', fontsize=12)
    ax.set_ylabel('Rank', fontsize=12)
    ax.grid(axis='x', alpha=0.3)

# Both panels draw the months in the same order, so their colors match and a
# single legend is enough. It sits outside the right-hand panel because an
# outside legend on the left panel would cover the right one.
if months:
    axes[1].legend(title='Month', bbox_to_anchor=(1.05, 1), loc='upper left')

# Make sure the output directory exists before saving the figure.
rankings_path = os.path.join('..', 'visualizations', 'employee_rankings.png')
os.makedirs(os.path.dirname(rankings_path), exist_ok=True)

plt.tight_layout()
plt.savefig(rankings_path, dpi=300, bbox_inches='tight')
print("Ranking visualization saved!")
plt.show()


## Task 6: Flight Risk Identification

**Objective**: Identify employees who are at risk of leaving based on how frequently they send negative messages.

**Criteria**: A flight risk is any employee who has sent 4 or more negative messages within any rolling 30-day period, regardless of calendar-month boundaries.

**Method**: We use a rolling window approach to check for 30-day periods where an employee has 4+ negative messages.


In [None]:
# Build the analyzer from the labeled messages
flight_risk_analyzer = FlightRiskAnalyzer(df_labeled)

rule = "=" * 60
# The same criteria are used for identification and for the summary
risk_criteria = {'threshold': 4, 'window_days': 30}

print(rule)
print("FLIGHT RISK IDENTIFICATION")
print(rule)
flight_risks = flight_risk_analyzer.identify_flight_risks(**risk_criteria)

if len(flight_risks) == 0:
    print("\nNo employees identified as flight risks.")
    print("(No employee has 4+ negative messages in a 30-day rolling window)")
else:
    print(f"\nFound {len(flight_risks)} employee(s) at flight risk:")
    print("\n" + flight_risks.to_string(index=False))

    # Print the analyzer's summary as "key: value" lines
    summary = flight_risk_analyzer.get_flight_risk_summary(**risk_criteria)
    print(f"\n{rule}")
    print("FLIGHT RISK SUMMARY")
    print(rule)
    for key, value in summary.items():
        print(f"{key}: {value}")


In [None]:
# Keep the flagged employees as a plain list for the final report.
# Start from an empty list and fill it only when something was flagged.
flight_risk_employees = []
if len(flight_risks) > 0:
    employee_col = flight_risks.columns[0]  # first column is taken as the employee identifier
    flight_risk_employees = flight_risks[employee_col].tolist()
    print(f"\nFlight Risk Employees List: {flight_risk_employees}")
else:
    print("\nNo flight risk employees identified.")


## Task 7: Predictive Modeling

**Objective**: Develop a linear regression model to analyze sentiment trends and predict sentiment scores.

**Features Used**:
- Message frequency in a month
- Average message length
- Total message length
- Average word count
- Total word count
- Positive/negative/neutral ratios
- Month (temporal feature)

**Evaluation**: We'll use train-test split and evaluate using MSE, RMSE, MAE, and R² metrics.


In [None]:
# Build the predictor from the labeled messages and the monthly scores
predictor = SentimentPredictor(df_labeled, monthly_scores)

rule = "=" * 60

# Train with a fixed split and seed
print(rule)
print("PREDICTIVE MODELING - LINEAR REGRESSION")
print(rule)
print("\nBuilding model...")
metrics = predictor.build_model(test_size=0.2, random_state=42)

# Report the metrics for the training split, then for the test split
print(f"\n{rule}")
print("MODEL PERFORMANCE METRICS")
print(rule)
metric_sections = (
    ("\nTraining Set Metrics:", 'train'),
    ("\nTest Set Metrics:", 'test'),
)
for heading, split in metric_sections:
    print(heading)
    for metric, value in metrics[split].items():
        print(f"  {metric.upper()}: {value:.4f}")

# Feature importances, largest first
print(f"\n{rule}")
print("FEATURE IMPORTANCE")
print(rule)
importance_by_feature = pd.Series(metrics['feature_importance'])
feature_importance = importance_by_feature.sort_values(ascending=False)
for feature, importance in feature_importance.items():
    print(f"  {feature}: {importance:.4f}")


In [None]:
# Model performance: print the banner, then ask the predictor for its
# performance plot, passing the PNG location as save_path
performance_plot_path = os.path.join('..', 'visualizations', 'model_performance.png')
rule = "=" * 60
print(f"\n{rule}")
print("MODEL PERFORMANCE VISUALIZATION")
print(rule)
predictor.plot_model_performance(save_path=performance_plot_path)


In [None]:
# Fetch the fitted model's summary and print its parts
model_summary = predictor.get_model_summary()

rule = "=" * 60
print(f"\n{rule}")
print("MODEL SUMMARY")
print(rule)

intercept = model_summary['intercept']
print(f"Intercept: {intercept:.4f}")
print(f"\nNumber of features: {model_summary['n_features']}")
print(f"Training samples: {model_summary['n_train_samples']}")
print(f"Test samples: {model_summary['n_test_samples']}")

# One line per feature coefficient
print("\nCoefficients:")
coefficients = model_summary['coefficients']
for feature, coef in coefficients.items():
    print(f"  {feature}: {coef:.4f}")


## Summary and Key Findings

Let's compile the key findings from our analysis:


In [None]:
# Compile final summary
rule = "=" * 60
print(rule)
print("FINAL PROJECT SUMMARY")
print(rule)

# Sum each employee's monthly scores once; both rankings read from it
employee_col = monthly_scores.columns[0]
total_by_employee = monthly_scores.groupby(employee_col)['monthly_score'].sum()
overall_positive = total_by_employee.nlargest(3)
overall_negative = total_by_employee.nsmallest(3)

overall_rankings = (
    ("\nTop 3 Positive Employees (Overall):", overall_positive),
    ("\nTop 3 Negative Employees (Overall):", overall_negative),
)
for heading, ranking in overall_rankings:
    print(heading)
    for i, (employee, score) in enumerate(ranking.items(), 1):
        print(f"  {i}. {employee}: {score}")

# Flight risks
print(f"\nFlight Risk Employees: {len(flight_risk_employees)}")
if not flight_risk_employees:
    print("  None identified")
else:
    for employee in flight_risk_employees:
        print(f"  - {employee}")

# Model performance
train_r2 = metrics['train']['r2']
test_r2 = metrics['test']['r2']
print("\nModel Performance (R² Score):")
print(f"  Training: {train_r2:.4f}")
print(f"  Test: {test_r2:.4f}")

# Sentiment distribution with each label's share of all messages
sentiment_dist = df_labeled['sentiment'].value_counts()
print("\nOverall Sentiment Distribution:")
for sentiment, count in sentiment_dist.items():
    pct = (count / len(df_labeled)) * 100
    print(f"  {sentiment}: {count} ({pct:.1f}%)")

print(f"\n{rule}")
print("Analysis Complete!")
print(rule)


In [None]:
# Write the analysis outputs as CSV files under ../data
data_dir = os.path.join('..', 'data')

# Labeled messages
output_path = os.path.join(data_dir, 'processed_data.csv')
df_labeled.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")

# Per-employee monthly scores
scores_path = os.path.join(data_dir, 'monthly_scores.csv')
monthly_scores.to_csv(scores_path, index=False)
print(f"Monthly scores saved to {scores_path}")

# Flight risks are written only when at least one was identified
has_flight_risks = len(flight_risks) > 0
if has_flight_risks:
    risks_path = os.path.join(data_dir, 'flight_risks.csv')
    flight_risks.to_csv(risks_path, index=False)
    print(f"Flight risks saved to {risks_path}")
