# In [5]:
# Final LLM Assessment Project - Employee Email Sentiment Analysis
# Author: Rushi Kadam
# Submitted to: Springer Capital
# Description: End-to-end project to analyze employee sentiment using NLP techniques.

# ===================================
# STEP 0: IMPORTS AND ENV SETUP
# ===================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from textblob import TextBlob
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

# For display
import warnings
warnings.filterwarnings("ignore")

# ===================================
# STEP 1: LOAD & PREPROCESS DATA
# ===================================

# Read the raw email export and convert its 'date' column to datetimes
# as part of the same chained expression.
file_path = "test(in).csv"
df = pd.read_csv(file_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Clean text data
def clean_text(text):
    """Normalize a raw email field before sentiment scoring.

    Lower-cases the text, strips URLs (anything starting with ``http`` up
    to the next whitespace), and removes every character that is not an
    ASCII letter or whitespace.

    Args:
        text: The raw value. Non-string values are converted with
            ``str()``; missing values (``None`` or float NaN) are treated
            as empty text.

    Returns:
        The cleaned string, or ``""`` when the value is missing.
    """
    if not isinstance(text, str):
        # str(None) / str(nan) would produce the literal words "none" /
        # "nan", which would then be scored as if the sender wrote them.
        # `text != text` is true only for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text

# Clean the body and subject separately, then join them (subject first,
# separated by a single space) into the text that will be scored.
df['clean_body'] = df['body'].map(clean_text)
df['clean_subject'] = df['Subject'].map(clean_text)
df['full_text'] = df['clean_subject'].str.cat(df['clean_body'], sep=" ")

# ===================================
# STEP 2: SENTIMENT ANALYSIS USING TEXTBLOB
# ===================================

def get_sentiment_label(polarity):
    """Map a polarity score to a sentiment category.

    Scores strictly below -0.1 are "Negative", scores strictly above 0.1
    are "Positive", and everything else (both boundaries included) is
    "Neutral".
    """
    if polarity < -0.1:
        return "Negative"
    if polarity > 0.1:
        return "Positive"
    return "Neutral"

# Score each email with TextBlob once, then split the result's polarity
# and subjectivity into their own columns and derive the category label.
sentiment = df['full_text'].map(lambda text: TextBlob(text).sentiment)
df['polarity'] = sentiment.map(lambda score: score.polarity)
df['subjectivity'] = sentiment.map(lambda score: score.subjectivity)
df['sentiment'] = df['polarity'].map(get_sentiment_label)

# ===================================
# STEP 3: EDA & VISUALIZATIONS
# ===================================

# Each chart below is drawn on its own figure and closed after saving.
# Closing (rather than only clearing) releases the figure, so no empty
# figure is left behind and later plots do not inherit this figure's size.

# Plot sentiment distribution
plt.figure()
sns.countplot(data=df, x='sentiment')
plt.title("Sentiment Distribution")
plt.savefig("sentiment_distribution.png")
plt.close()

# Monthly sentiment trend: mean polarity per calendar month.
df['month'] = df['date'].dt.to_period('M')
monthly_sentiment = df.groupby('month')['polarity'].mean().reset_index()
# Periods are converted to strings so they can be used as x tick labels.
monthly_sentiment['month'] = monthly_sentiment['month'].astype(str)

plt.figure(figsize=(10, 4))
plt.plot(monthly_sentiment['month'], monthly_sentiment['polarity'], marker='o')
plt.xticks(rotation=45)
plt.title("Monthly Sentiment Trend")
plt.tight_layout()
plt.savefig("monthly_sentiment_trend.png")
plt.close()


def _save_wordcloud(text, title, out_path):
    """Render a word cloud of ``text`` to ``out_path``.

    Returns the generated WordCloud, or None when ``text`` contains no
    usable words (WordCloud.generate raises ValueError in that case).
    """
    try:
        cloud = WordCloud(width=800, height=400).generate(text)
    except ValueError:
        print(f"Skipping '{title}': no words available.")
        return None
    plt.figure(figsize=(10, 4))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.savefig(out_path)
    plt.close()
    return cloud


# WordCloud for Positive and Negative
positive_text = " ".join(df[df['sentiment'] == 'Positive']['full_text'])
wordcloud = _save_wordcloud(
    positive_text, "WordCloud - Positive", "wordcloud_positive.png"
)

negative_text = " ".join(df[df['sentiment'] == 'Negative']['full_text'])
_save_wordcloud(
    negative_text, "WordCloud - Negative", "wordcloud_negative.png"
)

# ===================================
# STEP 4: EMPLOYEE RANKING & FLIGHT RISK
# ===================================

# Lower-case sender addresses so differently-cased spellings of the same
# address are grouped together.
df['from'] = df['from'].str.lower()

# Per-sender email volume and mean polarity, most negative senders first.
emp_sentiment = (
    df.groupby('from')['polarity']
    .agg(email_count='count', avg_sentiment='mean')
    .reset_index()
    .rename(columns={'from': 'employee'})
)
emp_sentiment_sorted = emp_sentiment.sort_values('avg_sentiment')

# Flight risk: mean polarity below -0.1 AND more than 5 emails sent.
flight_risk = emp_sentiment_sorted.loc[
    lambda table: (table['avg_sentiment'] < -0.1) & (table['email_count'] > 5)
]
flight_risk.to_csv("flight_risk_employees.csv", index=False)

# ===================================
# STEP 5: LINEAR REGRESSION FOR TREND
# ===================================

# Regress mean monthly polarity on elapsed calendar months.
#
# The regressor is the real distance in months from the earliest month,
# not the row position: a month with no emails is absent from
# monthly_sentiment, and a plain 0..n-1 counter would silently squeeze
# that gap out of the time axis and distort the fitted slope. When every
# month is present this yields exactly 0, 1, 2, ...
month_periods = pd.PeriodIndex(monthly_sentiment['month'], freq='M')
month_ordinals = month_periods.year * 12 + month_periods.month
monthly_sentiment['month_number'] = (
    month_ordinals - month_ordinals.min()
).to_numpy()

X = monthly_sentiment[['month_number']]
y = monthly_sentiment['polarity']
model = LinearRegression()
model.fit(X, y)
monthly_sentiment['trend'] = model.predict(X)

# Plot trend on a dedicated figure (instead of whatever figure happens to
# be current) and close it after saving so it is not left open.
plt.figure(figsize=(10, 4))
plt.plot(monthly_sentiment['month'], monthly_sentiment['polarity'], label='Actual')
plt.plot(monthly_sentiment['month'], monthly_sentiment['trend'], linestyle='--', label='Trend')
plt.xticks(rotation=45)
plt.legend()
plt.title("Linear Regression - Sentiment Trend")
plt.tight_layout()
plt.savefig("regression_trend.png")
plt.close()

# ===================================
# END OF NOTEBOOK - EXPORT FINAL FILES
# ===================================

# Write the fully annotated email table to disk (row index omitted),
# then report completion.
df.to_csv(
    path_or_buf="processed_sentiment_data.csv",
    index=False,
)

print("✅ Project complete. All files saved.")


# --- Captured notebook output ---
# ✅ Project complete. All files saved.
#
# <Figure size 640x480 with 0 Axes>
#
# <Figure size 1000x400 with 0 Axes>