In [20]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# ------------------------------
# 1. SETUP
# ------------------------------
sns.set(style="whitegrid")
os.makedirs("visualization", exist_ok=True)

# ------------------------------
# 2. LOAD DATA
# ------------------------------
df = pd.read_csv("data/test.csv", parse_dates=["date"])
df.rename(columns={"date": "Date", "body": "Message", "from": "employee"}, inplace=True)
df['Message'] = df['Message'].astype(str)
df.dropna(subset=["Message", "Date", "employee"], inplace=True)

# ------------------------------
# 3. SENTIMENT LABELING
# ------------------------------
def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return "Positive"
    elif polarity < -0.1:
        return "Negative"
    else:
        return "Neutral"

df["Sentiment"] = df["Message"].apply(get_sentiment)

# ------------------------------
# 4. VISUALIZATION - Sentiment Distribution
# ------------------------------
sentiment_counts = df["Sentiment"].value_counts()
plt.figure(figsize=(6, 4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title("Sentiment Distribution")
plt.xlabel("Sentiment")
plt.ylabel("Number of Emails")
plt.tight_layout()
plt.savefig("visualization/sentiment_distribution.png")
plt.close()

# ------------------------------
# 5. MONTHLY SENTIMENT SCORE
# ------------------------------
df["Month"] = df["Date"].dt.to_period("M")
monthly_sentiment = df.groupby(["Month", "Sentiment"]).size().unstack().fillna(0)
monthly_sentiment.plot(kind="bar", stacked=True, figsize=(10, 6), colormap="Set2")
plt.title("Monthly Sentiment Volume")
plt.ylabel("Number of Emails")
plt.tight_layout()
plt.savefig("visualization/monthly_sentiment.png")
plt.close()

# ------------------------------
# 6. EMPLOYEE RANKING BY NEGATIVE EMAILS
# ------------------------------
top_negative = df[df["Sentiment"] == "Negative"]["employee"].value_counts().head(10)
plt.figure(figsize=(8, 4))
sns.barplot(y=top_negative.index, x=top_negative.values, palette="Reds_r")
plt.title("Top 10 Employees by Negative Emails")
plt.xlabel("Count")
plt.tight_layout()
plt.savefig("visualization/top_negative_employees.png")
plt.close()

# ------------------------------
# 7. LINEAR REGRESSION - Predict Message Length
# ------------------------------
df["MessageLength"] = df["Message"].apply(len)
vectorizer = TfidfVectorizer(stop_words="english", max_features=500)
X = vectorizer.fit_transform(df["Message"])
y = df["MessageLength"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = Ridge()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Model Performance:")
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))

# ------------------------------
# 8. FLIGHT RISK IDENTIFICATION
# ------------------------------
df["is_negative"] = df["Sentiment"] == "Negative"
df_sorted = df.sort_values(["employee", "Date"]).set_index("Date")

def compute_rolling_negatives(group):
    return group["is_negative"].rolling("30D").sum()

df["RollingNegatives"] = (
    df_sorted.groupby("employee", group_keys=False)
    .apply(compute_rolling_negatives)
    .reset_index(drop=True)
)

flight_risks = df[df["RollingNegatives"] >= 4]["employee"].unique()
print("\nPotential Flight Risk Employees (≥4 negative emails in 30 days):")
print(flight_risks)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")


Model Performance:
RMSE: 118.11317541205368
R² Score: 0.7544797953455403

Potential Flight Risk Employees (≥4 negative emails in 30 days):
['lydia.delgado@enron.com' 'don.baughman@enron.com'
 'patti.thompson@enron.com' 'rhonda.denton@enron.com'
 'bobette.riner@ipgdirect.com' 'johnny.palmer@enron.com'
 'sally.beck@enron.com' 'eric.bass@enron.com' 'john.arnold@enron.com']



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y=top_negative.index, x=top_negative.values, palette="Reds_r")
  .apply(compute_rolling_negatives)
