In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

# ------------------------------
# 1. SETUP
# ------------------------------
sns.set(style="whitegrid")
# All figures produced by this script are written into this folder.
os.makedirs("visualization", exist_ok=True)

# ------------------------------
# 2. LOAD DATA
# ------------------------------
df = pd.read_csv("data/test.csv", parse_dates=["date"])
df.rename(columns={"date": "Date", "body": "Message", "from": "employee"}, inplace=True)
# Drop incomplete rows BEFORE casting to str: astype(str) turns a missing
# value into the literal text "nan", which dropna() no longer recognises
# as missing, so empty messages would otherwise be kept and analysed.
df.dropna(subset=["Message", "Date", "employee"], inplace=True)
df["Message"] = df["Message"].astype(str)

# ------------------------------
# 3. SENTIMENT LABELING
# ------------------------------
# Note: Justified thresholds based on preliminary inspection and FAQ guidance.
def get_sentiment(text):
    """Classify *text* as "Positive", "Negative" or "Neutral".

    The label is derived from the TextBlob polarity of the text:
    strictly above 0.2 is "Positive", strictly below -0.2 is "Negative",
    and everything in between (bounds included) is "Neutral".
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0.2:
        return "Positive"
    if score < -0.2:
        return "Negative"
    return "Neutral"

# Label every message with its sentiment class.
df["Sentiment"] = df["Message"].apply(get_sentiment)

# Manual verification sample (as per FAQ guidance)
print("Sample Sentiment Checks:")
# sample() raises ValueError when asked for more rows than exist, so cap
# the sample size at the number of available rows (small/empty datasets).
print(df[["Message", "Sentiment"]].sample(n=min(5, len(df))))

# ------------------------------
# 4. VISUALIZATION
# ------------------------------
# Number of emails per sentiment label, most frequent first.
sentiment_counts = df["Sentiment"].value_counts()
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn (removal announced
# for v0.14.0); mapping the x variable to `hue` with the legend disabled
# yields the same per-bar colouring without the warning.
sns.barplot(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    hue=sentiment_counts.index,
    palette="viridis",
    legend=False,
)
plt.title("Sentiment Distribution")
plt.xlabel("Sentiment")
plt.ylabel("Number of Emails")
plt.tight_layout()
plt.savefig("visualization/sentiment_distribution.png")
plt.close()

# ------------------------------
# 5. FEATURE EXTRACTION
# ------------------------------
# Character count of each message.
df["MessageLength"] = df["Message"].str.len()
# Number of whitespace-separated tokens in each message.
df["WordCount"] = df["Message"].str.split().str.len()

# ------------------------------
# 6. TF-IDF FOR PREDICTION TASK (with model comparison and feature importance)
# ------------------------------
# Turn message text into TF-IDF features: English stop words removed,
# vocabulary capped at 500 terms.
vectorizer = TfidfVectorizer(stop_words="english", max_features=500)
X = vectorizer.fit_transform(df["Message"])
y = df["MessageLength"]  # regression target: message length in characters

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, fitted and scored in this order.
models = {
    "Ridge": Ridge(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "SVR (RBF)": SVR(kernel="rbf"),
}

# Track the model with the highest R² on the hold-out set.
best_model, best_r2 = None, float("-inf")

print("\nModel Performance Comparison:")
for name, model in models.items():
    # fit() returns the fitted estimator, so fitting and predicting can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{name}: RMSE = {rmse:.2f}, R² = {r2:.3f}")
    # Strict comparison: on a tie the earlier model stays the winner.
    if r2 > best_r2:
        best_model, best_r2 = model, r2

# Visualize feature importances if available
# (only estimators exposing `feature_importances_` are plotted; for any
# other best model this section is skipped).
if hasattr(best_model, "feature_importances_"):
    importances = best_model.feature_importances_
    features = np.array(vectorizer.get_feature_names_out())
    # Positions of the 20 largest importances, ordered largest first.
    indices = np.argsort(importances)[-20:][::-1]
    plt.figure(figsize=(10, 5))
    # Passing `palette` without `hue` is deprecated in seaborn; for this
    # horizontal plot the categorical (y) variable is mapped to `hue` and
    # the legend is disabled to keep the same appearance.
    sns.barplot(
        x=importances[indices],
        y=features[indices],
        hue=features[indices],
        palette="crest",
        legend=False,
    )
    plt.title("Top 20 Feature Importances (Best Model)")
    plt.xlabel("Importance")
    plt.tight_layout()
    plt.savefig("visualization/feature_importances.png")
    plt.close()

# ------------------------------
# 7. FLIGHT RISK IDENTIFICATION
# ------------------------------
def compute_rolling_negatives(group):
    """Return the rolling 30-day count of negative emails for one employee.

    *group* must be indexed by a monotonically increasing DatetimeIndex and
    carry a boolean ``is_negative`` column. The result has one value per
    row: the number of negative emails in the 30-day window ending at that
    row's timestamp.
    """
    # Cast to int so the rolling sum is an explicit count of True flags.
    return group["is_negative"].astype(int).rolling("30D").sum()


df["is_negative"] = df["Sentiment"] == "Negative"

# Sort by employee and date, remembering the original row labels so the
# rolling counts can be written back to the matching rows of `df`.
# (The flags travel with their rows during the sort; they must NOT be
# re-assigned from the unsorted frame, which would scramble them.)
df_sorted = df.sort_values(["employee", "Date"])
sorted_index = df_sorted.index
df_sorted = df_sorted.set_index("Date")

# Cross-verifying intermediate output as per FAQ
# sort=False keeps groups in order of first appearance, which for the
# sorted frame is exactly the row order of `df_sorted`, so concatenating
# the per-employee results lines up positionally with `sorted_index`.
rolling_parts = [
    compute_rolling_negatives(group).to_numpy()
    for _, group in df_sorted.groupby("employee", sort=False)
]
rolling_values = (
    np.concatenate(rolling_parts) if rolling_parts else np.array([], dtype=float)
)
# Assigning a Series aligns on the original row labels, so each email gets
# the count computed for that same email, whatever order `df` is in.
df["RollingNegatives"] = pd.Series(rolling_values, index=sorted_index)

flight_risks = df.loc[df["RollingNegatives"] >= 4, "employee"].unique()
print("\nPotential Flight Risk Employees (≥4 negative emails in 30 days):")
print(flight_risks)

# ------------------------------
# 8. EMPLOYEE RANKING (Monthly Sentiment Scoring)
# ------------------------------
# Calendar month of each email, used as the ranking period.
df["Month"] = df["Date"].dt.to_period("M")
# Numeric score per email: +1 positive, 0 neutral, -1 negative.
df["Score"] = df["Sentiment"].map({"Positive": 1, "Neutral": 0, "Negative": -1})
# Total score per employee per month.
monthly_scores = df.groupby(["employee", "Month"])["Score"].sum().reset_index()

# Top 3 positive and negative
# Ties on score are broken alphabetically by employee in both rankings.
for month in monthly_scores["Month"].unique():
    subset = monthly_scores.loc[monthly_scores["Month"] == month]
    top_positive = subset.sort_values(
        by=["Score", "employee"], ascending=[False, True]
    ).iloc[:3]
    top_negative = subset.sort_values(
        by=["Score", "employee"], ascending=[True, True]
    ).iloc[:3]
    print(f"\nMonth: {month}")
    print("Top 3 Positive Employees:", top_positive, sep="\n")
    print("Top 3 Negative Employees:", top_negative, sep="\n")

# ------------------------------
# 9. VISUALIZATIONS - Additional Plots
# ------------------------------
# Monthly average sentiment trend
# Mean sentiment score of all emails in each month.
monthly_trend = df["Score"].groupby(df["Month"]).mean()

# Draw the trend on an explicitly created figure/axes pair.
trend_fig, trend_ax = plt.subplots(figsize=(8, 4))
monthly_trend.plot(ax=trend_ax, marker="o")
trend_ax.set_title("Average Monthly Sentiment Trend")
trend_ax.set_ylabel("Average Sentiment Score")
plt.xticks(rotation=45)
trend_fig.tight_layout()
trend_fig.savefig("visualization/monthly_sentiment_trend.png")
plt.close(trend_fig)

# Top employees with most negative messages
# The five employees who sent the most negative-labelled messages.
top_neg = df[df["Sentiment"] == "Negative"]["employee"].value_counts().head(5)
plt.figure(figsize=(6, 4))
# Passing `palette` without `hue` is deprecated in seaborn (removal announced
# for v0.14.0); mapping the x variable to `hue` with the legend disabled
# keeps the same per-bar colouring without the warning.
sns.barplot(
    x=top_neg.index,
    y=top_neg.values,
    hue=top_neg.index,
    palette="Reds_r",
    legend=False,
)
plt.title("Top Employees by Negative Messages")
plt.ylabel("Negative Message Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("visualization/top_negative_employees.png")
plt.close()


Sample Sentiment Checks:
                                                Message Sentiment
760   Your passwords are extended, or if you did not...   Neutral
1543  <http://www.txtreasure.com/>\n \nwhat do you t...   Neutral
1262  Don:\n\nThanks for your Email update yesterday...   Neutral
1528  Happy Hour tonight  @ Kenneally's @ 5:00.\n\nL...  Positive
1584  Diana,\n\nHere's info you requested.  Per our ...   Neutral

Model Performance Comparison:
Ridge: RMSE = 118.11, R² = 0.754



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")


RandomForest: RMSE = 106.37, R² = 0.801
GradientBoosting: RMSE = 113.98, R² = 0.771
SVR (RBF): RMSE = 247.23, R² = -0.076

Potential Flight Risk Employees (≥4 negative emails in 30 days):
[]

Month: 2010-01
Top 3 Positive Employees:
                     employee    Month  Score
144   lydia.delgado@enron.com  2010-01      4
168  patti.thompson@enron.com  2010-01      4
48        eric.bass@enron.com  2010-01      3
Top 3 Negative Employees:
                        employee    Month  Score
0    bobette.riner@ipgdirect.com  2010-01      0
96       johnny.palmer@enron.com  2010-01      0
192      rhonda.denton@enron.com  2010-01      0

Month: 2010-02
Top 3 Positive Employees:
                   employee    Month  Score
73    john.arnold@enron.com  2010-02      5
25   don.baughman@enron.com  2010-02      4
97  johnny.palmer@enron.com  2010-02      3
Top 3 Negative Employees:
                        employee    Month  Score
217         sally.beck@enron.com  2010-02     -1
1    bobette.riner@

  .apply(compute_rolling_negatives)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_neg.index, y=top_neg.values, palette="Reds_r")
