In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the sentiment-labeled data
df = pd.read_csv('test_with_sentiments.csv')
df['date'] = pd.to_datetime(df['date'])

# Encode each label as a signed score. Labels outside this mapping become NaN,
# which pandas skips by default when the scores are summed.
df['score'] = df['sentiment'].map({'Positive': 1, 'Negative': -1, 'Neutral': 0})

# Count whitespace-separated words per message. Missing bodies are replaced by
# an empty string first: astype(str) alone would turn NaN into the literal
# text 'nan' and report it as a one-word message.
df['word_count'] = df['body'].fillna('').astype(str).str.split().str.len()

# --- Feature Engineering ---
# Aggregate data by employee and calendar month. The month-end frequency is
# given as an offset object rather than the 'M' string alias: that alias is
# deprecated in recent pandas releases (in favour of 'ME') and emits a
# FutureWarning, while the offset object produces the same month-end bins on
# both old and new versions.
month_end_bins = pd.Grouper(key='date', freq=pd.offsets.MonthEnd())
monthly_agg = df.groupby(['from', month_end_bins]).agg(
    monthly_sentiment_score=('score', 'sum'),
    message_count=('body', 'count'),
    total_word_count=('word_count', 'sum'),
    negative_message_count=('sentiment', lambda labels: (labels == 'Negative').sum()),
).reset_index()

# Calculate final features
# 'count' ignores missing bodies, so message_count can be 0 for a group whose
# bodies are all missing. Dividing a non-zero numerator by 0 would give inf,
# which fillna() does not replace; masking zero denominators to NaN makes
# those ratios fall through to the 0 fill below instead.
valid_message_count = monthly_agg['message_count'].where(monthly_agg['message_count'] > 0)
monthly_agg['average_word_count'] = monthly_agg['total_word_count'] / valid_message_count
monthly_agg['negative_message_ratio'] = monthly_agg['negative_message_count'] / valid_message_count
monthly_agg = monthly_agg.fillna(0)

# --- Model Building ---
# Predictor columns and regression target, both taken from the monthly table.
# NOTE(review): negative_message_ratio and message_count are derived from the
# same sentiment labels that are summed into monthly_sentiment_score, so the
# fit may largely reflect that arithmetic relationship - confirm this is the
# intended setup.
features = ['message_count', 'average_word_count', 'negative_message_ratio']
y = monthly_agg['monthly_sentiment_score']
X = monthly_agg[features]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# --- Model Evaluation ---
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Emit the metric summary and the coefficient heading in a single write; the
# joined text is identical to printing each line separately.
summary_lines = [
    "\n## Predictive Model Results ##",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R2) Score: {r2:.2f}",
    "\nModel Coefficients (Feature Importance):",
]
print("\n".join(summary_lines))

# One line per feature, paired with its fitted coefficient in column order.
for feature, coef in zip(features, model.coef_):
    print(f"- {feature}: {coef:.2f}")


## Predictive Model Results ##
Mean Squared Error (MSE): 2.34
R-squared (R2) Score: 0.53

Model Coefficients (Feature Importance):
- message_count: 0.23
- average_word_count: 0.01
- negative_message_ratio: -4.78


  monthly_agg = df.groupby(['from', pd.Grouper(key='date', freq='M')]).agg(
