In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
data = pd.read_csv("selected_data.csv")

In [5]:
data.head(2)

Unnamed: 0,message,Subject,From,To,body,user
0,Message-ID: <14286989.1075855666059.JavaMail.e...,Re: Enron Response to San Diego Request for Ga...,stephanie.miller@enron.com,jeff.dasovich@enron.com,Any merit to mentioning that there has been an...,allen-p
1,Message-ID: <1735118.1075855679365.JavaMail.ev...,Re: Enron Response to San Diego Request for Ga...,stephanie.miller@enron.com,jeff.dasovich@enron.com,Any merit to mentioning that there has been an...,allen-p


In [6]:
data.shape

(29026, 6)

In [7]:
#Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Text cleaning
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespaces
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = text.lower()  # Convert to lowercase
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stop word removal and lemmatization
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(filtered_tokens)
    
    return preprocessed_text

# Apply the preprocessing function to the 'body' column
data['preprocessed_body'] = data['body'].apply(preprocess_text)


In [9]:
# Step 3: Split the data into training and testing datasets
X = data['preprocessed_body']
y = data['To']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Step 4: Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
# Step 5: Train the classification model (Naive Bayes)
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)


                           precision    recall  f1-score   support

  jeff.dasovich@enron.com       0.92      0.95      0.94       833
     pete.davis@enron.com       1.00      1.00      1.00      1769
sara.shackleton@enron.com       0.84      0.86      0.85      1047
     tana.jones@enron.com       0.88      0.85      0.87      1159
        vkaminski@aol.com       0.97      0.95      0.96       998

                 accuracy                           0.93      5806
                macro avg       0.92      0.92      0.92      5806
             weighted avg       0.93      0.93      0.93      5806



In [16]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1-score: {:.2f}".format(f1))

Accuracy: 0.93
Precision: 0.93
Recall: 0.93
F1-score: 0.93


In [17]:
# Classification report
classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

                           precision    recall  f1-score   support

  jeff.dasovich@enron.com       0.92      0.95      0.94       833
     pete.davis@enron.com       1.00      1.00      1.00      1769
sara.shackleton@enron.com       0.84      0.86      0.85      1047
     tana.jones@enron.com       0.88      0.85      0.87      1159
        vkaminski@aol.com       0.97      0.95      0.96       998

                 accuracy                           0.93      5806
                macro avg       0.92      0.92      0.92      5806
             weighted avg       0.93      0.93      0.93      5806



In [35]:
# Access the 2nd 'body' and 'To' values
body_2 = data.loc[1, 'body']
recipient_2 = data.loc[1, 'To']

print("Body (2nd email):", body_200)
print("Recipient (2nd email):", recipient_200)

Body (2nd email): Any merit to mentioning that there has been an initial "supply" response in 
terms of pipeline infrastructure - open seasons/expansion efforts on behalf 
of Kern River, Transwestern and PGT (not yet announced)? 


From: Jeff Dasovich on 12/13/2000 12:04 PM
Sent by: Jeff Dasovich
To: Sarah Novosel/Corp/Enron@ENRON
cc: Christi L Nicolay/HOU/ECT@ECT, James D Steffes/NA/Enron@ENRON, Joe 
Hartsoe/Corp/Enron@ENRON, Mary Hain/HOU/ECT@ECT, pallen@enron.com, 
pkaufma@enron.com, Richard B Sanders/HOU/ECT@ECT, Richard 
Shapiro/NA/Enron@ENRON, Stephanie Miller/Corp/Enron@ENRON, Steven J 
Kean/NA/Enron@ENRON, Susan J Mara/NA/Enron@ENRON 

Subject: Re: Enron Response to San Diego Request for Gas Price Caps  

Recognizing the time constraints you face, I've tried to 1) clear up a few 
inaccuracies and 2) massage some of the sharper language without taking a 
chainsaw to the otherwise good job.  



Recipient (2nd email): jeff.dasovich@enron.com


In [36]:
# Preprocess the body of the 2nd email
preprocessed_body_2 = preprocess_text(body_2)

# Vectorize the preprocessed body using the TF-IDF vectorizer
body_2_tfidf = vectorizer.transform([preprocessed_body_2])

# Predict the recipient using the trained classifier
predicted_recipient_2 = clf.predict(body_2_tfidf)


In [37]:
print("Predicted Recipient (2nd email):", predicted_recipient_2)


Predicted Recipient (2nd email): ['jeff.dasovich@enron.com']


In [42]:
#  Preprocess the sample email
sample_email = ["I hope this email finds you well. I wanted to schedule a meeting to discuss the upcoming project. Please let me know your availability. Thank you."]  # Replace with your sample email
preprocessed_sample = [preprocess_text(email) for email in sample_email]
sample_tfidf = vectorizer.transform(preprocessed_sample)

# Step 2: Predict the recipient
predicted_recipient = clf.predict(sample_tfidf)

# Step 3: Print the predicted recipient
print("Predicted Recipient:", predicted_recipient)

Predicted Recipient: ['sara.shackleton@enron.com']


In [44]:
recipient = "sara.shackleton@enron.com"
num_emails = 5

# Filter the emails sent to the specified recipient
filtered_data = data[data['To'] == recipient]

# Get the bodies of the filtered emails
email_bodies = filtered_data['body'].tolist()

# Print the first 5 email bodies
for i, body in enumerate(email_bodies[:num_emails]):
    print("Email", i+1, "Body:")
    print(body)
    print("----------------------------------")


Email 1 Body:

Sara,

Please advise if I may be of assistance -- I believe this request requires a lawyers attention.

Susan 
 -----Original Message-----
From: 	Newbrough, Jennifer  
Sent:	Thursday, January 24, 2002 9:53 AM
To:	Bailey, Susan; Shackleton, Sara
Subject:	RE: Forest Products Trading Counterparties

I have attached a file which shows the information we would like to send to International Paper regarding counterparties.  It includes a description of the company (but not the name), the % of total tons traded and the credit rating of the counterparty.  Can you let me know if you think this information can be given to a third party under the confidentiality provisions mentioned below?  Please let me know.

Thanks,
Jennifer

 

 -----Original Message-----
From: 	Bailey, Susan  
Sent:	Friday, January 18, 2002 4:53 PM
To:	Newbrough, Jennifer
Cc:	Shackleton, Sara
Subject:	RE: Forest Products Trading Counterparties

Jennifer,

I have completed my review of the selected Forest Produc