# Part 1

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import normalize 

from ml_helpers import * 

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Loading data

In [2]:
# Read the semicolon-separated dataset and preview its first rows.
# NOTE(review): the traceback recorded under this cell and the Part 2 loader both
# name 'giskard_dataset.csv' -- confirm which file name is the correct one.
df = pd.read_csv('ml_dataset.csv', sep=';')
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'giskard_dataset.csv'

# Let's create a new DataFrame with the data we need.

In [None]:
# Run the project helper over the Message column and wrap its result in a DataFrame.
parsed_emails = parse_into_emails(df["Message"])
email_df = pd.DataFrame(parsed_emails)
email_df

# Drop emails with empty body, to or from_ columns

In [None]:
# Flag every row whose body, to or from_ column is an empty string ...
has_empty_field = (
    (email_df["body"] == "")
    | (email_df["to"] == "")
    | (email_df["from_"] == "")
)
# ... and remove those rows from email_df in place (the index is not reset).
email_df.drop(email_df[has_empty_field].index, inplace=True)
email_df

# We are going to tokenize the bodies and convert them into a document-term matrix.

In [None]:
# Extend scikit-learn's English stop words with corpus-specific tokens.
# Recent scikit-learn releases validate `stop_words` as 'english', a list or None,
# so the frozenset union is converted to a (sorted, hence deterministic) list.
stopwords = sorted(ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient']))

# Float max_df/min_df are document-frequency proportions.
# NOTE(review): min_df=0.2 keeps only terms found in at least 20% of the emails,
# whereas Part 2 uses min_df=2 (an absolute count) -- confirm this is intended.
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.4, min_df=0.2)

# Fit on the email bodies and build the document-term matrix.
X = vect.fit_transform(email_df.body)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is
# its replacement. .tolist() keeps `features` a plain list of str, as before.
features = vect.get_feature_names_out().tolist()
len(features)

# Let's print the top 10 terms in document 1

In [None]:
# Ask the helper for the top terms of document 1 (arguments: matrix, vocabulary, 1, 10)
# and print the result as plain text.
doc_top_feats = top_feats_in_doc(X, features, 1, 10)
print(doc_top_feats)

# Now we print the top terms across all documents.

In [None]:
# Ask the helper for the top terms over the whole matrix (arguments: matrix,
# vocabulary, None, 0.1, 10) and print the result as plain text.
overall_top_feats = top_mean_feats(X, features, None, 0.1, 10)
print(overall_top_feats)

# As a clustering algorithm, KMeans is a good fit

In [None]:
n_clusters = 3

# k-means++ initialisation is random and n_init=1 performs a single run, so without
# a fixed seed the cluster assignment (and every plot built from `labels`) changes
# on each execution. random_state makes the notebook reproducible.
clf = KMeans(
    n_clusters=n_clusters,
    max_iter=100,
    init='k-means++',
    n_init=1,
    random_state=42,
)
labels = clf.fit_predict(X)

# For larger datasets use mini-batch KMeans, so we don't have to read all data into memory.
# batch_size = 500
# clf = MiniBatchKMeans(n_clusters=n_clusters, init_size=1000, batch_size=batch_size, max_iter=100)
# clf.fit(X)

# Let's plot this with matplotlib to visualize it

In [None]:
# First we need to make 2D coordinates from the sparse matrix.
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which recent
# scikit-learn input validation rejects with a TypeError.
X_dense = X.toarray()
pca = PCA(n_components=2).fit(X_dense)
coords = pca.transform(X_dense)

# Plot every email as one point in the 2D PCA space (single colour).
plt.scatter(coords[:, 0], coords[:, 1], c='m')
plt.show()

# Let's plot it again, but this time we add some color to it.

In [None]:
# Palette of eight hex colours, indexed by cluster label.
label_colors = [
    "#2AB0E9", "#2BAF74", "#D7665E", "#CCCCCC",
    "#D2CA0D", "#522A64", "#A3DB05", "#FC6514",
]

# Look up the palette entry for each point's cluster label.
colors = []
for cluster_id in labels:
    colors.append(label_colors[cluster_id])

# Same scatter as before, now coloured per cluster.
plt.scatter(coords[:, 0], coords[:, 1], c=colors)
plt.show()

# Cluster Centers

In [None]:
# Project the fitted cluster centres with the already-fitted PCA and plot them
# as large 'X' markers.
centroids = clf.cluster_centers_
centroid_coords = pca.transform(centroids)
centroid_x, centroid_y = centroid_coords[:, 0], centroid_coords[:, 1]
plt.scatter(
    centroid_x,
    centroid_y,
    marker='X',
    s=200,
    linewidths=2,
    c='#444d60',
)
plt.show()

# Use this to print the top terms per cluster with matplotlib.

In [None]:
# Collect the per-cluster top features from the helper (arguments: matrix, labels,
# vocabulary, 0.1, 25), then hand them to the plotting helper.
cluster_top_feats = top_feats_per_cluster(X, labels, features, 0.1, 25)
plot_tfidf_classfeats_h(cluster_top_feats)

# Part 2

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd

from ml_helpers import parse_into_emails
from ml_query import EmailDataset

# Just like in Part 1, read and preprocess the emails

In [None]:
# Load the raw dataset and parse each message via the project helper.
# NOTE(review): Part 1 reads 'ml_dataset.csv' -- confirm which file name is correct.
emails = pd.read_csv('giskard_dataset.csv', delimiter=';')
email_df = pd.DataFrame(parse_into_emails(emails.Message))
# Remove rows whose body, to or from_ column is empty. The index is NOT reset, so
# index labels no longer equal row positions afterwards.
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

# Recent scikit-learn releases validate `stop_words` as 'english', a list or None,
# so the frozenset union is converted to a (sorted, hence deterministic) list.
stopwords = sorted(ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient']))
# max_df=0.3 is a document-frequency proportion; min_df=2 is an absolute count.
vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
# Row i of vec_train corresponds to the i-th remaining row of email_df (by position).
vec_train = vec.fit_transform(email_df.body)

# Print out the vector of the first email

In [None]:

# Slice out the first row of the TF-IDF matrix (kept 2D) and print it.
first_email_vector = vec_train[:1]
print(first_email_vector)

# Find cosine similarity between the first email and all others.

In [None]:
# Dot products between the first email's vector and every row of vec_train,
# flattened from a 1 x n_emails result into a 1-D array.
first_vs_all = linear_kernel(vec_train[:1], vec_train)
cosine_sim = first_vs_all.flatten()

# Show the similarity scores.
print(cosine_sim)

In [None]:
# Score every email against a free-text query.
query = "john"

# Vectorise the query with the already-fitted vectorizer (same vocabulary as vec_train).
vec_query = vec.transform([query])

# Dot product of the query vector with each email vector, as a 1-D array.
query_vs_all = linear_kernel(vec_query, vec_train)
cosine_sim = query_vs_all.flatten()

In [None]:
# Find top 10 most related emails to the query.
# argsort() sorts ascending, so reverse it and keep the first 10 entries
# (a [:-10:-1] slice would stop one element early and return only 9 indices).
related_email_indices = cosine_sim.argsort()[::-1][:10]
# Print out the indices of the 10 most related emails.
print(related_email_indices)

# Print out the body of the best match.
# The indices are row positions in vec_train, while email_df keeps its original
# index labels after rows were dropped, so the lookup must be positional (.iloc);
# a label lookup could return a different email or raise KeyError.
first_email_index = related_email_indices[0]
print(email_df.body.iloc[first_email_index])