In [15]:
import pandas as pd
data = pd.read_csv("/content/emails.CSV")
data.shape

(3107, 4)

In [13]:
data2 = pd.read_csv("/content/emails2.CSV")
data2.shape

(3913, 4)

In [14]:
data3 = pd.read_csv("/content/emails3.CSV")
data3.shape

(5274, 4)

In [16]:
# concatenate the two dataframes vertically
merged_df = pd.concat([data, data2, data3])

# count the number of duplicates
num_duplicates = merged_df.duplicated().sum()

# print the number of duplicates
print(f'There are {num_duplicates} duplicates in the merged dataframe.')

There are 3410 duplicates in the merged dataframe.


In [17]:
df = merged_df.drop_duplicates()
df.shape

(8884, 4)

In [18]:
for col in df.columns:
    num_na = df[col].isna().sum()
    print(f"Column '{col}' has {num_na} NaN values.")

Column 'Subject' has 61 NaN values.
Column 'Body' has 17 NaN values.
Column 'From: (Name)' has 0 NaN values.
Column 'To: (Name)' has 284 NaN values.


In [19]:
# dt = df[
#     (df["To: (Name)"] != "Saad Mourafik < 81199 >") &
#     (df["To: (Name)"] != "Achraf Ajrhourh < 115234 >") &
#     (df["To: (Name)"] != "Ouwais Zlaigi < 106931 >")
# ]
# dt.shape

(4193, 4)

In [20]:
!pip install bertopic[all] -q
!pip install flair -q

[0m

In [21]:
import pandas as pd
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [22]:
df['Merged'] = df['Subject'] + ' ' + df['Body']

In [23]:
for col in df.columns:
    num_na = df[col].isna().sum()
    print(f"Column '{col}' has {num_na} NaN values.")

Column 'Subject' has 61 NaN values.
Column 'Body' has 17 NaN values.
Column 'From: (Name)' has 0 NaN values.
Column 'To: (Name)' has 284 NaN values.
Column 'Merged' has 78 NaN values.


In [24]:
# drop the rows where 'Merged' is NaN
df = df.dropna(subset=['Merged'])

In [25]:
# save the dataframe to a CSV file
df.to_csv('emails_df.csv', index=False)

In [27]:
df[['Merged']].head()

Unnamed: 0,Merged
0,Recent Canvas notifications You're signed up t...
1,"Going Back to GMT: Sunday 19th Dear Community,..."
2,You’re on the GPT-4 API waitlist! <https://li...
3,Master project Hello Achraf\r\n\r\nLet's meet ...
4,"Assignment graded: Project#1, SP23-CSC535601 D..."


Preprocessing

In [30]:
df["Merged"] = df["Merged"].str.lower()
df.head()

Unnamed: 0,Subject,Body,From: (Name),To: (Name),Merged
0,Recent Canvas notifications,You're signed up to receive a daily report of ...,AUI Canvas,Achraf Ajrhourh < 115234 >,recent canvas notifications you're signed up t...
1,Going Back to GMT: Sunday 19th,"Dear Community, \r\n\r\nWe would like to remin...",SGA,,"going back to gmt: sunday 19th dear community,..."
2,You’re on the GPT-4 API waitlist!,<https://linkprotect.cudasvc.com/url?a=https%...,OpenAI,Achraf Ajrhourh < 115234 >,you’re on the gpt-4 api waitlist! <https://li...
3,Master project,Hello Achraf\r\n\r\nLet's meet some time next ...,Asmae Mourhir,Achraf Ajrhourh < 115234 >,master project hello achraf\r\n\r\nlet's meet ...
4,"Assignment graded: Project#1, SP23-CSC535601 D...",Your assignment Project#1 has been graded.\r\n...,SP23-CSC535601 Data Engineering and Visualization,Achraf Ajrhourh < 115234 >,"assignment graded: project#1, sp23-csc535601 d..."


In [31]:
import re

def remove_name_id(text):
    # define the regex pattern to match a name followed by an ID enclosed in angle brackets
    pattern = r'\s\<\s*\d+\s*\>\s*'
    # apply the pattern to the input text and replace the matches with an empty string
    text = re.sub(pattern, '', text)
    # return the modified text
    return text

In [35]:
df["txt"] = df["Merged"].apply(lambda text: remove_name_id(text))

In [36]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [37]:
df["txt"] = df["txt"].apply(lambda text: remove_urls(text))

In [44]:
data.drop(['Subject', 'Body', 'From: (Name)', 'To: (Name)'], axis=1, inplace=True)

In [46]:
import re

def remove_pattern(text):
    # define the regex pattern to match the specified pattern
    pattern = r'[a-zA-Z]{2}\d{2}-[a-zA-Z]{3}\d{6}'
    # apply the pattern to the input text and replace the matches with an empty string
    text = re.sub(pattern, '', text)
    # return the modified text
    return text

In [47]:
df["txt"] = df["txt"].apply(lambda text: remove_pattern(text))

In [49]:
import re

def remove_emails(text):
    """
    Removes any email addresses from the given text using regular expressions.
    
    Parameters:
        text (str): The text to remove emails from.
    
    Returns:
        str: The text with any email addresses removed.
    """
    # define a regular expression pattern to match email addresses
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    
    # replace any matches of the email pattern with an empty string
    return re.sub(email_pattern, '', text)

In [50]:
df["txt2"] = df["txt"].apply(lambda text: remove_emails(text))

In [58]:
import re

def remove_nonenglish_chars(text):
    """
    Removes any non-English characters from the given text string.
    
    Parameters:
        text (str): The text to remove non-English characters from.
    
    Returns:
        str: The text with any non-English characters removed.
    """
    # define a regular expression pattern to match non-English characters
    nonenglish_pattern = '[^a-zA-Z0-9_.,!?():;\'\"-]'
    
    # replace any non-English characters in the text with an empty string
    text = re.sub(nonenglish_pattern, ' ', text)
    
    return text


In [59]:
df["txt2"] = df["txt2"].apply(lambda text: remove_nonenglish_chars(text))

In [61]:
import re

def remove_emoji(text):
    # regular expression pattern to match emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    # remove emojis from the text
    return emoji_pattern.sub(r'', text)

In [62]:
df["txt2"] = df["txt2"].apply(lambda text: remove_emoji(text))

In [63]:
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [64]:
def remove_digits(text):
    """Remove digits from a string using regular expressions"""
    return re.sub(r'\d+', '', text)

In [65]:
df["txt2"] = df["txt2"].apply(lambda text: remove_punctuation(text))
df["txt2"] = df["txt2"].apply(lambda text: remove_digits(text))

In [67]:
def remove_short_text(text):
    """
    Removes any text that has less than 5 words.
    
    Parameters:
        text (str): The text to remove short text from.
    
    Returns:
        str: The text with any short text removed.
    """
    # split the text into words
    words = text.split()
    
    # if the number of words is less than 5, return an empty string
    if len(words) < 5:
        return ''
    
    # otherwise, return the original text
    else:
        return text

In [68]:
df["txt2"] = df["txt2"].apply(lambda text: remove_short_text(text))

In [71]:
import numpy as np
df.replace('', np.nan, inplace=True)
df.dropna(subset=['txt2'], inplace=True)
df.tail()

Unnamed: 0,Merged,txt,txt2
5268,"flyers competition dear students, \r\n\r\n ...","flyers competition dear students, \r\n\r\n ...",flyers competition dear students ...
5269,fw: winter intersession 2022 announcement dea...,fw: winter intersession 2022 announcement dea...,fw winter intersession announcement dear aui...
5271,"mass dolphin hunt over 1,400 dolphins, includi...","mass dolphin hunt over 1,400 dolphins, includi...",mass dolphin hunt over dolphins including pre...
5272,"sao this break dear all,\r\nwe hope you had a ...","sao this break dear all,\r\nwe hope you had a ...",sao this break dear all we hope you had a gre...
5273,#goodread📰 - how to build an outstanding cv ! ...,#goodread📰 - how to build an outstanding cv ! ...,goodread how to build an outstanding cv gr...


In [72]:
df.head()

Unnamed: 0,Merged,txt,txt2
0,recent canvas notifications you're signed up t...,recent canvas notifications you're signed up t...,recent canvas notifications youre signed up to...
1,"going back to gmt: sunday 19th dear community,...","going back to gmt: sunday 19th dear community,...",going back to gmt sunday th dear community ...
2,you’re on the gpt-4 api waitlist! <https://li...,you’re on the gpt-4 api waitlist! < \t\r\n\r\...,you re on the gpt api waitlist ...
3,master project hello achraf\r\n\r\nlet's meet ...,master project hello achraf\r\n\r\nlet's meet ...,master project hello achraf lets meet some ...
4,"assignment graded: project#1, sp23-csc535601 d...","assignment graded: project#1, data engineerin...",assignment graded project data engineering a...


In [91]:
email_stopwords = [    'subject', 'cc', 'bcc', 'from', 'to', 'sent', 'received', 'reply', 'forward', 'unsubscribe',    'unsubscribe', 'attachment', 'attachments', 'http', 'https', 'www', 'com', 'org', 'net', 'edu',    'html', 'php', 'jpg', 'jpeg', 'gif', 'png', 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',    'txt', 'zip', 'rar', 'gzip', 'tar', 'exe', 'dll', 'file', 'filename', 'mime', 'message', 'part',    'content', 'type', 'charset', 'encoding', 'transfer', 'date', 'time', 'zone', 'gmt', 'utc', 'est',    'pst', 'jst', 'dow', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun', 'jan', 'feb', 'mar', 'apr',    'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'monday', 'tuesday', 'wednesday', 'thursday',    'friday', 'saturday', 'sunday', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',    'september', 'october', 'november', 'december', 'email', 'e-mail', 'mail', 'inbox', 'outbox', 'draft', 'spam',    'junk', 'folder', 'message', 'messages', 'sent', 'received', 'address', 'addresses', 'account', 'accounts',    'login', 'log', 'logout', 'password', 'user', 'username', 'domain', 'server', 'client', 'smtp', 'pop',    'imap', 'ssl', 'tls', 'authentication', 'forwarding', 'replying', 'image', 'signature', 'unsubscribe',    'subscription', 'subscribe', 'opt-out', 'opt-in', 'list', 'newsletter', 'notification', 'confirm', 'confirmation',    'request', 'support', 'help', 'faq', 'privacy', 'policy', 'terms', 'conditions', 'legal', 'copyright',    'trademark', 'fd', 'aui', 'dear', 'al', 'akhawayn', 'ifrane', 'dev', 'regard', "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz", 'ouwais', 'zlaigi', 'saad', 'mourafik', 'achraf', 'ajrhourh', 'student', 'university', 'microsoft', 'office', 'hope', 'mailto', 'link', 'outlook', 'day', 'today', 'asmae', 'mourhir', 'mohamed', 'sincerely', 'amine']

In [92]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk_stopwords = stopwords.words('english')

STOPWORDS = set(nltk_stopwords + email_stopwords)
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
len(STOPWORDS)

1320

In [76]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [77]:
lemmatizer = WordNetLemmatizer()

In [78]:
def postprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in STOPWORDS]
    return tokens

In [94]:
df["tokens"] = df["txt2"].apply(lambda text: postprocess_text(text))

In [81]:
# save the dataframe to a CSV file
df.to_csv('emails_df_processed.csv', index=False)

In [95]:
from collections import Counter

cnt = Counter()

for tokens in df['tokens']:
    for word in tokens:
        cnt[word] += 1
        
cnt.most_common(10)

[('team', 4326),
 ('student', 3864),
 ('session', 2749),
 ('assignment', 2589),
 ('update', 2379),
 ('data', 2162),
 ('view', 2088),
 ('event', 2083),
 ('learning', 2076),
 ('meeting', 2060)]

In [257]:
# def preprocessing_pipeline(text):
#     """Preprocessing pipeline for text"""
#     # Remove punctuation
#     text = remove_punctuation(text)
#     # Remove digits
#     text = remove_digits(text)
#     # Remove URLs
#     text = remove_urls(text)
#     # Remove emojis
#     text = remove_emoji(text)
#     # Remove stopwords
#     text = remove_stopwords(text)
#     # Remove frequent words
#     text = remove_freqwords(text)
#     # Remove rare words
#     text = remove_rarewords(text)
#     # Stem words
#     text = stem_words(text)
#     return text

Model Building

**Model 1**: ('bert-large-cased')

**Model 2**

In [113]:
# embedding model
bertembeddings = TransformerDocumentEmbeddings('bert-large-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [114]:
topic_model = BERTopic(language="english", top_n_words=10, low_memory=True ,calculate_probabilities=False,
                     embedding_model=bertembeddings)

In [115]:
emails = []
for tokens in df['tokens']:
    email = ' '.join(tokens)
    emails.append(email)

In [99]:
emails[0]

'canvas notification signed receive daily report notification canvas report submission comment project data engineering visualization tajjeeddine rachidi comment submission ajrhourhfor project click view update setting'

In [116]:
topics, probs = topic_model.fit_transform(emails)

In [123]:
#extract most frequent topics

topic_model.get_topic_freq().head(5)

Unnamed: 0,Topic,Count
0,-1,3031
1,0,224
2,1,211
3,2,157
4,3,124


In [124]:
#show the top 10 words in topic 1

topic_model.get_topic(1)[:10]

[('gear', 0.07477451654370222),
 ('unmonitored', 0.07337985831326178),
 ('mailbox', 0.07281459304041164),
 ('install', 0.07216032674625304),
 ('icon', 0.07166992887569314),
 ('teammate', 0.06886122156441853),
 ('team', 0.06456333985013181),
 ('reach', 0.05659632611650831),
 ('redmond', 0.05646101525558963),
 ('corporation', 0.04977611276688403)]

In [125]:
topic_model.get_topic(2)[:10]

[('requested', 0.18805210677488918),
 ('kudos', 0.14708199660305962),
 ('csc', 0.12899198630674752),
 ('session', 0.1210481447743321),
 ('topic', 0.1100846646941633),
 ('programming', 0.08515462221340483),
 ('computer', 0.055181932347709406),
 ('structure', 0.052107930695310084),
 ('database', 0.03521735617101047),
 ('chapter', 0.03374569027489176)]

In [126]:
topic_model.get_topic(3)[:10]

[('hw', 0.04355561136715887),
 ('defense', 0.03960313339529488),
 ('hour', 0.039190243362581825),
 ('invitation', 0.03756843460528749),
 ('thesis', 0.03704083458321599),
 ('success', 0.035548912955291695),
 ('confirming', 0.034441067407725255),
 ('priority', 0.031283660980888944),
 ('talk', 0.03087469781527071),
 ('midnight', 0.0280446639344141)]

In [127]:
topic_model.get_topic(4)[:10]

[('mongodb', 0.05026371221417474),
 ('roboflow', 0.0390376507628493),
 ('atlas', 0.02843413744050454),
 ('github', 0.026975245099400996),
 ('yolov', 0.015911133227119137),
 ('elastic', 0.01387378309354939),
 ('apache', 0.013152821019772196),
 ('cluster', 0.01280493208377043),
 ('search', 0.012660386357626518),
 ('copilot', 0.012476816557157317)]

In [128]:
topic_model.get_topic(5)[:10]

[('aid', 0.04375805096733026),
 ('ptj', 0.0428462533409401),
 ('telephone', 0.04194618015339901),
 ('registrar', 0.038377277385012926),
 ('financial', 0.03712019980536954),
 ('drop', 0.034195481700798704),
 ('nb', 0.029871257110107904),
 ('portal', 0.028841285347229045),
 ('officer', 0.027218603297152595),
 ('withdrawal', 0.02393926342242206)]

Evaluation

In [129]:
texts = [[word for word in str(document).split()] for document in emails]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

In [130]:
topics=[]
for i in topic_model.get_topics():
  row=[]
  topic= topic_model.get_topic(i)
  for word in topic:
     row.append(word[0])
  topics.append(row)

In [131]:
# compute Coherence Score

cm = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_npmi')
coherence = cm.get_coherence() 
print('\nCoherence Score: ', coherence)


Coherence Score:  0.14581923931650217


0.5 basic – good beginner level 1.0 good 2.0 very good 3.0+ excellent

In [132]:
# Save model
topic_model.save("my_model3")	

In [122]:
topic_model.visualize_topics()

**Model 2**

In [133]:
# embedding model
bertembeddings = TransformerDocumentEmbeddings('roberta-large')
topic_model = BERTopic(language="english", top_n_words=10, low_memory=True ,calculate_probabilities=False,
                     embedding_model=bertembeddings)
emails = []
for tokens in df['tokens']:
    email = ' '.join(tokens)
    emails.append(email)
topics, probs = topic_model.fit_transform(emails)
texts = [[word for word in str(document).split()] for document in emails]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]
topics=[]
for i in topic_model.get_topics():
  row=[]
  topic= topic_model.get_topic(i)
  for word in topic:
     row.append(word[0])
  topics.append(row)
# compute Coherence Score

cm = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_npmi')
coherence = cm.get_coherence() 
print('\nCoherence Score: ', coherence)
# Save model
topic_model.save("my_model4")	
topic_model.visualize_topics()

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]


Coherence Score:  0.1913481179522788


Model 3:

In [134]:
# embedding model
bertembeddings = TransformerDocumentEmbeddings('xlm-roberta-large')
topic_model = BERTopic(language="english", top_n_words=10, low_memory=True ,calculate_probabilities=False,
                     embedding_model=bertembeddings)
emails = []
for tokens in df['tokens']:
    email = ' '.join(tokens)
    emails.append(email)
topics, probs = topic_model.fit_transform(emails)
texts = [[word for word in str(document).split()] for document in emails]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]
topics=[]
for i in topic_model.get_topics():
  row=[]
  topic= topic_model.get_topic(i)
  for word in topic:
     row.append(word[0])
  topics.append(row)
# compute Coherence Score

cm = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_npmi')
coherence = cm.get_coherence() 
print('\nCoherence Score: ', coherence)
# Save model
topic_model.save("my_model5")	
topic_model.visualize_topics()

Downloading:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]


Coherence Score:  0.1037824234083942


LDA Model

In [304]:
data = pd.read_csv("/content/emails.CSV")
data.drop(['From: (Address)', 'To: (Address)', 'CC: (Name)', 'CC: (Address)'], axis=1, inplace=True)
dt = data[data["To: (Name)"]!="Saad Mourafik < 81199 >"]
dt['Merged'] = dt['Subject'] + ' ' + dt['Body']
dt = dt.dropna(subset=['Merged'])

In [297]:
dt[['Merged']].head()

Unnamed: 0,Merged
12,MPCTalk delayed by one hour \tDear AUI communi...
14,Fw: Trip to Istanbul: Hurry up! Hurry up! Few ...
16,Spring 2022 MBA Thesis Oral Defenses Announcem...
17,"CIP 1002 Session 02 Survey Dear Students, \r\n..."
21,Instructions for CIP 1002 Session 02 Reflectio...


In [305]:
documents2 = dt['Merged'].values

In [299]:
documents2[0]

'MPCTalk delayed by one hour \tDear AUI community, \r\n\r\n\t \r\n\r\n\tWe hope you and your loved ones are doing great. \r\n\r\n\t \r\n\r\n\tToday\'s MPCTalk will be delayed by one hour (starts at 3:00 PM in the Conference room, Building 2).\r\n\r\n\t\r\n\t\r\n\r\n\tThe topic of today\'s MPCTalk is:\r\n\t\r\n\r\n\t "Sino-Moroccan bilateral relations and The importance of cultural exchanges in strengthening this cooperation"" \r\n\t\r\n\r\n\twith our guest: \r\n\t\r\n\r\n\tHis excellency Mr. Li Changlin: Ambassador of the People\'s Republic of China to Morocco \r\n\r\n\t \r\n\r\n\tThe talk will revolve around two axes, where Mr Li Xhanglin will talk about the bilateral relations between the two countries in the first phase, then he will talk about the importance of cultural exchange between our countries in the second phase. \r\n\t\r\n\r\n\tEach axis will be followed by Q&A and a discussion. \r\n\t\r\n\r\n\t\r\n\t\r\n\r\n\tPolitically Yours, \r\n\r\n\tMPC President \r\n\r\n \r\n\r\nTod

In [300]:
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

In [301]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define custom stopwords
custom_stopwords = set(["http", "aui", "dear", 'u' , 'best', '2', 'email', 'going', 'dev', 'https', 'pm', 'university', '22', 'ifrane', 'make', 'community', 'student', "regard", 'program', "amal", '2f', '3a', '2fe', '2fc', '2fimages', '2fdlai', '3d', '2femails', 'openai', '1'])

# Update the existing stopword set
stop_words.update(custom_stopwords)


def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [306]:
for i, doc in enumerate(documents2):
    documents2[i] = preprocess_text(doc)

In [307]:
# Create a dictionary and a bag-of-words representation of your emails
dictionary = Dictionary(documents2)
corpus = [dictionary.doc2bow(email) for email in documents2]

In [309]:
# Train an LDA model
num_topics = 5  # Choose the number of topics you want
lda = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

In [310]:
for i in range(num_topics):
    print(f"Topic {i}:")
    print([dictionary[word_id] for word_id, _ in lda.get_topic_terms(i)])

Topic 0:
['student', '2023', 'please', 'course', 'registration', 'regard', 'office', 'form', 'january', 'online']
Topic 1:
['student', 'please', 'de', '2022', 'housing', '2023', 'friday', 'workshop', 'office', 'room']
Topic 2:
['art', 'el', 'international', 'said', 'ennahid', 'icomos', 'member', 'architecture', 'islamic', 'committee']
Topic 3:
['event', 'club', 'u', 'student', 'please', 'join', 'sao', 'member', 'research', 'place']
Topic 4:
['saad', 'session', 'mourafik', 'please', 'confirmed', 'csc', 'student', '2022', 'cle', 'service']


In [288]:
coherence_model_lda = CoherenceModel(model=lda, texts=texts, dictionary=id2word, coherence='c_npmi')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  -0.42093584027101827


Summarization

In [1]:
!pip install transformers -q
from transformers import pipeline

classifier = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [3]:
import pandas as pd
data = pd.read_csv("/content/emails.CSV")
data.drop(['From: (Address)', 'To: (Address)', 'CC: (Name)', 'CC: (Address)'], axis=1, inplace=True)
dt = data[data["To: (Name)"]!="Saad Mourafik < 81199 >"]

In [4]:
dt.head()

Unnamed: 0,Subject,Body,From: (Name),To: (Name)
12,MPCTalk delayed by one hour,"\tDear AUI community, \r\n\r\n\t \r\n\r\n\tWe ...",Moroccan Politics Club,auistudents;auistaff;auifaculty
14,Fw: Trip to Istanbul: Hurry up!,Hurry up! Few seats are available 🙂\r\n\r\n___...,Student Activities Office,SAOStudents;SAOStaff;SAOFaculty
16,Spring 2022 MBA Thesis Oral Defenses Announcement,"Dear AUI community, \r\n\r\n\r\nThe School of ...",Rachida Azelmad,MBA Students;SBA Students;auistudents;SBA Facu...
17,CIP 1002 Session 02 Survey,"Dear Students, \r\n\r\n \r\n\r\n I hope my e-m...",Achehboune Fatima Ezahra,Mehdi Alami Idrissi < 82626 >;Mohamed Aymane A...
21,Instructions for CIP 1002 Session 02 Reflectio...,"Dear Student,\r\n\r\nAs a requirement for CIP ...",Salaheddine Zekri,Mehdi Alami Idrissi < 82626 >;Mohamed Aymane A...


In [8]:
txt = dt['Body'].iloc[0]
txt

'\tDear AUI community, \r\n\r\n\t \r\n\r\n\tWe hope you and your loved ones are doing great. \r\n\r\n\t \r\n\r\n\tToday\'s MPCTalk will be delayed by one hour (starts at 3:00 PM in the Conference room, Building 2).\r\n\r\n\t\r\n\t\r\n\r\n\tThe topic of today\'s MPCTalk is:\r\n\t\r\n\r\n\t "Sino-Moroccan bilateral relations and The importance of cultural exchanges in strengthening this cooperation"" \r\n\t\r\n\r\n\twith our guest: \r\n\t\r\n\r\n\tHis excellency Mr. Li Changlin: Ambassador of the People\'s Republic of China to Morocco \r\n\r\n\t \r\n\r\n\tThe talk will revolve around two axes, where Mr Li Xhanglin will talk about the bilateral relations between the two countries in the first phase, then he will talk about the importance of cultural exchange between our countries in the second phase. \r\n\t\r\n\r\n\tEach axis will be followed by Q&A and a discussion. \r\n\t\r\n\r\n\t\r\n\t\r\n\r\n\tPolitically Yours, \r\n\r\n\tMPC President \r\n\r\n \r\n\r\nToday youth, Tomorrow politicia

In [9]:
classifier(txt)

[{'summary_text': ' MPCTalk will be delayed by one hour (starts at 3:00 PM in the Conference room, Building 2) The talk will revolve around two axes, where Mr Li Xhanglin will talk about the bilateral relations between the two countries in the first phase . The second phase will be followed by Q&A and discussion .'}]