In [None]:
import pandas as pd

# Read the raw email dataset from disk.
data = pd.read_csv('emails(2).csv')

# Quick look at what was loaded: the first rows, then the summary statistics.
for preview in (data.head(), data.describe()):
    print(preview)


In [None]:
# Data cleaning: discard exact duplicate rows first, then any row that still
# has a missing value in at least one column.
data = data.drop_duplicates().dropna()

# Write a function to clean text data
import re

def clean_text(text):
    """Strip HTML tags and punctuation from a piece of text.

    Args:
        text: The raw string to clean.

    Returns:
        The text with every ``<...>`` tag replaced by a single space and every
        character that is neither a word character nor whitespace removed.
        Underscores are kept, because the regex word class includes them.
    """
    # Match up to the closing '>' with a negated class rather than '.', so a
    # tag that wraps over several lines is still removed. Substitute a space
    # (not an empty string) so the words on either side of a tag are not glued
    # together: "Hello<br>world" must not become "Helloworld".
    text = re.sub(r'<[^>]*>', ' ', text)
    # Drop everything that is not a word character or whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    return text


# Clean every email body and keep the result next to the raw text.
data['cleaned_text'] = data['text'].apply(clean_text)

# Show a few before/after pairs. head(5) selects rows by position, so this
# works even when the rows labelled 0-4 were dropped during de-duplication or
# missing-value removal, and when fewer than five rows remain.
print("Raw text vs Cleaned text")
for raw_text, cleaned_text in zip(data['text'].head(5), data['cleaned_text'].head(5)):
    print("Raw:", raw_text)
    print("Cleaned:", cleaned_text)


In [None]:
# Remove emojis
# Character class listing the emoji code-point ranges that get stripped;
# compiled once here rather than on every call.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with every run of characters from the listed emoji ranges deleted.

    Characters outside those four ranges are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

# Run the emoji filter over the cleaned email text and store it back in the
# same column.
# NOTE(review): clean_text's punctuation filter has presumably already removed
# these symbols by this point, so this pass may find nothing left to strip --
# confirm whether it was meant to run on the raw text instead.
data['cleaned_text'] = data['cleaned_text'].map(remove_emojis)


In [None]:
# Stemming using NLTK
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Lemmatization using NLTK
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer reads the WordNet corpus, which is not shipped with the
# nltk package itself. Fetch it only when it is missing, so that a fresh
# environment can run the notebook top to bottom instead of failing with a
# LookupError at the first lemmatize call.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Function to apply stemming and lemmatization
def stem_and_lemmatize(text):
    """Stem and lemmatize ``text`` word by word.

    The text is split on whitespace once; every word is passed through the
    module-level ``stemmer`` and, independently, through the module-level
    ``lemmatizer``.

    Returns:
        A ``(stemmed_text, lemmatized_text)`` tuple of space-joined strings.
    """
    words = text.split()
    stems = map(stemmer.stem, words)
    lemmas = map(lemmatizer.lemmatize, words)
    return ' '.join(stems), ' '.join(lemmas)

# Stem and lemmatize every cleaned email, then unzip the resulting
# (stemmed, lemmatized) pairs into one column each.
stem_lemma_pairs = data['cleaned_text'].map(stem_and_lemmatize)
stemmed_column, lemmatized_column = zip(*stem_lemma_pairs)
data['stemmed_email'] = stemmed_column
data['lemmatized_email'] = lemmatized_column


In [None]:
# Remove stop words manually
stop_words_list = ['the', 'of', 'to', 'is','am','are']  #list of stop words


def _drop_stop_words(text, stop_vocabulary):
    """Return ``text`` without the whitespace-separated words found in ``stop_vocabulary``.

    Words are compared in lower case because both stop-word collections used
    below are lower case while the email text keeps its original casing; without
    this, capitalised stop words such as "The" would survive. Words that are
    kept retain their original casing.
    """
    return ' '.join(word for word in text.split() if word.lower() not in stop_vocabulary)


data['no_stop_words_email'] = data['lemmatized_email'].apply(_drop_stop_words, args=(stop_words_list,))

# Remove stop words using NLTK
import nltk
from nltk.corpus import stopwords

# The stop-word lists are a separate NLTK download; fetch them only when they
# are missing so a fresh environment does not fail with a LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# A set gives constant-time membership tests for the much longer NLTK list.
stop_words = set(stopwords.words('english'))

data['no_stop_words_email_nltk'] = data['lemmatized_email'].apply(_drop_stop_words, args=(stop_words,))


In [None]:
# Split each email (manual stop-word list already applied) into a list of
# whitespace-separated tokens.
data['tokenized_text'] = data['no_stop_words_email'].map(str.split)


In [None]:
#Calculate TF-IDF vectors for the text
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf(text):
    """Fit a fresh ``TfidfVectorizer`` on ``text`` and return the TF-IDF matrix.

    The vectorizer is created with its default settings and is not returned,
    so only the transformed matrix is available to the caller.
    """
    return TfidfVectorizer().fit_transform(text)

# Fit TF-IDF on the stop-word-filtered email text. The result is a single
# sparse matrix with one row per email; keep it whole for any later modelling.
tfidf_matrix = calculate_tfidf(data['no_stop_words_email'])

# A DataFrame column needs exactly one value per row. Assigning the 2-D sparse
# matrix directly either raises or copies the entire matrix into every cell
# (depending on the pandas version), so store each email's own sparse row.
# The list is assigned positionally, matching the row order used for fitting.
data['tfidf'] = [tfidf_matrix[row] for row in range(tfidf_matrix.shape[0])]


In [None]:
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object
count_vectorizer = CountVectorizer()

# Apply BoW representation
bow_representation = count_vectorizer.fit_transform(data['no_stop_words_email'])

# Compare TF-IDF and BoW representations for a specific email
email_index = 10  # positional row number of the email to inspect

# TF-IDF representation
# Use .iloc so this is the same positional row as bow_representation[email_index].
# A plain data['tfidf'][email_index] looks up the index *label* 10, which can
# be a different email (or missing altogether) once duplicate or incomplete
# rows have been dropped from the frame.
print("TF-IDF representation:")
print(data['tfidf'].iloc[email_index])

# BoW representation
print("Bag of Words (BoW) representation:")
print(bow_representation[email_index])
