<a href="https://colab.research.google.com/github/NollyKeyz/Simple-Python-Projects/blob/main/properly_cleaning_and_formating_an_email.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This is simple text-classification code for email data that has already been formatted as a CSV file. I have also provided code to properly parse an email

# Each line of code is explained in comments where necessary

In [None]:
# importing important libraries

import tensorflow as tf
from tensorflow import keras
import pandas as pd

# **Be sure to replace the 'your_filename' with the appropriate csv filename**

In [None]:
# load the exported email CSV into a pandas DataFrame

email_document = pd.read_csv(filepath_or_buffer='your_filename.csv')

# Since this is email data, it will most likely have a subject column. You can adjust this to suit what you intend to see

In [None]:
# display the 'Subject' column of the email data

email_document['Subject']

In [None]:
# preview the first rows to see which columns the data contains

email_document.head()

In [None]:
# shape is a (number of rows, number of columns) tuple describing the DataFrame

print(email_document.shape)

# We'll need to carry out some preprocessing activities on our data

## **Preprocessing**

In [None]:
!pip install contractions

import contractions # useful for single words containing certain characters
import re # for implementing some of the preprocessing steps
from string import punctuation # the punctuation is used to escape punctuation

def clean_text(text):
    """Normalise one raw email field into lowercase, punctuation-free text.

    Steps, in order: lowercase, strip HTML tags, strip [bracketed] spans,
    expand contractions, drop URLs, drop tokens that contain a digit and
    replace punctuation with spaces.

    Args:
        text: Raw field value. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        str: The cleaned text. Every removed span is replaced by a space, so
        the result may contain runs of spaces.
    """
    # A missing cell would otherwise be stringified into the literal word
    # 'nan' / 'none' and show up in the word counts as if it were content.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # make text lowercase
    text = str(text).lower()
    # remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # remove text in square brackets
    text = re.sub(r'\[.*?\]', ' ', text)
    # expand contractions especially for words with apostrophe.
    # split()/join also collapses every whitespace run (including newlines)
    # into a single space, so no separate newline removal is needed afterwards.
    text = " ".join([contractions.fix(expanded_word) for expanded_word in text.split()])
    # remove links
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # remove leftover angle-bracket fragments such as '<>' that the tag pattern skips
    text = re.sub(r'<.*?>+', ' ', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', ' ', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(punctuation), ' ', text)
    return text

In [None]:
# run the clean_text function over every email body and subject,
# storing the results in new columns next to the raw ones
email_document['Clean_Body'] = email_document['Body'].apply(clean_text)
email_document['Clean_Subject'] = email_document['Subject'].apply(clean_text)

email_document.head()

## The natural language toolkit (NLTK) is useful for working with human text data

In [None]:
!pip install nltk

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize needs the Punkt models; newer NLTK releases look them up under
# 'punkt_tab' while older ones use 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')

# calculate the number of sentences for each email body and subject.
# The counts are taken from the raw text: clean_text replaces every '.', '!'
# and '?' with a space, so the Clean_* columns have no sentence boundaries left
# and would report at most one sentence per email.
email_document['no_body_sentences'] = email_document['Body'].fillna('').astype(str).apply(lambda x: len(sent_tokenize(x)))
email_document['no__subject_sentences'] = email_document['Subject'].fillna('').astype(str).apply(lambda x: len(sent_tokenize(x)))

email_document.head()

In [None]:
from nltk.tokenize import word_tokenize


# split every cleaned body and subject into a list of word tokens
email_document['email_body_words'] = email_document['Clean_Body'].map(str).apply(word_tokenize)
email_document['subject_words'] = email_document['Clean_Subject'].map(str).apply(word_tokenize)

email_document.head()

In [None]:
from collections import Counter # used for counting the frequency of words appearance

# flatten the per-email token lists and count every word across all bodies
top = Counter(item for sublist in email_document['email_body_words'] for item in sublist)
# Naming the columns in the constructor (rather than assigning .columns
# afterwards) keeps this cell working when there are no tokens at all:
# an empty frame has zero columns, so assigning two names would raise.
temp_df = pd.DataFrame(top.most_common(40), columns=['Common_words','count'])
temp_df.style.background_gradient(cmap = 'Blues')

# Stop Words removal

## Words such as 'and', 'the' etc which are not so relevant are removed for improved results

In [None]:
import nltk
from nltk.corpus import stopwords # i imported this to remove stopwords

nltk.download('stopwords')

In [None]:
# NLTK's English stop words plus corpus-specific noise tokens
# (swap 'utf', 'b', 'q' for whatever unhelpful words recur in your own data)
stop_words = stopwords.words('english') + ['utf', 'b', 'q']

def remove_stopwords(texts, stopword_list=None):
  """Return the tokens of ``texts`` that are not stop words, in original order.

  Args:
    texts: Iterable of word tokens (hashable values such as strings).
    stopword_list: Optional iterable of words to drop. When omitted, the
      module-level ``stop_words`` list is used.

  Returns:
    list: The tokens that are not in the stop-word collection.
  """
  if stopword_list is None:
    stopword_list = stop_words
  # Build the set once per call so each token is a hash lookup rather than
  # a scan through the whole stop-word list.
  blocked = set(stopword_list)
  return [word for word in texts if word not in blocked]

# drop stop words from the tokenized bodies and subjects
email_document['email_body_without_sw'] = email_document['email_body_words'].apply(remove_stopwords)
email_document['subject_without_sw'] = email_document['subject_words'].apply(remove_stopwords)
email_document.head()

# Tweak your own code depending on the results you intend to achieve with it

## I am particularly interested in the email body and subject for the nature of my classification task

In [None]:
# show the 40 most common words in the body of the emails (stop words removed)

top = Counter(item for sublist in email_document['email_body_without_sw'] for item in sublist)
# Naming the columns in the constructor keeps this cell working when there are
# no tokens: assigning two names to an empty, zero-column frame would raise.
temp_df = pd.DataFrame(top.most_common(40), columns=['Common_words','count'])
temp_df.style.background_gradient(cmap = 'Greens')

In [None]:
# show the 40 most common words in the subject of the emails (stop words removed)

top = Counter(item for sublist in email_document['subject_without_sw'] for item in sublist)
# Naming the columns in the constructor keeps this cell working when there are
# no tokens: assigning two names to an empty, zero-column frame would raise.
temp_df = pd.DataFrame(top.most_common(40), columns=['Common_words','count'])
temp_df.style.background_gradient(cmap = 'Reds')

# Function definition is necessary when dealing with some tasks.

## In my case, I am dealing with four classes based on the kind of emails I received: social, religious/education, finance, and others that do not fall into any of the aforementioned categories. Yours could be different.

In [None]:
def class_creation(subjects): # define a function for class creation
  """Label each subject with the first category whose keywords it contains.

  Categories are checked in a fixed priority order (social, then religious
  and education, then finance); a subject that matches none of them is
  labelled 'other updates'. The match is ``keyword in subject``: token
  membership when a subject is a list of words, substring search when it is
  a plain string.

  Args:
    subjects: Iterable of subjects, one per email.

  Returns:
    list: One label string per subject, in input order.
  """
  # (label, keywords) pairs in priority order: the first category that hits wins
  category_rules = (
    ('social', ('facebook', 'email', 'emails', 'linkedin', 'post', 'twitter',
                'commented', 'posts', 'page', 'invitation', 'invitations', 'posted')),
    ('religious and education', ('proverb', 'school')),
    ('finance', ('transaction', 'gens', 'otp')),
  )

  def pick_label(subject):
    for category, keywords in category_rules:
      for keyword in keywords:
        if keyword in subject:
          return category
    # nothing matched: fall back to the catch-all class
    return 'other updates'

  return [pick_label(subject) for subject in subjects]


# attach a category label to every email, derived from its stop-word-free subject tokens
email_document['class_labels'] = class_creation(email_document['subject_without_sw'].tolist())
email_document.head()

In [None]:
def access_num_label(string_label): # a function to assign numerical labels to each of the classes
  """Convert class-name strings into their integer class ids.

  Mapping: 'social' -> 0, 'finance' -> 1, 'religious and education' -> 2,
  'other updates' -> 3.

  Args:
    string_label: Iterable of class-name strings.

  Returns:
    list: One int per input label, in input order.

  Raises:
    ValueError: If a label is not one of the four known classes. Skipping it
      instead would return a list shorter than the input, so the ids would
      no longer line up with the rows they were computed from.
  """
  label_to_id = {
    'social': 0,
    'finance': 1,
    'religious and education': 2,
    'other updates': 3,
  }
  labels = []
  for label in string_label:
    try:
      labels.append(label_to_id[label])
    except (KeyError, TypeError):
      # TypeError covers unhashable values, which cannot be a known label either
      raise ValueError('unknown class label: %r' % (label,)) from None
  return labels

# store the integer class id of every email next to its text label
email_document['label'] = access_num_label(email_document['class_labels'].tolist())
email_document.head()

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")

# reduce every remaining token to its WordNet lemma (default part of speech)
lemmatizer = WordNetLemmatizer()
email_document['lemmatized_body'] = email_document['email_body_without_sw'].apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))
email_document['lemmatized_subject'] = email_document['subject_without_sw'].apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))
email_document.head()

In [None]:
# glue the lemmatized tokens back into one space-separated string per email
email_document['final_body'] = email_document['lemmatized_body'].apply(' '.join)
email_document['final_subject'] = email_document['lemmatized_subject'].apply(' '.join)
email_document.head()

# I hope this helps

# T for Thanks