In [74]:
# Importing the pandas library for data manipulation and analysis.
import pandas as pd

In [75]:
# Read the tab-separated file 'email.txt' into a DataFrame.
# Because column names are supplied, the first line of the file is read as data
# (header=None), and the two columns are called 'temp' and 'email_body'.
email_data = pd.read_csv(
    'email.txt',
    sep='\t',
    header=None,
    names=['temp', 'email_body'],
)

In [76]:
# Preview the first five rows to check that both columns were parsed as expected.
email_data.head(5)

Unnamed: 0,temp,email_body
0,No,I look forward to meeting you and learning abo...
1,No,We look forward to seeing you next week!
2,No,A quick question before our meeting.
3,No,After sunning and drinking all day we feasted ...
4,No,"Also Tuesday, Kyle and Eric 1/2 hour."


In [77]:
# Remove the 'temp' column; the later cells only read 'email_body'.
# The frame is rebound (no inplace=True) and errors='ignore' is passed so this
# cell can be re-run safely: a second execution is a no-op instead of a KeyError.
email_data = email_data.drop(columns='temp', errors='ignore')

In [95]:
# Build a Hugging Face sentiment-analysis pipeline backed by the
# 'cardiffnlp/twitter-roberta-base-sentiment' checkpoint, using the PyTorch ('pt') framework.
from transformers import pipeline

sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    framework='pt',
)

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [96]:
# Run the sentiment pipeline once per email body and keep only the label of the
# first (top) prediction. At this point the labels are still the model's raw
# identifiers (LABEL_0 / LABEL_1 / LABEL_2).
email_data['sentiment'] = [
    sentiment_analyzer(body)[0]['label']
    for body in email_data['email_body']
]


In [97]:
# Preview the first five rows, now including the raw model labels in 'sentiment'.
email_data.head(5)

Unnamed: 0,email_body,sentiment
0,I look forward to meeting you and learning abo...,LABEL_2
1,We look forward to seeing you next week!,LABEL_2
2,A quick question before our meeting.,LABEL_1
3,After sunning and drinking all day we feasted ...,LABEL_2
4,"Also Tuesday, Kyle and Eric 1/2 hour.",LABEL_1


In [98]:
# Translate the model's raw identifiers into readable sentiment names.
# Caution: Series.map turns every value that is not a key of `labels` into NaN,
# so running this cell a second time (when the column already holds
# 'negative'/'neutral'/'positive') would blank the whole column.
labels = {
    "LABEL_0": 'negative',
    "LABEL_1": 'neutral',
    "LABEL_2": 'positive',
}
email_data['sentiment'] = email_data.sentiment.map(labels)

In [99]:
# Preview the first five rows to confirm the sentiment labels are now readable.
email_data.head(5)

Unnamed: 0,email_body,sentiment
0,I look forward to meeting you and learning abo...,positive
1,We look forward to seeing you next week!,positive
2,A quick question before our meeting.,neutral
3,After sunning and drinking all day we feasted ...,positive
4,"Also Tuesday, Kyle and Eric 1/2 hour.",neutral


In [None]:
# Defining a function to detect the formality of a given text using the Groq API.
# The API call checks if the text is 'formal' or 'informal' based on the model's output.

from groq import Groq

def detect_formality(text, client=None, model="llama-3.1-70b-versatile"):
    """Ask a Groq-hosted chat model whether ``text`` is formal, in one word.

    Args:
        text: The text to classify. It is interpolated verbatim into the prompt.
        client: Optional, already-constructed Groq client (or any object exposing
            ``chat.completions.create``). When omitted, a client is created from
            the ``GROQ_API_KEY`` environment variable. Passing one client in lets
            a caller reuse it across many rows instead of building one per call.
        model: Identifier of the chat model to query.
            NOTE(review): confirm this model id is still served by Groq.

    Returns:
        The model's streamed reply concatenated into a single string, returned
        unmodified (the notebook shows replies such as 'Formal.' and 'Informal').
    """
    if client is None:
        # Local import: the notebook never imports `os` at module level, so the
        # original body failed with NameError on a fresh kernel.
        import os

        client = Groq(api_key=os.getenv("GROQ_API_KEY"))

    # Only the user turn is sent. The original also appended an assistant
    # message that had a role but no "content" key, which is not a well-formed
    # chat message.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"tell if this text is formal or not in one word '{text}'",
            },
        ],
        temperature=0.2,
        max_tokens=100,
        top_p=1,
        stream=True,
        stop=None,
    )

    # With stream=True the reply arrives in chunks; a chunk's delta content may
    # be None, which is treated as an empty string.
    return "".join(chunk.choices[0].delta.content or "" for chunk in completion)

In [118]:
# Label every email by passing its body to detect_formality, one call per row,
# and store the model's reply in a new 'formality' column.
email_data['formality'] = [
    detect_formality(body) for body in email_data['email_body']
]

In [119]:
# Preview the first five rows, now including the raw 'formality' replies.
email_data.head(5)

Unnamed: 0,email_body,sentiment,formality
0,I look forward to meeting you and learning abo...,positive,Formal.
1,We look forward to seeing you next week!,positive,Formal
2,A quick question before our meeting.,neutral,Informal.
3,After sunning and drinking all day we feasted ...,positive,Informal.
4,"Also Tuesday, Kyle and Eric 1/2 hour.",neutral,Informal


In [120]:
# Tally the distinct raw replies in 'formality'; this exposes spelling variants
# (with/without a trailing period) that need to be merged.
email_data['formality'].value_counts()

formality
Informal     371
Formal.      281
Informal.    231
Formal       109
Name: count, dtype: int64

In [124]:
# Collapse the four raw reply variants into two lowercase labels.
# The result is assigned back to the column. Calling
# email_data['formality'].replace(..., inplace=True) is chained assignment on a
# column selection: pandas 2.x warns about it, and under Copy-on-Write it no
# longer updates email_data at all.
email_data['formality'] = email_data['formality'].replace(
    {
        'Formal.': 'formal',
        'Formal': 'formal',
        'Informal.': 'informal',
        'Informal': 'informal',
    }
)


In [125]:
# Re-count after normalization: only 'formal' and 'informal' should remain.
email_data['formality'].value_counts()

formality
informal    602
formal      390
Name: count, dtype: int64

In [130]:
# Persist the labelled emails. The row index is written as well (index=True is
# the default), so the file gets an unnamed leading column.
email_data.to_csv('email_data_labels.csv', index=True)

In [131]:
# Load the labels used for fine-tuning.
# NOTE(review): presumably a manually corrected copy of 'email_data_labels.csv';
# the notebook does not show how this file was produced.
fine_tune_data = pd.read_csv(
    filepath_or_buffer='email_data_labels_after_correction.csv',
)

In [135]:
# Build the fine-tuning target: "<sentiment> <formality>", e.g. "positive formal".
fine_tune_data['output'] = (
    fine_tune_data.sentiment
    + " "
    + fine_tune_data.formality
)

In [138]:
# Drop the saved index column ('Unnamed: 0') and the two label columns that were
# merged into 'output'. The frame is rebound (no inplace=True) and
# errors='ignore' is passed so re-running this cell is a no-op, not a KeyError.
fine_tune_data = fine_tune_data.drop(
    columns=['Unnamed: 0', 'sentiment', 'formality'],
    errors='ignore',
)

In [140]:
# Rename the text column to 'input' so the frame has an input/output pair of columns.
fine_tune_data = fine_tune_data.rename({'email_body': 'input'}, axis='columns')

In [141]:
# Preview the first five input/output pairs.
fine_tune_data.head(5)

Unnamed: 0,input,output
0,I look forward to meeting you and learning abo...,positive formal
1,We look forward to seeing you next week!,positive formal
2,A quick question before our meeting.,neutral informal
3,After sunning and drinking all day we feasted ...,positive informal
4,"Also Tuesday, Kyle and Eric 1/2 hour.",neutral informal


In [None]:
# Write the input/output pairs to 'fine_tune_data.csv' for use in fine-tuning.
# NOTE(review): the row index is written too (index=True is the default), so the
# file carries an unnamed leading column - confirm the consumer expects it.
fine_tune_data.to_csv('fine_tune_data.csv', index=True)