# Introduction

This notebook includes code for preprocessing text-based journal data, focusing on dates and entries. It applies topic modeling using popular methods such as LDA, NMF, and LSA from the scikit-learn library. Additionally, sentiment analysis is performed using the VaderSentiment package. The resulting output file contains sentiment categories and scores, which will serve as input for the mental health analysis notebook.

# Libraries

In [3]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import numpy as np
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline
# from bertopic import BERTopic

nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

# Loading Data from a Text File

In [4]:
# Function to load data from a .txt file
def load_txt_file(file_path):
    """
    Load text data from a .txt file.
    
    Parameters:
    file_path (str): Path to the .txt file
    
    Returns:
    str: Contents of the file as a single string
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.read()
        return data
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Specify the file path. Replace with personal journal text data.
file_path = '2024_Journal.txt'  
# Load the data
text_data = load_txt_file(file_path)

# Check the imported data
if text_data:
    print (" Text data is successfully uploaded!")


 Text data is successfully uploaded!


# Checking for Available Dates in Text Data

In [5]:
def extract_dates_from_file(file_path):
    # Define a regular expression pattern to match dates in various formats used in the journal.
    date_pattern = r"""
        (\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{1,2},\s2024\b)|  # Matches Jan 9, 2024
        (\b\d{1,2}(?:st|nd|rd|th)?\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\b)  # Matches 25th November
    """

    # Compile the regular expression with multiline and ignore whitespace flags
    date_regex = re.compile(date_pattern, re.VERBOSE | re.IGNORECASE)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:  # Specify UTF-8 encoding
            content = file.read()

        # Find all matches in the file content
        matches = date_regex.findall(content)

        # Flatten the matches and remove empty strings
        dates = [date for group in matches for date in group if date]

        return dates, len(dates)

    except FileNotFoundError:
        print("The file was not found. Please check the file path.")
        return [], 0
    except UnicodeDecodeError as e:
        print(f"Unicode decode error: {e}. Try specifying the correct file encoding.")
        return [], 0

# Example usage
file_path = "2024_Journal.txt"  # Replace with your text file path
dates, count = extract_dates_from_file(file_path)
print(f"Dates found: {dates}")
print(f"Number of dates: {count}")


Dates found: ['25th November', '26th November', '24th November', '27th November', 'Nov 28, 2024', 'Nov 30, 2024', 'Dec 1, 2024', 'Dec 2, 2024', 'Dec 3, 2024', 'Dec 4, 2024', 'Dec 5, 2024', 'Dec 6, 2024', 'Dec 7, 2024', 'Dec 8, 2024', 'Dec 9, 2024', 'Dec 10, 2024', 'Dec 11, 2024', 'Dec 12, 2024', 'Dec 13, 2024', 'Dec 14, 2024', 'Dec 15, 2024', 'Dec 16, 2024', 'Nov 24, 2024', 'Nov 21, 2024', 'Nov 18, 2024', 'Nov 12, 2024', '10th November', 'Nov 8, 2024', 'Nov 7, 2024', '16th September', 'Oct 8, 2024', 'Sep 30, 2024', 'Sep 28, 2024', 'Sep 27, 2024', 'Sep 18, 2024', 'Sep 17, 2024', 'Sep 16, 2024', 'Sep 9, 2024', 'Aug 9, 2024', 'Aug 8, 2024', '29th July', 'Jul 29, 2024', 'Jul 26, 2024', 'Jul 23, 2024', 'Jul 19, 2024', 'Jul 16, 2024', 'Jul 11, 2024', 'Jul 9, 2024', 'Jul 4, 2024', 'Jun 26, 2024', '24th August', 'Jun 18, 2024', 'May 29, 2024', 'May 21, 2024', 'May 10, 2024', 'May 1, 2024', 'Apr 19, 2024', 'Mar 26, 2024', 'Mar 19, 2024', 'Mar 15, 2024', 'Mar 13, 2024', 'Mar 12, 2024', 'Mar 9, 2

# Topic Modelling

In [6]:
# Load and preprocess the text file
def load_documents(file_path, delimiter='\n\n'):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Split the content into documents
        documents = content.split(delimiter)
        return [doc.strip() for doc in documents if doc.strip()]
    except FileNotFoundError:
        print("File not found. Check the file path.")
        return []

def preprocess_text(documents):
    stop_words = set(stopwords.words('english'))  # Load stop words
    processed_docs = []
    for doc in documents:
        # Lowercase, remove special characters, and strip whitespace
        doc = doc.lower()
        doc = re.sub(r'[^a-z\s]', '', doc)
        doc = re.sub(r'\s+', ' ', doc).strip()
        
        # Remove stop words
        doc = ' '.join(word for word in doc.split() if word not in stop_words)
        
        processed_docs.append(doc)
    return processed_docs

# Topic Modeling using LDA
def lda_topic_modeling(documents, n_topics=5):
    vectorizer = CountVectorizer(stop_words='english')
    dtm = vectorizer.fit_transform(documents)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(dtm)
    topics = []
    for idx, topic in enumerate(lda.components_):
        top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]]
        topics.append(f"Topic {idx + 1}: {', '.join(top_words)}")
    return topics

# Topic Modeling using NMF
def nmf_topic_modeling(documents, n_topics=5):
    vectorizer = TfidfVectorizer(stop_words='english')
    dtm = vectorizer.fit_transform(documents)
    nmf = NMF(n_components=n_topics, random_state=42)
    nmf.fit(dtm)
    topics = []
    for idx, topic in enumerate(nmf.components_):
        top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]]
        topics.append(f"Topic {idx + 1}: {', '.join(top_words)}")
    return topics

# Topic Modeling using LSA
def lsa_topic_modeling(documents, n_topics=5):
    vectorizer = TfidfVectorizer(stop_words='english')
    dtm = vectorizer.fit_transform(documents)
    lsa = TruncatedSVD(n_components=n_topics, random_state=42)
    lsa.fit(dtm)
    topics = []
    for idx, topic in enumerate(lsa.components_):
        top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]]
        topics.append(f"Topic {idx + 1}: {', '.join(top_words)}")
    return topics


# Main function to test the techniques

file_path = "2024_Journal.txt"  # Replace with your text file path
delimiter = '\n\n'  # Define how documents are separated in your file

# Load and preprocess documents
documents = load_documents(file_path, delimiter)


processed_docs = preprocess_text(documents)

print("\nLDA Topics:")
lda_topics = lda_topic_modeling(processed_docs)
for topic in lda_topics:
    print(topic)

print("\nNMF Topics:")
nmf_topics = nmf_topic_modeling(processed_docs)
for topic in nmf_topics:
    print(topic)

print("\nLSA Topics:")
lsa_topics = lsa_topic_modeling(processed_docs)
for topic in lsa_topics:
    print(topic)


LDA Topics:
Topic 1: stuff, like, really, chapter, days, work, going, need, life, time
Topic 2: need, know, time, want, lord, god, feel, life, like, phone
Topic 3: need, jul, nice, lord, far, future, thank, god, going, trend
Topic 4: really, past, feel, god, days, like, going, time, life, need
Topic 5: want, know, money, life, time, need, dec, went, like, day

NMF Topics:
Topic 1: stuff, think, position, phd, data, things, going, time, need, work




Topic 2: believe, shall, live, pray, thinking, thank, chapter, lord, god, life
Topic 3: school, interesting, going, today, enjoyed, hoped, super, far, jul, trend
Topic 4: journal, lot, wasting, long, past, wasted, time, nov, month, days
Topic 5: patience, know, let, gilbert, people, dont, lonely, need, like, feel

LSA Topics:
Topic 1: know, days, lord, feel, god, like, going, need, time, life
Topic 2: far, pray, oh, live, thinking, shall, thank, god, lord, life
Topic 3: future, thank, school, super, hoped, today, going, far, jul, trend
Topic 4: screen, watching, th, wasting, wasted, cat, month, life, nov, days
Topic 5: meeting, days, enjoyed, like, feel, jul, interesting, people, trend, super


# Put Data into a Table

In [7]:
# Function to extract date and text from the file
def extract_data_from_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Regular expression to match dates (e.g., Nov 25, 2024 or Dec 13, 2024)
    date_pattern = r'(\b[A-Za-z]{3,9} \d{1,2}, \d{4}\b)'  # e.g., "Nov 25, 2024"
    matches = re.split(date_pattern, content)

    # Process matches to extract date and text pairs
    data = []
    for i in range(1, len(matches), 2):  # Start at index 1 for dates
        date = matches[i].strip()
        text = matches[i + 1].strip() if i + 1 < len(matches) else ''
        data.append((date, text))
    
    # Create a DataFrame
    df = pd.DataFrame(data, columns=["Date", "Text"])
    return df


# Extract data and store in a DataFrame
data_table = extract_data_from_txt(file_path)


# Save the table to a CSV file
data_table.to_csv('output_table.csv', index=False)


# Sentiment Analysis

In [8]:
output_data = pd.read_csv('output_table.csv')

In [9]:
output_data

Unnamed: 0,Date,Text
0,"Nov 28, 2024",I don’t remember what I did on this day. Wedne...
1,"Nov 30, 2024",I went out to meet Bazil in the evening. I spe...
2,"Dec 1, 2024",I did not go to church because of the rain. Th...
3,"Dec 2, 2024",I remember going to school to meet Glen. I bou...
4,"Dec 3, 2024","Stayed at home the whole day, bought fried chi..."
...,...,...
65,"Jan 26, 2024",Okay so we are coming to the end of January an...
66,"Jan 17, 2024",October 2023 to January 2024 Chat GPT Summary\...
67,"Jan 11, 2024",I am still procrastinating Howdy’s assignment ...
68,"Jan 9, 2024","This morning, I had a major relapse. But it ca..."


In [11]:
# Load and preprocess data
def load_data(file_path):
    return pd.read_csv(file_path)

# Lexicon-Based Sentiment Analysis (VADER)
def vader_sentiment_analysis(dataframe):
    analyzer = SentimentIntensityAnalyzer()
    sentiments = []
    scores = []

    for text in dataframe["Text"]:
        score = analyzer.polarity_scores(text)
        sentiment = (
            "positive" if score["compound"] > 0.05
            else "negative" if score["compound"] < -0.05
            else "neutral"
        )
        scores.append(score["compound"])
        sentiments.append(sentiment)
    
    dataframe["Sentiment Score"] = scores
    dataframe["Sentiment"] = sentiments
    return dataframe

In [12]:
file_path = "output_table.csv"  # Path to the CSV file
data = load_data(file_path)
analyzed_data = vader_sentiment_analysis(data)
    
# Save the updated data to a new CSV file
analyzed_data.to_csv("output_with_sentiments.csv", index=False)
print("Sentiment analysis completed. Results saved to 'output_with_sentiments.csv'.")


Sentiment analysis completed. Results saved to 'output_with_sentiments.csv'.
