# Dream Memory NLP Analysis

The NLP analysis of dream memories from the synthetic dataset will focus on:
1. Sentiment Analysis - Positivity/negativity of dream content
2. Emotional Tone - Emotion intensity and valence
3. Coherence Analysis - Logical flow between sentences



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so this applies to all of them).
# NOTE(review): presumably meant to keep cell output tidy; it also hides
# deprecation and data-quality warnings from pandas/NLTK — confirm intended.
warnings.filterwarnings('ignore')

# Global plotting defaults: matplotlib style sheet plus seaborn's "husl"
# colour palette for all subsequent figures.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
# Load every CSV export found in the dataset directory and stack them into a
# single DataFrame (real study data is expected to arrive as multiple CSV
# files as well).
data_dir = Path('chaos_dataset')

# Path.glob yields files in filesystem order, which is not guaranteed to be
# stable across machines; sort so the merged row order (and any positional
# index derived from it later) is reproducible.
csv_files = sorted(data_dir.glob('*.csv'))

# Fail early with a clear message: pd.concat on an empty list would otherwise
# raise a less helpful "No objects to concatenate" ValueError further down.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_dir.resolve()}")

for file in csv_files:
    print(f"  - {file.name}")

# Load and merge
all_data = []

for file in csv_files:
    df = pd.read_csv(file)
    all_data.append(df)

# Combine all dataframes; ignore_index gives the result a fresh 0..n-1 index
# instead of repeating each file's own row numbers.
merged_data = pd.concat(all_data, ignore_index=True)
print(f"Total merged dataset: {len(merged_data)} rows")
print(f"Columns: {list(merged_data.columns)}")


  - argentina_dream_study_month1.csv
  - argentina_dream_study_month2.csv
  - argentina_dream_study_month3.csv
  - argentina_dream_study_month4.csv
  - argentina_dream_study_month5.csv
  - usa_dream_study_month2.csv
  - usa_dream_study_month3.csv
  - usa_dream_study_month1.csv
  - usa_dream_study_month4.csv
  - usa_dream_study_month5.csv
Total merged dataset: 2050 rows
Columns: ['ParticipantID', 'Timepoint', 'Country', 'StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress', 'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId', 'RecipientLastName', 'RecipientFirstName', 'RecipientEmail', 'ExternalReference', 'LocationLatitude', 'LocationLongitude', 'DistributionChannel', 'UserLanguage', 'InformedConsent', 'Demo_Age', 'Demo_Education', 'Demo_Student', 'Demo_Gender', 'GAD_Bothered_1', 'GAD_Bothered_2', 'GAD_Bothered_3', 'GAD_Bothered_4', 'GAD_Bothered_5', 'GAD_Bothered_6', 'GAD_Bothered_7', 'GAD_DifficultWork_1', 'PHQ_Bothered_1', 'PHQ_Bothered_2', 'PHQ_Bothered_3', 'PHQ_B

In [None]:
# DreamMemory extraction
# Build the "has a dream text" mask once; the counts and the filtered frame
# below are all derived from it.
has_dream = merged_data['DreamMemory'].notna()

print(f"Total records: {len(merged_data)}")
print(f"Records with DreamMemory: {has_dream.sum()}")
print(f"Missing DreamMemory: {(~has_dream).sum()}")

# Keep only rows that have a dream memory; copy so later column assignments
# do not write into a view of merged_data.
dream_data = merged_data.loc[has_dream].copy()
print(f"Working with {len(dream_data)} dream records")

# Preview the first few dream texts
print("Samples:")
print(dream_data['DreamMemory'].head(3))



Total records: 2050
Records with DreamMemory: 2050
Missing DreamMemory: 0
Working with 2050 dream records
Samples:
0    I was being chased by something threatening bu...
1    I was being chased by something threatening bu...
2    I was searching for something important but co...
Name: DreamMemory, dtype: object


## 1. Sentiment Analysis - Positivity/Negativity

We will analyze the emotional polarity of the dream content using two sentiment analysis approaches: VADER for the overall tone, and TextBlob as a cross-check.


In [None]:
# Packages for sentiment analysis.
# Import them if they are already installed; otherwise install them into the
# environment of the interpreter running this notebook and import again.
try:
    from textblob import TextBlob
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer
    print("Sentiment analysis libraries already available")
except ImportError:
    print("Installing required packages...")
    import subprocess
    import sys

    def install_package(package):
        """Install *package* with pip for the interpreter running this notebook."""
        # sys.executable targets the kernel's own environment rather than
        # whichever `pip` happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    install_package("textblob")
    install_package("nltk")
    # NOTE(review): no visible cell imports vaderSentiment (VADER is used via
    # nltk.sentiment); kept in case a later cell relies on it — confirm.
    install_package("vaderSentiment")

    from textblob import TextBlob
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer

    print("installed successfully!")

# The VADER lexicon is a separate NLTK data download, not part of the nltk
# package itself. Check for it on every run, not only right after a fresh
# install: otherwise an environment that already has nltk but not the lexicon
# fails later with a LookupError when SentimentIntensityAnalyzer is created.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon', quiet=True)


Installing required packages...
Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.3/624.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: textblob
Successfully installed textblob-0.19.0
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Packages installed successfully!


In [9]:
def analyze_sentiment_textblob(text):
    """Score *text* with TextBlob.

    The input is coerced with ``str()`` before analysis.

    Returns:
        dict with two keys:
        'polarity' (-1 negative to 1 positive) and
        'subjectivity' (0 objective to 1 subjective).
    """
    sentiment = TextBlob(str(text)).sentiment
    result = {}
    result['polarity'] = sentiment.polarity
    result['subjectivity'] = sentiment.subjectivity
    return result

# Shared analyzer, created lazily on first use. Constructing a
# SentimentIntensityAnalyzer loads the VADER lexicon, so building a new one
# for every dream repeats that work once per row.
_vader_analyzer = None


def analyze_sentiment_vader(text):
    """Score *text* with NLTK's VADER sentiment analyzer.

    The input is coerced with ``str()`` before analysis. A single analyzer
    instance is reused across calls.

    Returns:
        dict with keys 'compound' (overall sentiment), 'positive',
        'negative' and 'neutral', taken from VADER's 'compound', 'pos',
        'neg' and 'neu' scores respectively.

    Raises:
        LookupError: if the 'vader_lexicon' NLTK resource is not installed
            (raised by NLTK when the analyzer is first created).
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    scores = _vader_analyzer.polarity_scores(str(text))
    return {
        'compound': scores['compound'],  # Overall sentiment
        'positive': scores['pos'],
        'negative': scores['neg'],
        'neutral': scores['neu']
    }

print("Performing ...")


def _score_dream(position, text):
    """Build one result row holding the TextBlob and VADER scores for *text*."""
    tb_scores = analyze_sentiment_textblob(text)
    vader_scores = analyze_sentiment_vader(text)
    return {
        # Position of the dream within dream_data['DreamMemory'] (0-based,
        # from enumerate), not the DataFrame's own index label.
        'index': position,
        'dream_text': text,
        'tb_polarity': tb_scores['polarity'],
        'tb_subjectivity': tb_scores['subjectivity'],
        'vader_compound': vader_scores['compound'],
        'vader_positive': vader_scores['positive'],
        'vader_negative': vader_scores['negative'],
        'vader_neutral': vader_scores['neutral']
    }


# Score every dream that has non-blank text, collecting one row per dream.
sentiment_results = []

for idx, dream in enumerate(dream_data['DreamMemory']):
    is_blank = pd.isna(dream) or not str(dream).strip()
    if not is_blank:
        sentiment_results.append(_score_dream(idx, dream))

    # Progress report every 100 rows; skipped (blank) rows count as processed.
    processed = idx + 1
    if processed % 100 == 0:
        print(f"Processed {processed}/{len(dream_data)} dreams...")

sentiment_df = pd.DataFrame(sentiment_results)
print(f"Sentiment analysis complete! Analyzed {len(sentiment_df)} dreams.")


Performing ...
Processed 100/2050 dreams...
Processed 200/2050 dreams...
Processed 300/2050 dreams...
Processed 400/2050 dreams...
Processed 500/2050 dreams...
Processed 600/2050 dreams...
Processed 700/2050 dreams...
Processed 800/2050 dreams...
Processed 900/2050 dreams...
Processed 1000/2050 dreams...
Processed 1100/2050 dreams...
Processed 1200/2050 dreams...
Processed 1300/2050 dreams...
Processed 1400/2050 dreams...
Processed 1500/2050 dreams...
Processed 1600/2050 dreams...
Processed 1700/2050 dreams...
Processed 1800/2050 dreams...
Processed 1900/2050 dreams...
Processed 2000/2050 dreams...
Sentiment analysis complete! Analyzed 2050 dreams.


In [11]:
# summary statistics
print("Sentiment Analysis Summary:")
print(f"Average TextBlob Polarity: {sentiment_df['tb_polarity'].mean():.3f}")
print(f"Average VADER Compound: {sentiment_df['vader_compound'].mean():.3f}")
print(f"Average Subjectivity: {sentiment_df['tb_subjectivity'].mean():.3f}")

# Bucket each dream into Positive / Neutral / Negative and count the buckets.
# This cell previously read `category_counts` without any visible cell
# defining it, so a clean top-to-bottom run stopped with a NameError.
# NOTE(review): the original categorisation rule is not visible; this uses
# the conventional VADER compound cut-offs (>= 0.05 positive, <= -0.05
# negative, otherwise neutral) — confirm it matches the intended rule.
compound = sentiment_df['vader_compound']
sentiment_labels = np.select(
    [compound >= 0.05, compound <= -0.05],
    ['Positive', 'Negative'],
    default='Neutral',
)
category_counts = pd.Series(sentiment_labels).value_counts()

print("Sentiment Categories:")
for category, count in category_counts.items():
    print(f"  {category}: {count} ({count/len(sentiment_df)*100:.1f}%)")

Sentiment Analysis Summary:
Average TextBlob Polarity: 0.033
Average VADER Compound: -0.319
Average Subjectivity: 0.399
Sentiment Categories:
  Negative: 1472 (71.8%)
  Neutral: 297 (14.5%)
  Positive: 281 (13.7%)
