<a href="https://colab.research.google.com/github/OfriA/AppliedProject52056/blob/main/data/DataExploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Loading the data
# The Excel workbook is read directly from the project's GitHub repository,
# so this cell needs network access.
url = "https://github.com/OfriA/AppliedProject52056/raw/refs/heads/main/data/ER_data.xlsx"
data = pd.read_excel(url)

# Preview the first rows as the cell output.
data.head()

In [None]:
# List every column name in the loaded frame (later cells select columns by name and by position).
print(data.columns)

In [None]:
# Summary statistics for the survey-progress and background columns.
summary_columns = [
    'Progress', 'Duration (in seconds)', 'sex', 'age', 'ed_level', 'ses',
    'DAS_1', 'custody', 'n_children', 'child_age', 'child_gender',
    'moved_Y/N', 'therapy', 'psych_drugs', 'health', 'partner_duty',
    'alarms_freq',
]
data[summary_columns].describe()

In [None]:
# Histogram of the `age` column using 20 bins.
data['age'].hist(bins=20)

In [None]:
# Histogram of the `n_children` column (default binning).
data['n_children'].hist()

In [None]:
import seaborn as sns

In [None]:
# Pairwise correlations between the background variables.
background_columns = [
    'sex', 'age', 'ed_level', 'ses', 'DAS_1', 'custody', 'n_children',
    'child_age', 'child_gender', 'moved_Y/N', 'therapy', 'psych_drugs',
    'health', 'partner_duty', 'alarms_freq',
]
descriptive_corr = data[background_columns].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(
    descriptive_corr,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Descriptive Statistics - Correlation Heatmap')
plt.show()

In [None]:
# War exposure items: SelfExposure_1..6 and OtherExposure_6..12.
self_exposure_features = [f'SelfExposure_{item}' for item in range(1, 7)]
other_exposure_features = [f'OtherExposure_{item}' for item in range(6, 13)]

# Compute the war exposure scores as row-wise sums of the items
# (DataFrame.sum skips missing values by default).
data['self_exposure_score'] = data[self_exposure_features].sum(axis=1)
data['other_exposure_score'] = data[other_exposure_features].sum(axis=1)
data['war_exposure_score'] = (
    data[['self_exposure_score', 'other_exposure_score']].sum(axis=1)
)

In [None]:
# CBCL items are taken by column position (columns 240..282 of the sheet).
# NOTE(review): positional selection breaks silently if the sheet layout changes — confirm.
CBCL_features = list(data.columns[240:283])

# Split the 43 items into the three sub-scales by position.
CBCL_D_features = CBCL_features[:13]
CBCL_A_features = CBCL_features[13:31]
CBCL_S_features = CBCL_features[31:]

# Sub-scale scores are row-wise sums of their items; the total is the sum of the three.
for subscale, items in (
    ('D', CBCL_D_features),
    ('A', CBCL_A_features),
    ('S', CBCL_S_features),
):
    data[f'CBCL_{subscale}_score'] = data[items].sum(axis=1)

data['CBCL_score'] = data[['CBCL_D_score', 'CBCL_A_score', 'CBCL_S_score']].sum(axis=1)

In [None]:
# (rows, columns) after adding the score columns and before any rows are dropped.
data.shape

In [None]:
# Character length of each free-text answer.
data['Event_length'] = data['Event'].str.len()
# A missing EER text counts as length 0 (Event_length keeps its NaNs).
data['EER_text_length'] = data['EER_text'].str.len().fillna(0)


In [None]:
# Mean and standard deviation of both text-length columns.
for label, column in (
    ("Event length", 'Event_length'),
    ("EER text length", 'EER_text_length'),
):
    lengths = data[column]
    print(f"{label} mean: {lengths.mean():.5}, {label} SD: {lengths.std():.5}")

In [None]:
# Event-description length, grouped by the coded `sex` column.
ax = sns.boxplot(x='sex', y='Event_length', data=data)
ax.set_title("Text Length by Sex")
plt.show()

In [None]:
# Omitting NA's: keep only rows with a complete set of exposure and CBCL items.
# `.copy()` makes `data` an independent frame, so the column assignments in later
# cells do not trigger pandas' SettingWithCopyWarning.
required_items = self_exposure_features + other_exposure_features + CBCL_features
data = data.dropna(subset=required_items).copy()

In [None]:
# (rows, columns) after dropping rows with missing exposure/CBCL items.
data.shape

In [None]:
# Cast child age to 16-bit integers so it is plotted as discrete categories below.
# NOTE(review): the NA filter above does not cover `child_age`; a NaN here would be
# cast to an arbitrary integer — confirm the column has no missing values.
data['child_age'] = np.int16(data['child_age'])

In [None]:
# Distribution of the total CBCL score for each child age.
ax = sns.boxplot(data=data, x='child_age', y='CBCL_score')



In [None]:
# One panel per child_gender code (1 and 2), sharing the y axis for comparability.
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

for panel, gender_code in zip(axes, (1, 2)):
    gender_rows = data[data['child_gender'] == gender_code]
    sns.boxplot(data=gender_rows, x='child_age', y='CBCL_score', ax=panel)
    panel.set_title(f'Child Gender = {gender_code}')

plt.tight_layout()
plt.show()

In [None]:
# Mean and standard deviation of the total CBCL score and of each sub-scale.
for label, column in (
    ("CBCL", 'CBCL_score'),
    ("CBCL D", 'CBCL_D_score'),
    ("CBCL A", 'CBCL_A_score'),
    ("CBCL S", 'CBCL_S_score'),
):
    scores = data[column]
    print(f"{label} mean: {scores.mean():.5}, {label} SD: {scores.std():.5}")


In [None]:
# Scatter-matrix of the exposure scores, CBCL scores and text lengths.
pairplot_columns = [
    'self_exposure_score', 'other_exposure_score', 'war_exposure_score',
    'CBCL_D_score', 'CBCL_A_score', 'CBCL_S_score', 'CBCL_score',
    'Event_length', 'EER_text_length',
]
sns.pairplot(data[pairplot_columns])

In [None]:
# Correlations between the exposure scores, CBCL scores and text lengths.
score_columns = [
    'self_exposure_score', 'other_exposure_score', 'war_exposure_score',
    'CBCL_D_score', 'CBCL_A_score', 'CBCL_S_score', 'CBCL_score',
    'Event_length', 'EER_text_length',
]
corr_matrix = data[score_columns].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# 1. Install Stanza (Run this once)
!pip install stanza

In [None]:
import stanza
import pandas as pd

# 2. Download the Hebrew model (Run this once)
# This downloads the pre-trained Stanza resources for Hebrew ('he').
stanza.download('he')

# 3. Initialize the pipeline
# Processors requested: 'tokenize' (split words), 'mwt' (multi-word token expansion),
# 'pos' (part-of-speech tags; `word.upos` is read below to drop punctuation) and
# 'lemma' (base forms).
# NOTE(review): use_gpu=True is requested unconditionally — confirm behaviour on CPU-only runtimes.
nlp = stanza.Pipeline(lang='he', processors='tokenize,mwt,pos,lemma', use_gpu=True)

# 4. Define the Lemmatization Function
def get_hebrew_lemmas(text):
    """Return the lemmas of `text`, excluding punctuation.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a non-blank string (None, NaN,
        numbers, whitespace-only strings) yields an empty list.

    Returns
    -------
    list of str
        One lemma per word of the Stanza document, in reading order.
        Words tagged `PUNCT` and words for which no lemma is available
        are skipped.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    # Process the text with the module-level Stanza pipeline.
    doc = nlp(text)

    # Stanza organizes text into sentences -> words.
    # 'word.lemma' is the base form (e.g., 'הלכנו' -> 'הלך').
    # A missing lemma (None) is dropped so downstream code that calls
    # len()/str methods on every lemma cannot crash on it.
    return [
        word.lemma
        for sent in doc.sentences
        for word in sent.words
        if word.upos != 'PUNCT' and word.lemma is not None
    ]

# --- APPLY TO THE DATAFRAME ---
# Lemmatize the free-text event description of every row of `data`.

# Name of the text column to lemmatize.
column_name = 'Event'

# Each row is passed through the Stanza pipeline separately, so this can be slow.
print("Processing text... this may take a while for large datasets.")
data['lemmas'] = data[column_name].apply(get_hebrew_lemmas)

# --- INSPECT RESULTS ---
print(data[[column_name, 'lemmas']].head())

# Example of what this achieves:
# Input: "הילדים בכו כשהלכנו לממ"ד"
# Output (Lemmas): ['ה', 'ילד', 'בכה', 'כש', 'הלך', 'ל', 'ה', 'ממ"ד']
# Note how 'בכו' became 'בכה' and 'כשהלכנו' was split and normalized to 'הלך'.

In [None]:
# 1. Install Sentence Transformers (Run this once)
!pip install sentence_transformers

In [None]:
# Keep only the rows that have an EER text, since both texts are embedded below.
data = data.dropna(subset=['EER_text'])

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, r2_score
import re


# --- STEP A: Encode Text to Vectors ---
# We use a multilingual sentence-embedding model that supports Hebrew.
# This replaces the need to manually average Word2Vec vectors.
print("Loading Embedding Model...")
model_name = 'intfloat/multilingual-e5-base'
embedding_model = SentenceTransformer(model_name)


# The original Hebrew text columns (not the lemmas) are embedded,
# so the model sees each answer in context.
text_col = ['Event', 'EER_text']
target_col = 'CBCL_score'

# E5 models are trained with an instruction prefix on every input; the model card
# recommends "query: " when the embeddings are used as features for a downstream model.
E5_PREFIX = "query: "

# Encode the text
print("Encoding text features... (This may take a moment)")
# The model converts each text entry into one fixed-length vector.
event_texts = [E5_PREFIX + text for text in data[text_col[0]].fillna('').astype(str)]
eer_texts = [E5_PREFIX + text for text in data[text_col[1]].fillna('').astype(str)]
X_Event_embeddings = embedding_model.encode(event_texts, show_progress_bar=True)
X_EER_embeddings = embedding_model.encode(eer_texts, show_progress_bar=True)

# One feature row per participant: [event embedding | EER embedding].
X_embeddings = np.hstack([X_Event_embeddings, X_EER_embeddings])

# def embed_hebrew(texts):
#     all_vecs = []
#     for t in texts:
#         parts = re.split(r"[.!?…]", t)
#         parts = [p.strip() for p in parts if len(p.strip()) > 5]

#         # E5 requires this prefix
#         parts = ["query: " + p for p in parts]

#         vecs = embedding_model.encode(
#             parts,
#             normalize_embeddings=True
#         )
#         all_vecs.append(vecs.mean(axis=0))
#     return np.vstack(all_vecs)


# X_embeddings = embed_hebrew(data["Event"].tolist())


# # Scale features, as Ridge is sensitive to feature scale
# from sklearn.preprocessing import normalize
# X_embeddings = normalize(X_embeddings)


# Prepare Target
y = data[target_col].values

# Split Data (80% Train, 20% Test).
# A fixed random_state makes the split (and every metric below) reproducible
# and matches the splits used by the other experiments in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y, test_size=0.2, random_state=42
)


In [None]:
# --- STEP B: Train Model with Hyperparameter Optimization ---

# Ridge Regression (an L2-regularized linear model) is tuned on the embedding features.
regressor = Ridge()

# Hyperparameters to tune
# alpha: Regularization strength (higher = stronger regularization)
param_grid = {
    "alpha": np.logspace(-4, 1, 25)
}


print("\n--- Tuning Hyperparameters (Grid Search) ---")
grid_search = GridSearchCV(
    regressor,
    param_grid,
    cv=10,
    scoring="r2"
)
grid_search.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on the full training set, so no extra fit is needed.
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# --- STEP C: Evaluate against Naive Predictor ---

# 1. Baseline: always predict the mean of the training targets.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)

# 2. Test-set predictions of the tuned model and of the baseline.
y_pred_model, y_pred_naive = best_model.predict(X_test), dummy_regr.predict(X_test)

# 3. RMSE and R2 for both predictors.
rmse_model, rmse_naive = (
    np.sqrt(mean_squared_error(y_test, predictions))
    for predictions in (y_pred_model, y_pred_naive)
)
r2_model, r2_naive = (
    r2_score(y_test, predictions)
    for predictions in (y_pred_model, y_pred_naive)
)

# 4. Results table.
banner = "=" * 40
table_rule = "-" * 56
print("\n" + banner)
print("      MODEL EVALUATION RESULTS      ")
print(banner)
print(f"{'Metric':<20} | {'Naive Predictor':<15} | {'Tuned Linear Model':<15}")
print(table_rule)
print(f"{'RMSE (Lower is better)':<20} | {rmse_naive:.4f}          | {rmse_model:.4f}")
print(f"{'R2 (Higher is better)':<20} | {r2_naive:.4f}          | {r2_model:.4f}")
print(table_rule)

print(
    "\nSUCCESS: Your text-based model outperforms the naive guess!"
    if rmse_model < rmse_naive
    else "\nRESULT: The model did not beat the naive guess. Consider different features or models."
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the same train/test split; `fit` returns the fitted estimator.
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=5,
    random_state=42,
).fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

In [None]:
# 3. Calculate Metrics for the random forest (the naive baseline comes from the previous evaluation cell).
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

r2_rf = r2_score(y_test, y_pred_rf)

# 4. Display Results
print("\n" + "="*40)
print("      MODEL EVALUATION RESULTS      ")
print("="*40)
print(f"{'Metric':<20} | {'Naive Predictor':<15} | {'Random Forest Model':<15}")
print("-" * 56)
print(f"{'RMSE (Lower is better)':<20} | {rmse_naive:.4f}          | {rmse_rf:.4f}")
print(f"{'R2 (Higher is better)':<20} | {r2_naive:.4f}          | {r2_rf:.4f}")
print("-" * 56)

# The verdict must compare the random forest's RMSE (not the Ridge model's)
# with the baseline, otherwise it contradicts the table above.
if rmse_rf < rmse_naive:
    print("\nSUCCESS: Your text-based model outperforms the naive guess!")
else:
    print("\nRESULT: The model did not beat the naive guess. Consider different features or models.")

In [None]:
# --- STEP A: Encode Text to Vectors ---
# We use a multilingual model that supports Hebrew.
# This replaces the need to manually average Word2Vec vectors.
print("Loading Embedding Model...")
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
embedding_model = SentenceTransformer(model_name)

# This experiment embeds the lemmatized text (the 'lemmas' column produced by
# the Stanza step) instead of the raw Hebrew text.
text_col = 'lemmas'
target_col = 'CBCL_score'

# `encode` expects one string per document, but each 'lemmas' entry is a list of
# tokens. Join every list into a single space-separated string first, skipping
# any entry that is not a string (e.g. a missing lemma).
lemma_texts = [
    ' '.join(token for token in tokens if isinstance(token, str))
    if isinstance(tokens, (list, tuple)) else ''
    for tokens in data[text_col]
]

# Encode the text
print("Encoding text features... (This may take a moment)")
# The model converts each text entry into one fixed-length vector.
X_embeddings = embedding_model.encode(lemma_texts, show_progress_bar=True)

# Prepare Target
y = data[target_col].values

# Split Data (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

# --- STEP B: Train Model with Hyperparameter Optimization ---

# Ridge Regression (an L2-regularized linear model) is tuned on the embedding features.
regressor = Ridge()

# alpha: Regularization strength (higher = stronger regularization)
param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0, 200.0]}

print("\n--- Tuning Hyperparameters (Grid Search) ---")
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Explicit refit of the selected model on the full training set
# (GridSearchCV has already done this once).
best_model.fit(X_train, y_train)

# --- STEP C: Evaluate against Naive Predictor ---

# 1. Baseline: always predict the mean of the training targets.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)

# 2. Test-set predictions of the tuned model and of the baseline.
y_pred_model, y_pred_naive = best_model.predict(X_test), dummy_regr.predict(X_test)

# 3. RMSE and R2 for both predictors.
rmse_model, rmse_naive = (
    np.sqrt(mean_squared_error(y_test, predictions))
    for predictions in (y_pred_model, y_pred_naive)
)
r2_model, r2_naive = (
    r2_score(y_test, predictions)
    for predictions in (y_pred_model, y_pred_naive)
)

# 4. Results table.
banner = "=" * 40
table_rule = "-" * 56
print("\n" + banner)
print("      MODEL EVALUATION RESULTS      ")
print(banner)
print(f"{'Metric':<20} | {'Naive Predictor':<15} | {'Tuned Linear Model':<15}")
print(table_rule)
print(f"{'RMSE (Lower is better)':<20} | {rmse_naive:.4f}          | {rmse_model:.4f}")
print(f"{'R2 (Higher is better)':<20} | {r2_naive:.4f}          | {r2_model:.4f}")
print(table_rule)

print(
    "\nSUCCESS: Your text-based model outperforms the naive guess!"
    if rmse_model < rmse_naive
    else "\nRESULT: The model did not beat the naive guess. Consider different features or models."
)

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- CONFIGURATION ---
# The two free-text columns of this dataset (the template placeholders
# 'event_description' / 'letter_to_self' do not exist in `data`).
TEXT_COL_1 = 'Event'
TEXT_COL_2 = 'EER_text'
TARGET_COL = 'CBCL_score'

# 1. Load Model
print("Loading Embedding Model...")
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
embedding_model = SentenceTransformer(model_name)

# 2. Encode Column 1
print(f"Encoding {TEXT_COL_1}...")
# Fill NaNs with empty string so every entry handed to the encoder is a string
texts_1 = data[TEXT_COL_1].fillna('').tolist()
embeddings_1 = embedding_model.encode(texts_1, show_progress_bar=True)

# 3. Encode Column 2
print(f"Encoding {TEXT_COL_2}...")
texts_2 = data[TEXT_COL_2].fillna('').tolist()
embeddings_2 = embedding_model.encode(texts_2, show_progress_bar=True)

# 4. Concatenate Vectors
# Each participant is represented by the two embeddings placed side by side.
print("combining vectors...")
X_combined = np.hstack((embeddings_1, embeddings_2))

# Prepare Target
y = data[TARGET_COL].values

# 5. Split Data
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# --- TRAIN & EVALUATE (Same as before) ---

regressor = Ridge()
param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0, 200.0, 500.0]} # Added higher alpha for more features

print("\n--- Tuning Hyperparameters ---")
grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Baseline: always predict the mean of the training targets
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)

# Predict
y_pred = best_model.predict(X_test)
y_naive = dummy.predict(X_test)

# Metrics
rmse_val = np.sqrt(mean_squared_error(y_test, y_pred))
rmse_naive = np.sqrt(mean_squared_error(y_test, y_naive))
r2_val = r2_score(y_test, y_pred)
# The baseline's R2 on the *test* set is computed rather than assumed: it is only
# exactly 0 when the train and test target means coincide.
r2_naive = r2_score(y_test, y_naive)

print("\n" + "="*40)
print("      EVALUATION RESULTS (Dual Vectors)      ")
print("="*40)
print(f"{'Metric':<20} | {'Naive':<15} | {'Dual-Vector Model':<15}")
print("-" * 56)
print(f"{'RMSE':<20} | {rmse_naive:.4f}          | {rmse_val:.4f}")
print(f"{'R2':<20} | {r2_naive:.4f}          | {r2_val:.4f}")
print("-" * 56)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the dual-vector features, with tree depth capped at 70.
# NOTE(review): a cap of 70 is unlikely to constrain the trees much — confirm it is intended.
rf_model = RandomForestRegressor(
    n_estimators=120,
    max_depth=70,
    random_state=42,
).fit(X_train, y_train)

# Overwrites the Ridge predictions held in `y_pred`; the next cell scores these.
y_pred = rf_model.predict(X_test)

In [None]:

# Metrics
# `y_pred` holds the random-forest predictions from the previous cell.
# The baseline is scored with `y_naive`, the mean-predictor output fitted on this
# experiment's own train/test split, instead of leftovers from an earlier experiment.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred))
rmse_naive = np.sqrt(mean_squared_error(y_test, y_naive))
r2_rf = r2_score(y_test, y_pred)
r2_naive = r2_score(y_test, y_naive)

# 4. Display Results
print("\n" + "="*40)
print("      MODEL EVALUATION RESULTS      ")
print("="*40)
print(f"{'Metric':<20} | {'Naive Predictor':<15} | {'Random Forest Model':<15}")
print("-" * 56)
print(f"{'RMSE (Lower is better)':<20} | {rmse_naive:.4f}          | {rmse_rf:.4f}")
print(f"{'R2 (Higher is better)':<20} | {r2_naive:.4f}          | {r2_rf:.4f}")
print("-" * 56)

---

---

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud

# 1. Define Hebrew Stopwords & Prefixes to Ignore
# This set holds single-letter prefixes (which Stanza detaches from the word)
# and common function words that carry no content for the analysis.
ignore_words = {
    # Prefixes (often separated by Stanza)
    'ה', 'ו', 'ב', 'ל', 'ש', 'מ', 'כ', 'כש',
    # Pronouns & Function words
    'אני', 'את', 'אתה', 'אנחנו', 'הוא', 'היא', 'הם', 'הן',
    'זה', 'זאת', 'אלו', 'של', 'על', 'עם', 'כל', 'רק', 'אבל',
    'או', 'אם', 'גם', 'לא', 'כן', 'כי', 'אז', 'יותר', 'פחות',
    'היה', 'הייתה', 'היו', 'יש', 'אין', 'לי', 'לו', 'לה', 'לנו'
}

def filter_prefixes(lemma_list):
    """
    Filters out prefixes, single letters, and stopwords.

    Parameters
    ----------
    lemma_list : list
        Lemmas of one response. Any non-list value yields an empty list.

    Returns
    -------
    list of str
        The lemmas that are strings, are not in `ignore_words`, are longer
        than one character, and are purely alphabetic (`str.isalpha`).
        Entries that are not strings (e.g. a missing lemma stored as None)
        are dropped instead of raising.
    """
    if not isinstance(lemma_list, list):
        return []

    # Note: `isalpha` also rejects tokens containing quotes or digits,
    # so acronyms written with a gershayim (e.g. ממ"ד) are removed too.
    return [word for word in lemma_list
            if isinstance(word, str)
            and word not in ignore_words
            and len(word) > 1
            and word.isalpha()]

# 2. Keep only content words for every response
# ('lemmas' is the column produced by the Stanza step).
data['clean_content'] = data['lemmas'].map(filter_prefixes)

# 3. Pool the content words of all responses and count them
all_words = []
for content_words in data['clean_content']:
    all_words.extend(content_words)
word_counts = Counter(all_words)

# --- VISUALIZATION 1: Top 20 Most Frequent Content Words ---

# The 20 most common (word, count) pairs, split into parallel lists
top_words = word_counts.most_common(20)
words = [token for token, _ in top_words]
counts = [frequency for _, frequency in top_words]

# Matplotlib draws text left-to-right, so the Hebrew strings are reversed for display
words_reversed = ["".join(reversed(token)) for token in words]

plt.figure(figsize=(12, 6))
sns.barplot(x=counts, y=words_reversed, palette='viridis')
plt.title('Top 20 Frequent Content Words (Prefixes Removed)')
plt.xlabel('Frequency')
plt.show()

# --- VISUALIZATION 2: Word Cloud ---

# Prepare text for WordCloud: every word is reversed individually (for
# left-to-right rendering) and the words are joined with single spaces.
text_for_cloud = " ".join([w[::-1] for w in all_words])

# Generate the cloud from the pooled text.
# NOTE(review): font_path=None falls back to the library's default font —
# confirm that font actually contains Hebrew glyphs, otherwise pass a Hebrew-capable font.
wc = WordCloud(width=800, height=400, background_color='white', font_path=None).generate(text_for_cloud)

# Show the rendered cloud as an image without axes.
plt.figure(figsize=(15, 7))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud of Semantic Content")
plt.show()

# --- VISUALIZATION 3: Distribution of Description Length ---
# Number of content words (after filtering) written by each participant.

data['content_length'] = data['clean_content'].map(len)

plt.figure(figsize=(10, 5))
length_ax = sns.histplot(data['content_length'], bins=30, kde=True, color='purple')
length_ax.set_title('Distribution of Content Word Count per Response')
length_ax.set_xlabel('Number of Meaningful Words')
length_ax.set_ylabel('Number of Participants')
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# 1. Define specific words to investigate (lemmatized forms).
# The English glosses and the category tags in parentheses are the author's annotations.
# NOTE(review): 'מצב' and 'זמن'-style entries were previously glossed as "Hit" and "Cling";
# the Hebrew words actually listed mean "situation" and "time" — confirm which words were intended.
target_words = [
    'פחד',    # Fear (Internalizing)
    'בכה',    # Cry (Internalizing)
    'כעס',    # Anger (Externalizing)
    'צעק',    # Shout (Externalizing)
    'מצב',  # Situation/state (was labelled "Hit (Externalizing)" — verify intended word)
    'זמן',   # Time (was labelled "Cling (Internalizing/Anxiety)" — verify intended word)
    'לבד',    # Alone (Isolation)
    'קשה'     # Hard/Difficult (General Distress)
]

# 2. Per-participant features for the target words only.
# 'clean_content' holds each participant's filtered lemma list.

analysis_data = []

for _, participant in data.iterrows():
    tokens = participant['clean_content']

    # Start every record with the outcome, then add two columns per target word:
    # how often it occurs and whether it occurs at all.
    record = {'CBCL_score': participant['CBCL_score']}
    for target in target_words:
        occurrences = tokens.count(target)
        record[f'count_{target}'] = occurrences
        record[f'has_{target}'] = "No" if occurrences <= 0 else "Yes"

    analysis_data.append(record)

df_analysis = pd.DataFrame(analysis_data)

# --- VISUALIZATION 1: IMPACT OF APPEARANCE (Box Plots) ---
# Question: "Is the CBCL score higher when the word appears?"

plt.figure(figsize=(15, 10))

presence_palette = {'No': 'skyblue', 'Yes': 'salmon'}

# One panel per target word on a 2 x 4 grid (sized for the eight target words)
for position, word in enumerate(target_words, start=1):
    panel = plt.subplot(2, 4, position)

    sns.boxplot(data=df_analysis, x=f'has_{word}', y='CBCL_score',
                order=['No', 'Yes'], palette=presence_palette, ax=panel)

    panel.set_title(f'Word: {word[::-1]}')  # Hebrew reversed for display
    panel.set_xlabel('')
    # Only the left-most panel of each row carries the y label
    panel.set_ylabel('CBCL Score' if (position - 1) % 4 == 0 else '')

plt.suptitle('Impact of Word Appearance on CBCL Score', fontsize=16)
plt.tight_layout()
plt.show()

# --- VISUALIZATION 2: IMPACT OF INCREASING FREQUENCY (Line/Bar Plots) ---
# Question: "Does the score go up as the word is used more?"

plt.figure(figsize=(15, 6))

# Only two words are drawn to keep the figure readable: 'fear' (פחד) and 'anger' (כעס).
focus_words = ['פחד', 'כעס']

# Intended x-axis order of the frequency groups.
# NOTE(review): this list is never passed to the plot; the axis order comes from seaborn.
order = ['0', '1', '2', '3+']

for word in focus_words:
    # Bucket the raw counts as '0', '1', '2' and '3+' so rare high counts
    # are pooled into a single group.
    group_column = f'freq_group_{word}'
    df_analysis[group_column] = df_analysis[f'count_{word}'].map(
        lambda n: str(n) if n < 3 else '3+'
    )

    # Mean CBCL score per bucket, with standard-error bars
    sns.lineplot(data=df_analysis, x=group_column, y='CBCL_score',
                 marker='o', label=word[::-1], errorbar='se')

plt.title('Mean CBCL Score vs. Word Frequency')
plt.xlabel('Number of times word appears in text')
plt.ylabel('Mean CBCL Score')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

# --- STATISTICAL CHECK (T-Test) ---
# Welch's t-test (unequal variances) between the "Yes" and "No" groups of each word
from scipy.stats import ttest_ind

print("--- Statistical Significance (T-Test: Word Present vs. Absent) ---")
for word in target_words:
    presence = df_analysis[f'has_{word}']
    group_yes = df_analysis.loc[presence == 'Yes', 'CBCL_score']
    group_no = df_analysis.loc[presence == 'No', 'CBCL_score']

    # A word that is used by everyone or by no one cannot be tested
    if len(group_yes) == 0 or len(group_no) == 0:
        continue

    t_stat, p_val = ttest_ind(group_yes, group_no, equal_var=False)
    sig = "**" if p_val < 0.05 else ""
    print(f"Word '{word[::-1]}': p-value = {p_val:.4f} {sig}")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import ttest_ind

# 1. The 20 most frequent content words
# ('clean_content' must already exist, i.e. the filter_prefixes step has been run).
all_lemmas = []
for content_words in data['clean_content']:
    all_lemmas.extend(content_words)

top_20_words = [word for word, _ in Counter(all_lemmas).most_common(20)]

# 2. Present-vs-absent statistics for each of the top words
analysis_data = []
stats_list = []

print(f"{'Word':<15} | {'Mean Score (Present)':<20} | {'Mean Score (Absent)':<20} | {'P-Value':<10}")
print("-" * 80)

for word in top_20_words:
    # Boolean mask: does this response contain the word?
    has_word = data['clean_content'].map(lambda tokens: word in tokens)

    scores_present = data.loc[has_word, 'CBCL_score']
    scores_absent = data.loc[~has_word, 'CBCL_score']

    mean_present, mean_absent = scores_present.mean(), scores_absent.mean()

    # Welch's t-test, only when both groups have more than 5 responses;
    # otherwise the word is reported as not significant (p = 1.0)
    if min(len(scores_present), len(scores_absent)) > 5:
        p_val = ttest_ind(scores_present, scores_absent, equal_var=False).pvalue
    else:
        p_val = 1.0

    # Positive diff = the word goes with a HIGHER mean score
    diff = mean_present - mean_absent
    stats_list.append(dict(
        word=word,
        mean_present=mean_present,
        mean_absent=mean_absent,
        diff=diff,
        p_val=p_val,
    ))

    # One table row per word ('**' marks p < 0.05)
    sig_mark = "**" if p_val < 0.05 else ""
    print(f"{word[::-1]:<15} | {mean_present:<20.2f} | {mean_absent:<20.2f} | {p_val:.4f} {sig_mark}")

# 3. Visualization: Difference in Means
stats_df = pd.DataFrame(stats_list).sort_values('diff', ascending=False)  # largest positive diff on top

plt.figure(figsize=(12, 8))

# Bar colours: salmon = significant and higher, lightgreen = significant and lower,
# lightgray = not significant (p >= 0.05)
colors = [
    ('salmon' if row['diff'] > 0 else 'lightgreen') if row['p_val'] < 0.05 else 'lightgray'
    for _, row in stats_df.iterrows()
]

# Hebrew labels are reversed for left-to-right rendering
labels_reversed = [label[::-1] for label in stats_df['word']]

sns.barplot(data=stats_df, x='diff', y=labels_reversed, palette=colors)

plt.axvline(0, color='black', linestyle='--')
plt.title('Impact of Top 20 Common Words on CBCL Score\n(Difference in Mean Score: Present vs. Absent)', fontsize=14)
plt.xlabel('Difference in CBCL Score Points (Positive = Risk, Negative = Protective)')
plt.ylabel('Word')

# Hand-built legend explaining the three bar colours
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=face, label=text)
    for face, text in (
        ('salmon', 'Significantly Higher Score (Risk Factor)'),
        ('lightgreen', 'Significantly Lower Score (Resilience Factor)'),
        ('lightgray', 'Not Significant'),
    )
]
plt.legend(handles=legend_elements, loc='lower right')

plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# --- CONFIGURATION ---
TARGET_COL = 'CBCL_score'  # outcome column correlated against word counts
MIN_DOC_FREQ = 5           # Ignore words appearing in fewer than 5 responses

# 1. CountVectorizer needs one string per document, so each filtered
# lemma list is joined back into a space-separated string.
corpus = data['clean_content'].map(' '.join)

# 2. Bag of words restricted to words found in at least MIN_DOC_FREQ documents
vectorizer = CountVectorizer(min_df=MIN_DOC_FREQ)
X = vectorizer.fit_transform(corpus)

# 3. Word-count table sharing the index of `data`, so the target aligns row by row
word_counts_df = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=data.index,
)
word_counts_df['__TARGET__'] = data[TARGET_COL]

# Correlation of every word's count with the target
correlations = word_counts_df.corr()['__TARGET__'].drop('__TARGET__')

# 4. The ten most positive and the ten most negative correlations
top_positive = correlations.sort_values(ascending=False).iloc[:10]
top_negative = correlations.sort_values().iloc[:10]

print("--- Top Words Associated with HIGH CBCL Scores (Risk?) ---")
print(top_positive)
print("\n--- Top Words Associated with LOW CBCL Scores (Resilience?) ---")
print(top_negative)

# --- VISUALIZATION: Correlation Bar Plot ---

# Positive and negative extremes stacked into one series for a single plot
top_corr = pd.concat([top_positive, top_negative])

plt.figure(figsize=(10, 8))

# Hebrew labels are reversed for left-to-right rendering
labels_reversed = ["".join(reversed(label)) for label in top_corr.index]

corr_ax = sns.barplot(x=top_corr.values, y=labels_reversed, palette='coolwarm')
corr_ax.set_title(f'Correlation between Word Usage and {TARGET_COL}', fontsize=14)
corr_ax.set_xlabel('Pearson Correlation Coefficient')
corr_ax.axvline(0, color='black', linestyle='--')
plt.show()

# --- VISUALIZATION: Specific Word Impact (Box Plot) ---
# Check whether the single most positively correlated word separates the score distributions.

if not top_positive.empty:
    risk_word = top_positive.index[0]  # word with the highest correlation
    displayed_word = risk_word[::-1]   # reversed for left-to-right rendering

    # True when the participant's content words include the word
    data['has_risk_word'] = data['clean_content'].map(lambda tokens: risk_word in tokens)

    plt.figure(figsize=(8, 6))
    sns.boxplot(x='has_risk_word', y=TARGET_COL, data=data, palette='Set2')
    plt.title(f'Effect of using the word "{displayed_word}" on {TARGET_COL}')
    plt.xlabel(f'Contains word "{displayed_word}"?')
    plt.ylabel('CBCL Score')
    plt.show()

In [None]:
# Clean and tokenize every event description into a list of words.
# NOTE(review): `clean_text_basic` and `tokenize_hebrew` are not defined anywhere in the
# visible notebook — this cell fails on a fresh kernel unless they are defined or imported first.
data['Event_words_lst'] = data['Event'].apply(clean_text_basic).apply(tokenize_hebrew)


# Display the resulting column.
data['Event_words_lst']

In [None]:
# Most frequent words of each individual event description.
# NOTE(review): `get_top_n_words` is not defined anywhere in the visible notebook; the next
# cell unpacks each result as (word, count) pairs — confirm against the helper's definition.
data['Event_top_10_words'] = data['Event_words_lst'].apply(get_top_n_words)

# Display the resulting column.
data['Event_top_10_words']

In [None]:
from collections import Counter
from itertools import chain

# Pool the words from every row's top-words list.
# Each row is a list of (word, count) tuples; only the word is kept.
all_top_words = [
    word
    for word, _ in chain.from_iterable(data['Event_top_10_words'])
]

# How many rows list each word among their top words
word_counter = Counter(all_top_words)

# The 10 words that appear in the most per-row lists
top10_words_overall = word_counter.most_common(10)

print("Top 10 most common words across all events:")
for word, freq in top10_words_overall:
    print(f"{word}: {freq}")

In [None]:
# Concatenate every event description into one corpus string.
# The texts are joined with a space so the last word of one description is not
# fused with the first word of the next, and missing descriptions are skipped
# instead of raising on `str + float`.
all_events_txt = ' '.join(data['Event'].dropna().astype(str))

In [None]:
# Ten most frequent words over the whole concatenated corpus.
# NOTE(review): `get_top_n_words` and `tokenize_hebrew` are not defined in the visible notebook.
get_top_n_words(tokenize_hebrew(all_events_txt), n=10)

In [None]:

# Hand-picked list of frequent event words (13 entries)
top_words = ["ילדים","שלי","בית","אזעקה","חוסר","ילד","חמה","הבן","הבת","לי","שאני","מצב","שלא"]

# One 0/1 indicator column per word: 1 if the word is in the row's Event_words_lst
for word in top_words:
    data[f'word_{word}'] = data['Event_words_lst'].map(lambda tokens: int(word in tokens))

# Mean CBCL score of the events with vs. without each word
cbcl_scores = data['CBCL_score']
comparison = [
    {
        'word': word,
        'CBCL_with_word': cbcl_scores[data[f'word_{word}'] == 1].mean(),
        'CBCL_without_word': cbcl_scores[data[f'word_{word}'] == 0].mean(),
    }
    for word in top_words
]

comparison_df = pd.DataFrame(comparison)

# Long format: one row per (word, condition) for the grouped bar plot
comparison_melted = comparison_df.melt(
    id_vars='word',
    value_vars=['CBCL_with_word', 'CBCL_without_word'],
    var_name='Condition',
    value_name='CBCL_score',
)

# Readable legend labels
condition_labels = {
    'CBCL_with_word': 'With Word',
    'CBCL_without_word': 'Without Word',
}
comparison_melted['Condition'] = comparison_melted['Condition'].map(condition_labels)

# Plot
plt.figure(figsize=(10,6))
cbcl_ax = sns.barplot(data=comparison_melted, x='CBCL_score', y='word', hue='Condition')
cbcl_ax.set_xlabel('Mean CBCL Score')
cbcl_ax.set_ylabel('Word')
cbcl_ax.set_title('CBCL Scores by Presence of Top 10 Event Words')
cbcl_ax.legend(title='')
plt.tight_layout()
plt.show()


In [None]:

# One 0/1 indicator column per word: 1 if the word is in the row's Event_words_lst
# (recomputed here so the cell does not depend on the CBCL comparison cell)
for word in top_words:
    data[f'word_{word}'] = data['Event_words_lst'].map(lambda tokens: int(word in tokens))

# Mean war_exposure_score of the events with vs. without each word
exposure_scores = data['war_exposure_score']
comparison = [
    {
        'word': word,
        'WarExposure_with_word': exposure_scores[data[f'word_{word}'] == 1].mean(),
        'WarExposure_without_word': exposure_scores[data[f'word_{word}'] == 0].mean(),
    }
    for word in top_words
]

comparison_df = pd.DataFrame(comparison)

# Long format: one row per (word, condition) for the grouped bar plot
comparison_melted = comparison_df.melt(
    id_vars='word',
    value_vars=['WarExposure_with_word', 'WarExposure_without_word'],
    var_name='Condition',
    value_name='WarExposure_score',
)

# Readable legend labels
condition_labels = {
    'WarExposure_with_word': 'With Word',
    'WarExposure_without_word': 'Without Word',
}
comparison_melted['Condition'] = comparison_melted['Condition'].map(condition_labels)

# Plot
plt.figure(figsize=(10,6))
exposure_ax = sns.barplot(data=comparison_melted, x='WarExposure_score', y='word', hue='Condition')
exposure_ax.set_xlabel('Mean War Exposure Score')
exposure_ax.set_ylabel('Word')
exposure_ax.set_title('War Exposure Scores by Presence of Top 10 Event Words')
exposure_ax.legend(title='')
plt.tight_layout()
plt.show()



---


---