In [None]:
pip install twython

In [None]:
pip install sentence-transformers

In [None]:
pip install vaderSentiment

# Feature Extraction and Selection

## Data Preprocessing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

In [None]:
# Set the local folder path
folder_path = r'/kaggle/input/datasets-transcripts/VetTrain_Transcripts'

# Function to extract the numerical part from the filename
def extract_pid(filename):
    """Return the participant ID prefix of a transcript filename.

    The extension is removed and everything before the first underscore is
    returned, e.g. "P001_transcript.csv" -> "P001". A name without an
    underscore yields the whole extension-less name.
    """
    stem, _extension = os.path.splitext(filename)
    pid, _separator, _remainder = stem.partition('_')
    return pid

# Get all CSV files in the folder and sort them numerically by filename
all_files = [
    os.path.join(folder_path, csv_name)
    for csv_name in sorted(
        (entry for entry in os.listdir(folder_path) if entry.endswith('.csv')),
        # Order by the integer that follows the first character of the PID ("P001" -> 1)
        key=lambda entry: int(extract_pid(entry)[1:]),
    )
]

In [None]:
import re

def clean_text(text):
    """Normalise a transcript string for feature extraction.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, collapses whitespace runs into a single space
    and trims both ends.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return re.sub(r'\s+', ' ', without_punctuation).strip()

In [None]:
# Accumulates one record per (participant, question) across all transcript files.
# Each record holds the full dialogue for the question and the subset of it
# collected into `veteran_transcript` (rows typed 'BCV' or starting with 'A').
combined_data = []

# Process each file
for file_path in all_files:
    # Extract PID from filename
    pid = extract_pid(os.path.basename(file_path))

    # Read the CSV file
    # NOTE(review): the code below assumes every 'Type' and 'Transcript' value is a
    # string (it calls .startswith and " ".join on them) - confirm there are no NaNs.
    df = pd.read_csv(file_path)
    df = df[df['Type'] != 'IRR']  # Filter irrelevant rows

    # Per-file state: the dialogue of the question currently being collected
    current_dialogue = []
    veteran_dialogue = []
    current_question_id = None
    # QIDs are assigned sequentially (Q1, Q2, ...) in order of appearance within
    # the file; they are not copied from the 'Type' value of the question row.
    qid_counter = 1  # Start QID counter for each file

    # Extract question pairs with PID and QID
    for _, row in df.iterrows():
        if row['Type'].startswith('Q'):
            question_id = row['Type']
            if current_question_id is None:
                # First question of the file: start fresh. Anything accumulated
                # from rows before this point is discarded by the reassignment.
                current_question_id = question_id
                current_dialogue = [row['Transcript']]
                veteran_dialogue = []
            elif question_id != current_question_id:
                # A different question type begins: flush the previous question
                # and start collecting the new one.
                combined_data.append({
                    'PID': pid,
                    'QID': f"Q{qid_counter}",
                    'Combined_Transcript': " ".join(current_dialogue),
                    'veteran_transcript':" ".join(veteran_dialogue)
                })
                qid_counter += 1
                current_dialogue = [row['Transcript']]
                veteran_dialogue = []
                current_question_id = question_id
            else:
                # Same question type repeated: treat it as a continuation.
                current_dialogue.append(row['Transcript'])
        else:
            # Every non-question row belongs to the current dialogue ...
            current_dialogue.append(row['Transcript'])
            # ... but only 'BCV' rows and rows whose Type starts with 'A' are
            # also collected into the veteran-only transcript.
            if row['Type']=='BCV' or row['Type'].startswith('A'):
              veteran_dialogue.append(row['Transcript'])


    # Add the last dialogue for the file (the loop only flushes when the next
    # question starts). This also fires for a file that had no 'Q' rows at all,
    # as long as at least one row was accumulated.
    if current_dialogue:
        combined_data.append({
            'PID': pid,
            'QID': f"Q{qid_counter}",
            'Combined_Transcript': " ".join(current_dialogue),
            'veteran_transcript':" ".join(veteran_dialogue)
        })

In [None]:
# Convert to a DataFrame
transcripts_df = pd.DataFrame(combined_data)

In [None]:
# Load the behavioral annotation codes
behavior_file = r'/kaggle/input/ba-codes-dataset/Behavioral Annotation Codes.csv'
df_behavior = pd.read_csv(behavior_file)

# Merge behavioral codes
transcripts_df = transcripts_df.merge(df_behavior, on=['PID', 'QID'], how='left')

In [None]:
transcripts_df['Cleaned_Transcript'] = transcripts_df['Combined_Transcript'].apply(clean_text)
transcripts_df['Cleaned_veteran_transcript'] = transcripts_df['veteran_transcript'].apply(clean_text)

In [None]:
# Keep only the participant ID and the cleaned veteran text. Every other column
# (QID, the raw and cleaned combined transcripts, and whatever was merged in from
# the behavioral annotation file) is dropped from here on.
transcripts_df = transcripts_df[['PID', 'Cleaned_veteran_transcript']]
transcripts_df

## Extracting features and adding them to the main dataset

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import pos_tag, word_tokenize
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
def add_tfidf_features(df, text_column, max_features=500):
    """
    Appends one TF-IDF weight column per vocabulary term to a copy of the dataframe.

    Parameters:
        df (pd.DataFrame): Input dataframe containing the text data.
        text_column (str): Name of the column holding the documents to vectorize.
        max_features (int): Upper bound on the number of TF-IDF terms kept (default=500).

    Returns:
        pd.DataFrame: New dataframe made of the original columns followed by the
        TF-IDF columns (named after the terms), sharing the index of `df`.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)

    # The vectorizer is fitted on the very same rows it transforms
    dense_weights = vectorizer.fit_transform(df[text_column]).toarray()

    tfidf_frame = pd.DataFrame(
        dense_weights,
        index=df.index,  # align row-for-row with the input for the concat below
        columns=vectorizer.get_feature_names_out(),
    )

    return pd.concat([df, tfidf_frame], axis=1)

In [None]:
# Example usage
transcripts_df = add_tfidf_features(transcripts_df, text_column="Cleaned_veteran_transcript", max_features=2000)
transcripts_df.head()

In [None]:
def add_pos_tags(df, text_column):
    """
    Adds a 'POS_Tags' column holding, for each row, a dict of POS tag -> count.

    The input dataframe is modified in place and also returned.

    Parameters:
        df (pd.DataFrame): Input dataframe containing text data.
        text_column (str): Name of the column containing text data.

    Returns:
        pd.DataFrame: The same dataframe with the 'POS_Tags' column added.
    """
    from collections import Counter  # local import keeps this cell self-contained

    def pos_tags(text):
        tagged = pos_tag(word_tokenize(text))
        # Count every tag in a single pass. Counter keeps first-seen order, so the
        # key order (and the column order after json_normalize) is the same on
        # every run instead of following hash-randomised set iteration.
        return dict(Counter(tag for _, tag in tagged))

    df['POS_Tags'] = df[text_column].apply(pos_tags)
    return df

In [None]:
transcripts_df = add_pos_tags(transcripts_df, text_column="Cleaned_veteran_transcript")
transcripts_df.head(10)

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def add_sentiment_scores(df, text_column):
    """
    Scores every text with SentimentIntensityAnalyzer and stores the four
    polarity values in their own columns.

    The input dataframe is modified in place and also returned.

    Parameters:
        df (pd.DataFrame): Input dataframe containing text data.
        text_column (str): Name of the column containing text data.

    Returns:
        pd.DataFrame: The same dataframe with 'Sentiment_Neg', 'Sentiment_Neu',
        'Sentiment_Pos' and 'Sentiment_Compound' columns added.
    """
    analyzer = SentimentIntensityAnalyzer()
    polarity = df[text_column].apply(analyzer.polarity_scores)

    # (output column, key in the polarity dict), in the order the columns are added
    column_keys = (
        ('Sentiment_Neg', 'neg'),
        ('Sentiment_Neu', 'neu'),
        ('Sentiment_Pos', 'pos'),
        ('Sentiment_Compound', 'compound'),
    )
    for column_name, score_key in column_keys:
        # Bind score_key as a default so each lambda reads its own key
        df[column_name] = polarity.apply(lambda scores, score_key=score_key: scores[score_key])

    return df

In [None]:
# Adding sentiment scores as separate columns to the dataframe
transcripts_df = add_sentiment_scores(transcripts_df, text_column="Cleaned_veteran_transcript")

# Example output for the first row
print(transcripts_df[['Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos', 'Sentiment_Compound']].iloc[0])

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

def add_word_embeddings(df, text_column, model_name='all-MiniLM-L6-v2'):
    """
    Encodes each text with a Sentence Transformers model and stores the resulting
    vector in an 'Embeddings' column.

    The input dataframe is modified in place and also returned.

    Parameters:
        df (pd.DataFrame): Input dataframe containing text data.
        text_column (str): Name of the column containing text data.
        model_name (str): Sentence Transformer model name.

    Returns:
        pd.DataFrame: The same dataframe with one embedding vector per row in 'Embeddings'.
    """
    encoder = SentenceTransformer(model_name)
    texts = df[text_column].tolist()
    vectors = encoder.encode(texts, show_progress_bar=True)
    # Split the encoder output into one vector per row
    df['Embeddings'] = [vector for vector in vectors]
    return df

In [None]:
transcripts_df = add_word_embeddings(transcripts_df, text_column="Cleaned_veteran_transcript")
transcripts_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(transcripts_df['PID'])
transcripts_df['PID'] = target_encoded

In [None]:
transcripts_df.head()

In [None]:
# create a dataframe that contains the values for every POS tag in every sample
pos_tags_df = pd.json_normalize(transcripts_df['POS_Tags'])
pos_tags_df.fillna(0, inplace=True)  # Replace NaN with 0

In [None]:
pos_tags_df.shape

In [None]:
# Reset indices before concatenation
transcripts_df = transcripts_df.reset_index(drop=True)
pos_tags_df = pos_tags_df.reset_index(drop=True)

In [None]:
# concatenate the two dataframes
transcripts_df = pd.concat([transcripts_df, pos_tags_df], axis=1)
transcripts_df.drop(columns=['POS_Tags'], inplace=True)

In order to filter the extracted features, we also need to handle the `Embeddings` column in such a way that each value corresponds to a single feature column.

In [None]:
# Expand Embeddings list into individual columns
embeddings_df = pd.DataFrame(transcripts_df['Embeddings'].to_list(), index=transcripts_df.index)
embeddings_df.columns = [f'Embedding_{i}' for i in range(embeddings_df.shape[1])]

In [None]:
transcripts_df = pd.concat([transcripts_df, embeddings_df], axis=1)
transcripts_df.drop(columns=['Embeddings'], inplace=True)
transcripts_df.head()

In [None]:
transcripts_df.isna().sum().sum()

## Feature Selection

### Removing correlated features

In [None]:
# Absolute pairwise correlations between all feature columns (PID and the raw text are excluded)
corr_matrix = transcripts_df.drop(['PID','Cleaned_veteran_transcript'],axis=1).corr().abs()
# Keep only the strict upper triangle (k=1 skips the diagonal) so every pair is looked at once;
# all other cells become NaN
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

In [None]:
# Correlation cut-off: a column is marked for removal when its absolute correlation
# with any column that precedes it in the frame exceeds this value
threshold = 0.8
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

In [None]:
to_drop

In [None]:
transcripts_df.drop(to_drop,axis=1,inplace=True)

In [None]:
transcripts_df.head()

### Feature selection based on mutual information scores

In [None]:
exclude_columns = ['PID', 'Cleaned_veteran_transcript']

# Get all feature column names except the specified ones
feature_columns = [col for col in transcripts_df.columns if col not in exclude_columns]

In [None]:
len(feature_columns)

In [None]:
from sklearn.feature_selection import mutual_info_classif

def select_features_with_mutual_info(df, feature_columns, target_column, k_values):
    """
    Ranks features by mutual information with the target and returns the
    top-k names for each requested k.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        feature_columns (list): List of feature column names.
        target_column (str): Name of the target column.
        k_values (list): List of different k values to experiment with.

    Returns:
        dict: Maps each k to the list of the k highest-scoring feature names
        (fewer if k exceeds the number of features).
    """
    # Scores are computed a single time and shared by every k
    scores = mutual_info_classif(df[feature_columns], df[target_column], random_state=42)

    ranked_features = (
        pd.Series(scores, index=feature_columns)
        .sort_values(ascending=False)
        .index.tolist()
    )

    top_by_k = {}
    for k in k_values:
        top_by_k[k] = ranked_features[:k]
    return top_by_k

In [None]:
filtered_features = select_features_with_mutual_info(
    transcripts_df, 
    feature_columns=feature_columns, 
    target_column='PID', 
    k_values=[100, 200, 400, 800]
)

In [None]:
# Print results for each k
for k, features in filtered_features.items():
    print(f"Top {k} features: {features} \n")

# Splitting the dataset into 5 folds

In [None]:
from sklearn.model_selection import StratifiedKFold

def random_stratified_split(df, feature_columns, target_column, n_splits=5):
    """
    Builds shuffled, stratified train/test folds with StratifiedKFold (random_state=42).

    Parameters:
        df (pd.DataFrame): The input dataset containing features and target columns.
        feature_columns (list): List of column names to be used as features.
        target_column (str): Name of the target column.
        n_splits (int): Number of folds (default is 5).

    Returns:
        list of tuples: One (X_train, X_test, y_train, y_test) tuple per fold.
    """
    features = df[feature_columns]
    labels = df[target_column]

    fold_maker = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # split() yields positional indices, hence .iloc
    return [
        (
            features.iloc[train_rows],
            features.iloc[test_rows],
            labels.iloc[train_rows],
            labels.iloc[test_rows],
        )
        for train_rows, test_rows in fold_maker.split(features, labels)
    ]

In [None]:
target_column = 'PID'

# Perform stratified random splitting
splits = random_stratified_split(transcripts_df, feature_columns, target_column, n_splits=5)

# Print results
for i, (X_train, X_test, y_train, y_test) in enumerate(splits):
    print(f"Fold {i+1}")
    print("X_train: ", X_train.shape)
    print("X_test: ", X_test.shape)
    print("y_train: ", y_train.shape)
    print("y_test: ", y_test.shape)
    print()

# Running Tree-Based ML Models

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def evaluate_tree_models_on_splits(df, feature_columns, target_column, k_features, top_features_dict, n_splits=5):
    """
    Evaluates two tree-based models (Decision Tree and Random Forest) with
    hyperparameter tuning for each k value using StratifiedKFold splits.

    For every k and every model, each outer fold runs a 3-fold GridSearchCV on the
    training part and scores the refitted best estimator on the held-out part.
    Scores are averaged over the outer folds, and a model replaces the current
    best only when BOTH its mean accuracy and its mean balanced accuracy are
    strictly higher.

    Note: the stored "Best Model Object" / "Best Parameters" come from the grid
    search of the LAST outer fold of the winning model (fitted on that fold's
    training rows only), not from a refit on the whole dataset.

    Parameters:
        df (pd.DataFrame): The input dataset containing features and the target column.
        feature_columns (list): List of feature column names (used only to drive the splitter).
        target_column (str): Name of the target column.
        k_features (list): List of k values representing the number of top features to use.
        top_features_dict (dict): Dictionary mapping each k value to the list of top features.
        n_splits (int): Number of StratifiedKFold splits (default is 5).

    Returns:
        dict: For each k, a dict with keys "Best Model", "Best Accuracy",
        "Best Balanced Accuracy", "Best Parameters" and "Best Model Object".
    """
    # Define the models and their hyperparameter grids
    models = {
        "Decision Tree": {
            "model": DecisionTreeClassifier(random_state=42),
            "param_grid": {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5, 10]},
        },
        "Random Forest": {
            "model": RandomForestClassifier(random_state=42),
            "param_grid": {
                "n_estimators": [50, 100, 200],
                "max_depth": [3, 5, 10, None],
                "min_samples_split": [2, 5, 10],
            },
        }
    }

    results = {}

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Iterate over each k value
    for k in k_features:
        print(f"Evaluating models for top {k} features...")
        top_features = top_features_dict[k]  # Get the top k features

        best_model_name = None
        best_model = None
        # Start below any attainable score so the first evaluated model is always
        # recorded, even when its scores are exactly 0 (otherwise the result would
        # keep a None model that later gets pickled).
        best_accuracy = -1.0
        best_balanced_accuracy = -1.0
        best_params = None

        # Iterate over models
        for model_name, model_info in models.items():
            print(f"Training {model_name}...")
            total_accuracy = 0
            total_balanced_accuracy = 0

            # Iterate over StratifiedKFold splits (same folds for every model and k)
            for train_idx, test_idx in skf.split(df[feature_columns], df[target_column]):
                train_rows, test_rows = df.iloc[train_idx], df.iloc[test_idx]
                X_train, X_test = train_rows[top_features], test_rows[top_features]
                y_train, y_test = train_rows[target_column], test_rows[target_column]

                # Perform grid search (GridSearchCV clones the estimator, so the
                # template in `models` is never fitted itself)
                grid_search = GridSearchCV(
                    model_info["model"],
                    model_info["param_grid"],
                    scoring="accuracy",
                    cv=3,
                    n_jobs=-1,
                )
                grid_search.fit(X_train, y_train)

                # Evaluate on test set
                y_pred = grid_search.best_estimator_.predict(X_test)

                # Accumulate scores
                total_accuracy += accuracy_score(y_test, y_pred)
                total_balanced_accuracy += balanced_accuracy_score(y_test, y_pred)

            # Average scores over all splits
            avg_accuracy = total_accuracy / n_splits
            avg_balanced_accuracy = total_balanced_accuracy / n_splits

            print(f"{model_name}: Accuracy={avg_accuracy:.4f}, Balanced Accuracy={avg_balanced_accuracy:.4f} \n")

            # Update best model if this model performs better on both metrics
            if (avg_accuracy > best_accuracy) and (avg_balanced_accuracy > best_balanced_accuracy):
                best_model_name = model_name
                best_model = grid_search.best_estimator_  # from the last outer fold
                best_accuracy = avg_accuracy
                best_balanced_accuracy = avg_balanced_accuracy
                best_params = grid_search.best_params_

        # Store results for this k value
        results[k] = {
            "Best Model": best_model_name,
            "Best Accuracy": best_accuracy,
            "Best Balanced Accuracy": best_balanced_accuracy,
            "Best Parameters": best_params,
            "Best Model Object": best_model,
        }

    return results

In [None]:
k_values_list = list(filtered_features.keys())
print(k_values_list)

In [None]:
# Run the function
results = evaluate_tree_models_on_splits(transcripts_df, feature_columns, target_column, k_values_list, filtered_features)

# Display the best model for each k
for k, result in results.items():
    print(f"Top {k} features:")
    print(f"Best Model: {result['Best Model']}")
    print(f"Best Accuracy: {result['Best Accuracy']:.4f}")
    print(f"Best Balanced Accuracy: {result['Best Balanced Accuracy']:.4f}")
    print(f"Best Parameters: {result['Best Parameters']}")
    print("\n")

In [None]:
import joblib

# Iterate over results to save the best model for each k
for k, result in results.items():
    best_model = result["Best Model Object"]  # Get the best model object
    filename = f"partb_best_ml_model_top_{k}_features.pkl"
    joblib.dump(best_model, filename)
    print(f"Saved best model for top {k} features as '{filename}'.")

# Running DL Models

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def train_and_evaluate_dl_models_on_stratified_splits(stratified_splits, top_features_dict, model_type="Conv1D"):
    """
    Trains and evaluates Conv1D or LSTM models on stratified splits without hyperparameter tuning.
    Dataset loading is handled via TensorFlow tensor slices.

    The output layer adapts to the number of classes found in the labels:
    a single sigmoid unit with binary cross-entropy for at most two classes,
    otherwise a softmax layer with sparse categorical cross-entropy. Labels are
    expected to be integer class ids 0..C-1 (e.g. the output of LabelEncoder).

    For each k, the model of the single best fold is kept; a fold replaces the
    current best only when both its accuracy and balanced accuracy are strictly higher.

    Parameters:
        stratified_splits (list of tuples): (X_train, X_test, y_train, y_test) per fold.
        top_features_dict (dict): Dictionary of top features for each k value.
        model_type (str): "Conv1D" or "LSTM".

    Returns:
        dict: For each k, a dict with keys "Best Model", "Best Accuracy" and
        "Best Balanced Accuracy".

    Raises:
        ValueError: If model_type is neither "Conv1D" nor "LSTM".
    """
    if model_type not in ("Conv1D", "LSTM"):
        raise ValueError("Invalid model_type. Choose 'Conv1D' or 'LSTM'.")

    # Highest class id over all folds decides how wide the output layer must be.
    # A sigmoid unit thresholded at 0.5 can only ever emit 0 or 1, which is wrong
    # as soon as the target has more than two classes.
    highest_label = max(
        (int(max(y_tr.max(), y_te.max())) for _, _, y_tr, y_te in stratified_splits),
        default=1,
    )
    n_classes = highest_label + 1
    is_multiclass = n_classes > 2

    if is_multiclass:
        output_units, output_activation = n_classes, 'softmax'
        loss_name = 'sparse_categorical_crossentropy'
    else:
        output_units, output_activation = 1, 'sigmoid'
        loss_name = 'binary_crossentropy'

    results = {}

    for k, top_features in top_features_dict.items():
        print(f"Evaluating models for top {k} features...")

        best_model = None
        # Start below any attainable score so the first fold is always recorded
        best_accuracy = -1.0
        best_balanced_accuracy = -1.0

        # Iterate over splits
        for split_idx, (X_train, X_test, y_train, y_test) in enumerate(stratified_splits):
            print(f"Training on Split {split_idx + 1}...")

            # Filter dataset for top features and add the trailing channel axis so
            # each sample has the (timesteps, 1) shape declared in input_shape
            X_train_k = X_train[top_features].to_numpy(dtype='float32')[..., np.newaxis]
            X_test_k = X_test[top_features].to_numpy(dtype='float32')[..., np.newaxis]

            # Prepare TensorFlow datasets
            train_ds = tf.data.Dataset.from_tensor_slices((X_train_k, y_train.values)).batch(32).prefetch(tf.data.AUTOTUNE)
            test_ds = tf.data.Dataset.from_tensor_slices((X_test_k, y_test.values)).batch(32).prefetch(tf.data.AUTOTUNE)

            # Prepare input shape
            input_shape = (len(top_features), 1)  # (timesteps, features)

            # Build model
            if model_type == "Conv1D":
                model = Sequential([
                    Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape),
                    MaxPooling1D(pool_size=2),
                    Dropout(0.2),
                    Flatten(),
                    Dense(32, activation='relu'),
                    Dense(output_units, activation=output_activation)
                ])
            else:  # "LSTM" (validated above)
                model = Sequential([
                    LSTM(64, return_sequences=True, input_shape=input_shape),
                    Dropout(0.2),
                    LSTM(32),
                    Dense(output_units, activation=output_activation)
                ])

            # Compile model
            model.compile(optimizer='adam', loss=loss_name, metrics=['accuracy'])

            # Train model
            model.fit(train_ds, epochs=20, verbose=0)
            print("Evaluating on the test dataset....")

            # Evaluate on test set: turn probabilities into 1-D class ids
            probabilities = model.predict(test_ds)
            if is_multiclass:
                y_pred = np.argmax(probabilities, axis=1)
            else:
                y_pred = (probabilities > 0.5).astype(int).ravel()
            acc = accuracy_score(y_test, y_pred)
            bal_acc = balanced_accuracy_score(y_test, y_pred)

            print(f"Split {split_idx + 1}: Accuracy={acc:.4f}, Balanced Accuracy={bal_acc:.4f}\n")

            # Update best model if this model performs better
            if acc > best_accuracy and bal_acc > best_balanced_accuracy:
                best_model = model
                best_accuracy = acc
                best_balanced_accuracy = bal_acc

        # Store results for this k value
        results[k] = {
            "Best Model": best_model,
            "Best Accuracy": best_accuracy,
            "Best Balanced Accuracy": best_balanced_accuracy,
        }
        print(f"Best Model for top {k} features: Accuracy={best_accuracy:.4f}, Balanced Accuracy={best_balanced_accuracy:.4f}\n")

    return results

In [None]:
# Call the function for Conv1D
conv1d_results = train_and_evaluate_dl_models_on_stratified_splits(
    stratified_splits=splits,
    top_features_dict=filtered_features,
    model_type="Conv1D"
)

# Display results
for k, result in conv1d_results.items():
    print(f"Top {k} features:")
    print(f"Best Model: {result['Best Model']}")
    print(f"Best Accuracy: {result['Best Accuracy']:.4f}")
    print(f"Best Balanced Accuracy: {result['Best Balanced Accuracy']:.4f}\n")

In [None]:
# Call the function for LSTM
lstm_results = train_and_evaluate_dl_models_on_stratified_splits(
    stratified_splits=splits,
    top_features_dict=filtered_features,
    model_type="LSTM"
)

# Display results
for k, result in lstm_results.items():
    print(f"Top {k} features:")
    print(f"Best Model: {result['Best Model']}")
    print(f"Best Accuracy: {result['Best Accuracy']:.4f}")
    print(f"Best Balanced Accuracy: {result['Best Balanced Accuracy']:.4f}\n")