In [None]:
pip install twython

In [None]:
pip install sentence-transformers

In [None]:
pip install vaderSentiment

# Extracting Language Features

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ba-codes/Behavioral Annotation Codes.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P007_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P040_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P001_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P016_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P039_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P030_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P010_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P029_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P011_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P023_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P037_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P006_transcript.csv
/kaggle/input/veteran-titles/VetTrain_Transcripts/P008_transcript.csv
/kaggle/input/veteran-titles/VetTra

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Data Preparation

In [3]:
# Set the local folder path
folder_path = r'/kaggle/input/veteran-titles/VetTrain_Transcripts'

# Function to extract the numerical part from the filename
def extract_pid(filename):
    base_name = os.path.splitext(filename)[0]
    return base_name.split('_')[0]  # Assuming filename is like "P001_transcript.csv"

# Get all CSV files in the folder and sort them numerically by filename
all_files = sorted(
    [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')],
    key=lambda x: int(extract_pid(os.path.basename(x))[1:])
)

In [4]:
# Initialize the final storage for combined question pairs
combined_data = []

# Process each file
for file_path in all_files:
    # Extract PID from filename
    pid = extract_pid(os.path.basename(file_path))
    
    # Read the CSV file
    df = pd.read_csv(file_path)
    df = df[df['Type'] != 'IRR']  # Filter irrelevant rows

    # Initialize dialogue extraction
    current_dialogue = []
    current_question_id = None
    qid_counter = 1  # Start QID counter for each file

    # Extract question pairs with PID and QID
    for _, row in df.iterrows():
        if row['Type'].startswith('Q'):
            question_id = row['Type']
            if current_question_id is None:
                current_question_id = question_id
                current_dialogue = [row['Transcript']]
            elif question_id != current_question_id:
                combined_data.append({
                    'PID': pid,
                    'QID': f"Q{qid_counter}",
                    'Combined_Transcript': " ".join(current_dialogue)
                })
                qid_counter += 1
                current_dialogue = [row['Transcript']]
                current_question_id = question_id
            else:
                current_dialogue.append(row['Transcript'])
        else:
            current_dialogue.append(row['Transcript'])

    # Add the last dialogue for the file
    if current_dialogue:
        combined_data.append({
            'PID': pid,
            'QID': f"Q{qid_counter}",
            'Combined_Transcript': " ".join(current_dialogue)
        })

In [5]:
# Convert to a DataFrame
df_combined = pd.DataFrame(combined_data)

In [6]:
# Load the behavioral annotation codes 
behavior_file = r'/kaggle/input/ba-codes/Behavioral Annotation Codes.csv'
df_behavior = pd.read_csv(behavior_file)

# Merge behavioral codes
df_combined = df_combined.merge(df_behavior, on=['PID', 'QID'], how='left')

In [7]:
# Data preprocessing
import re

def clean_text(text):
    # lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Delete redundant Spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_combined['Cleaned_Transcript'] = df_combined['Combined_Transcript'].apply(clean_text)

In [8]:
# displaying the combined dataframe
df_combined.head(10)

Unnamed: 0,PID,QID,Combined_Transcript,Degree of Explanation,Cleaned_Transcript
0,P001,Q1,"Interviewer: Good. I just uh, I uh, always hav...",Succinct,interviewer good i just uh i uh always have it...
1,P001,Q2,"Interviewer: Okay, good deal. Interviewer: Yep...",Succinct,interviewer okay good deal interviewer yep i u...
2,P001,Q3,"Interviewer: Okay, so with all of your experie...",Under-explained,interviewer okay so with all of your experienc...
3,P001,Q4,"Interviewer: So, was that an easy transition f...",Under-explained,interviewer so was that an easy transition fro...
4,P001,Q5,Interviewer: So when we're talking about stren...,Succinct,interviewer so when were talking about strengt...
5,P001,Q6,"Interviewer: Yeah, yeah, no, I get it, I get i...",Succinct,interviewer yeah yeah no i get it i get it so ...
6,P001,Q7,Interviewer: So how did you- Interviewer: What...,Comprehensive,interviewer so how did you interviewer what wh...
7,P001,Q8,"Interviewer: Okay. Good. So, are you and your ...",Succinct,interviewer okay good so are you and your wife...
8,P001,Q9,"Interviewer: Sure. Yeah. So, when, ah, when ar...",Over-explained,interviewer sure yeah so when ah when are you ...
9,P001,Q10,"Interviewer: Yeah, I think that that is justif...",Succinct,interviewer yeah i think that that is justifie...


In [9]:
df_combined.shape

(287, 5)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import pos_tag, word_tokenize
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [11]:
# Count Vectorizer
vectorizer = CountVectorizer()
count_vectorizer = vectorizer.fit_transform(df_combined['Cleaned_Transcript'])
print("Feature names:", vectorizer.get_feature_names_out())
print(count_vectorizer.toarray())
np.shape(count_vectorizer.toarray())

Feature names: ['01' '0ne' '10' ... 'zone' 'zoom' 'zooming']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


(287, 5659)

In [12]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(df_combined['Cleaned_Transcript'])
print(tfidf_features.toarray())
np.shape(tfidf_features.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.05015529 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


(287, 5659)

## Adding the extracted features to the main dataframe

In [13]:
def add_tfidf_features(df, text_column, max_features=500):
    """
    Adds TF-IDF features to the dataset.

    Parameters:
        df (pd.DataFrame): Input dataframe containing the text data.
        text_column (str): Name of the column containing text data.
        max_features (int): Maximum number of TF-IDF features to generate (default=500).
    
    Returns:
        pd.DataFrame: Dataframe with TF-IDF features added.
    """
    # Initialize TF-IDF Vectorizer
    tfidf = TfidfVectorizer(max_features=max_features)
    
    # Fit and transform the text data
    tfidf_matrix = tfidf.fit_transform(df[text_column])
    
    # Convert the TF-IDF matrix to a DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out(), index=df.index)
    
    # Concatenate the TF-IDF features with the original DataFrame
    df_with_tfidf = pd.concat([df, tfidf_df], axis=1)
    
    return df_with_tfidf

In [14]:
# Example usage
df_combined = add_tfidf_features(df_combined, text_column="Cleaned_Transcript", max_features=1000)
df_combined.head()

Unnamed: 0,PID,QID,Combined_Transcript,Degree of Explanation,Cleaned_Transcript,15,20,ability,able,about,...,yet,you,youd,youll,young,your,youre,yourself,youve,zoom
0,P001,Q1,"Interviewer: Good. I just uh, I uh, always hav...",Succinct,interviewer good i just uh i uh always have it...,0.0,0.0,0.0,0.0,0.086414,...,0.0,0.158448,0.0,0.0,0.0,0.0,0.035163,0.062281,0.0,0.0
1,P001,Q2,"Interviewer: Okay, good deal. Interviewer: Yep...",Succinct,interviewer okay good deal interviewer yep i u...,0.0,0.0,0.0,0.0,0.134805,...,0.0,0.154487,0.0,0.0,0.0,0.292035,0.0,0.0,0.0,0.0
2,P001,Q3,"Interviewer: Okay, so with all of your experie...",Under-explained,interviewer okay so with all of your experienc...,0.0,0.0,0.0,0.0,0.072895,...,0.0,0.150369,0.0,0.0,0.0,0.067679,0.044494,0.0,0.0,0.0
3,P001,Q4,"Interviewer: So, was that an easy transition f...",Under-explained,interviewer so was that an easy transition fro...,0.0,0.0,0.0,0.0,0.0,...,0.0,0.104521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,P001,Q5,Interviewer: So when we're talking about stren...,Succinct,interviewer so when were talking about strengt...,0.0,0.0,0.0,0.0,0.035298,...,0.0,0.364069,0.0,0.0,0.0,0.032772,0.0,0.0,0.0,0.0


In [15]:
def add_pos_tags(df, text_column):
    """
    Adds POS tagging to the dataframe.
    
    Parameters:
        df (pd.DataFrame): Input dataframe containing text data.
        text_column (str): Name of the column containing text data.
    
    Returns:
        pd.DataFrame: Dataframe with added POS tags.
    """
    def pos_tags(text):
        tokens = word_tokenize(text)
        tags = pos_tag(tokens)
        return {tag: len([word for word, pos in tags if pos == tag]) for tag in set([pos for _, pos in tags])}
    
    df['POS_Tags'] = df[text_column].apply(pos_tags)
    return df

In [16]:
df_combined = add_pos_tags(df_combined, text_column="Cleaned_Transcript")
df_combined.head(10)

Unnamed: 0,PID,QID,Combined_Transcript,Degree of Explanation,Cleaned_Transcript,15,20,ability,able,about,...,you,youd,youll,young,your,youre,yourself,youve,zoom,POS_Tags
0,P001,Q1,"Interviewer: Good. I just uh, I uh, always hav...",Succinct,interviewer good i just uh i uh always have it...,0.0,0.0,0.0,0.0,0.086414,...,0.158448,0.0,0.0,0.0,0.0,0.035163,0.062281,0.0,0.0,"{'TO': 7, 'VBN': 2, 'WP': 1, 'CC': 14, 'DT': 1..."
1,P001,Q2,"Interviewer: Okay, good deal. Interviewer: Yep...",Succinct,interviewer okay good deal interviewer yep i u...,0.0,0.0,0.0,0.0,0.134805,...,0.154487,0.0,0.0,0.0,0.292035,0.0,0.0,0.0,0.0,"{'TO': 2, 'VBN': 3, 'WRB': 1, 'CC': 8, 'DT': 1..."
2,P001,Q3,"Interviewer: Okay, so with all of your experie...",Under-explained,interviewer okay so with all of your experienc...,0.0,0.0,0.0,0.0,0.072895,...,0.150369,0.0,0.0,0.0,0.067679,0.044494,0.0,0.0,0.0,"{'TO': 7, 'WP': 2, 'WRB': 5, 'CC': 5, 'DT': 21..."
3,P001,Q4,"Interviewer: So, was that an easy transition f...",Under-explained,interviewer so was that an easy transition fro...,0.0,0.0,0.0,0.0,0.0,...,0.104521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'TO': 5, 'WP': 1, 'VBN': 3, 'WRB': 2, 'CC': 4..."
4,P001,Q5,Interviewer: So when we're talking about stren...,Succinct,interviewer so when were talking about strengt...,0.0,0.0,0.0,0.0,0.035298,...,0.364069,0.0,0.0,0.0,0.032772,0.0,0.0,0.0,0.0,"{'TO': 5, 'WP': 3, 'WRB': 3, 'CC': 9, 'DT': 14..."
5,P001,Q6,"Interviewer: Yeah, yeah, no, I get it, I get i...",Succinct,interviewer yeah yeah no i get it i get it so ...,0.0,0.0,0.0,0.0,0.0,...,0.092078,0.0,0.0,0.0,0.082885,0.054491,0.0,0.0,0.0,"{'TO': 3, 'WP': 1, 'VBN': 1, 'WRB': 1, 'CC': 4..."
6,P001,Q7,Interviewer: So how did you- Interviewer: What...,Comprehensive,interviewer so how did you interviewer what wh...,0.0,0.0,0.0,0.0,0.057687,...,0.19833,0.0,0.0,0.0,0.026779,0.070422,0.0,0.0,0.0,"{'TO': 14, 'WP': 6, 'VBN': 1, 'WRB': 3, 'CC': ..."
7,P001,Q8,"Interviewer: Okay. Good. So, are you and your ...",Succinct,interviewer okay good so are you and your wife...,0.0,0.0,0.0,0.0831,0.0,...,0.035474,0.0,0.0,0.0,0.047899,0.0,0.0,0.0,0.0,"{'PRP$': 4, 'TO': 4, 'RB': 9, 'RP': 1, 'WRB': ..."
8,P001,Q9,"Interviewer: Sure. Yeah. So, when, ah, when ar...",Over-explained,interviewer sure yeah so when ah when are you ...,0.0,0.0,0.0,0.0,0.0,...,0.232737,0.0,0.0,0.0,0.044893,0.029514,0.0,0.042132,0.0,"{'TO': 16, 'VBN': 4, 'WRB': 10, 'CC': 12, 'FW'..."
9,P001,Q10,"Interviewer: Yeah, I think that that is justif...",Succinct,interviewer yeah i think that that is justifie...,0.0,0.0,0.0,0.0,0.038553,...,0.079527,0.0,0.0,0.104115,0.035794,0.047064,0.0,0.0,0.0,"{'TO': 8, 'WP': 3, 'VBN': 2, 'WRB': 2, 'CC': 9..."


In [17]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def add_sentiment_scores(df, text_column):
    """
    Adds sentiment scores as separate columns to the dataframe.
    
    Parameters:
        df (pd.DataFrame): Input dataframe containing text data.
        text_column (str): Name of the column containing text data.
    
    Returns:
        pd.DataFrame: Dataframe with added sentiment scores as separate columns.
    """
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = df[text_column].apply(lambda text: sia.polarity_scores(text))
    
    # Create separate columns for each sentiment score
    df['Sentiment_Neg'] = sentiment_scores.apply(lambda score: score['neg'])
    df['Sentiment_Neu'] = sentiment_scores.apply(lambda score: score['neu'])
    df['Sentiment_Pos'] = sentiment_scores.apply(lambda score: score['pos'])
    df['Sentiment_Compound'] = sentiment_scores.apply(lambda score: score['compound'])
    
    return df

In [18]:
# Adding sentiment scores as separate columns to the dataframe
df_combined = add_sentiment_scores(df_combined, text_column="Cleaned_Transcript")

# Example output for the first row
print(df_combined[['Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos', 'Sentiment_Compound']].iloc[0])

Sentiment_Neg         0.0070
Sentiment_Neu         0.8210
Sentiment_Pos         0.1720
Sentiment_Compound    0.9936
Name: 0, dtype: float64


In [19]:
df_combined.drop(['QID', 'Combined_Transcript'], axis=1, inplace=True)
df_combined.head()

Unnamed: 0,PID,Degree of Explanation,Cleaned_Transcript,15,20,ability,able,about,above,absolutely,...,your,youre,yourself,youve,zoom,POS_Tags,Sentiment_Neg,Sentiment_Neu,Sentiment_Pos,Sentiment_Compound
0,P001,Succinct,interviewer good i just uh i uh always have it...,0.0,0.0,0.0,0.0,0.086414,0.0,0.0,...,0.0,0.035163,0.062281,0.0,0.0,"{'TO': 7, 'VBN': 2, 'WP': 1, 'CC': 14, 'DT': 1...",0.007,0.821,0.172,0.9936
1,P001,Succinct,interviewer okay good deal interviewer yep i u...,0.0,0.0,0.0,0.0,0.134805,0.0,0.0,...,0.292035,0.0,0.0,0.0,0.0,"{'TO': 2, 'VBN': 3, 'WRB': 1, 'CC': 8, 'DT': 1...",0.036,0.919,0.045,-0.0889
2,P001,Under-explained,interviewer okay so with all of your experienc...,0.0,0.0,0.0,0.0,0.072895,0.0,0.0,...,0.067679,0.044494,0.0,0.0,0.0,"{'TO': 7, 'WP': 2, 'WRB': 5, 'CC': 5, 'DT': 21...",0.011,0.904,0.085,0.9286
3,P001,Under-explained,interviewer so was that an easy transition fro...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,"{'TO': 5, 'WP': 1, 'VBN': 3, 'WRB': 2, 'CC': 4...",0.036,0.91,0.054,0.5568
4,P001,Succinct,interviewer so when were talking about strengt...,0.0,0.0,0.0,0.0,0.035298,0.0,0.0,...,0.032772,0.0,0.0,0.0,0.0,"{'TO': 5, 'WP': 3, 'WRB': 3, 'CC': 9, 'DT': 14...",0.028,0.828,0.144,0.9653


In [20]:
from sentence_transformers import SentenceTransformer
import numpy as np

def add_word_embeddings(df, text_column, model_name='all-MiniLM-L6-v2'):
    """
    Adds word embeddings to the dataframe using Sentence Transformers.
    
    Parameters:
        df (pd.DataFrame): Input dataframe containing text data.
        text_column (str): Name of the column containing text data.
        model_name (str): Sentence Transformer model name.
    
    Returns:
        pd.DataFrame: Dataframe with added embeddings.
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(df[text_column].tolist(), show_progress_bar=True)
    df['Embeddings'] = list(embeddings)
    return df

In [21]:
df_combined = add_word_embeddings(df_combined, text_column="Cleaned_Transcript")
df_combined.head()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Unnamed: 0,PID,Degree of Explanation,Cleaned_Transcript,15,20,ability,able,about,above,absolutely,...,youre,yourself,youve,zoom,POS_Tags,Sentiment_Neg,Sentiment_Neu,Sentiment_Pos,Sentiment_Compound,Embeddings
0,P001,Succinct,interviewer good i just uh i uh always have it...,0.0,0.0,0.0,0.0,0.086414,0.0,0.0,...,0.035163,0.062281,0.0,0.0,"{'TO': 7, 'VBN': 2, 'WP': 1, 'CC': 14, 'DT': 1...",0.007,0.821,0.172,0.9936,"[-0.08247561, 0.009922183, 0.033036105, 0.0038..."
1,P001,Succinct,interviewer okay good deal interviewer yep i u...,0.0,0.0,0.0,0.0,0.134805,0.0,0.0,...,0.0,0.0,0.0,0.0,"{'TO': 2, 'VBN': 3, 'WRB': 1, 'CC': 8, 'DT': 1...",0.036,0.919,0.045,-0.0889,"[-0.029192012, 0.056575354, 0.078753695, 0.020..."
2,P001,Under-explained,interviewer okay so with all of your experienc...,0.0,0.0,0.0,0.0,0.072895,0.0,0.0,...,0.044494,0.0,0.0,0.0,"{'TO': 7, 'WP': 2, 'WRB': 5, 'CC': 5, 'DT': 21...",0.011,0.904,0.085,0.9286,"[-0.042129416, 0.026093012, 0.020127049, 0.027..."
3,P001,Under-explained,interviewer so was that an easy transition fro...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,"{'TO': 5, 'WP': 1, 'VBN': 3, 'WRB': 2, 'CC': 4...",0.036,0.91,0.054,0.5568,"[-0.067918584, 0.021701401, 0.03339226, -0.006..."
4,P001,Succinct,interviewer so when were talking about strengt...,0.0,0.0,0.0,0.0,0.035298,0.0,0.0,...,0.0,0.0,0.0,0.0,"{'TO': 5, 'WP': 3, 'WRB': 3, 'CC': 9, 'DT': 14...",0.028,0.828,0.144,0.9653,"[0.017702455, 0.050047435, 0.011827803, -0.014..."


# Classifying between Under-Explained and Succinct

## Cleaning Dataset

In [22]:
# creating a dataset for classification
df_new = df_combined.copy()
df_new.head()

Unnamed: 0,PID,Degree of Explanation,Cleaned_Transcript,15,20,ability,able,about,above,absolutely,...,youre,yourself,youve,zoom,POS_Tags,Sentiment_Neg,Sentiment_Neu,Sentiment_Pos,Sentiment_Compound,Embeddings
0,P001,Succinct,interviewer good i just uh i uh always have it...,0.0,0.0,0.0,0.0,0.086414,0.0,0.0,...,0.035163,0.062281,0.0,0.0,"{'TO': 7, 'VBN': 2, 'WP': 1, 'CC': 14, 'DT': 1...",0.007,0.821,0.172,0.9936,"[-0.08247561, 0.009922183, 0.033036105, 0.0038..."
1,P001,Succinct,interviewer okay good deal interviewer yep i u...,0.0,0.0,0.0,0.0,0.134805,0.0,0.0,...,0.0,0.0,0.0,0.0,"{'TO': 2, 'VBN': 3, 'WRB': 1, 'CC': 8, 'DT': 1...",0.036,0.919,0.045,-0.0889,"[-0.029192012, 0.056575354, 0.078753695, 0.020..."
2,P001,Under-explained,interviewer okay so with all of your experienc...,0.0,0.0,0.0,0.0,0.072895,0.0,0.0,...,0.044494,0.0,0.0,0.0,"{'TO': 7, 'WP': 2, 'WRB': 5, 'CC': 5, 'DT': 21...",0.011,0.904,0.085,0.9286,"[-0.042129416, 0.026093012, 0.020127049, 0.027..."
3,P001,Under-explained,interviewer so was that an easy transition fro...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,"{'TO': 5, 'WP': 1, 'VBN': 3, 'WRB': 2, 'CC': 4...",0.036,0.91,0.054,0.5568,"[-0.067918584, 0.021701401, 0.03339226, -0.006..."
4,P001,Succinct,interviewer so when were talking about strengt...,0.0,0.0,0.0,0.0,0.035298,0.0,0.0,...,0.0,0.0,0.0,0.0,"{'TO': 5, 'WP': 3, 'WRB': 3, 'CC': 9, 'DT': 14...",0.028,0.828,0.144,0.9653,"[0.017702455, 0.050047435, 0.011827803, -0.014..."


In [23]:
df_new = df_new[df_new['Degree of Explanation'].isin(['Under-explained', 'Succinct'])]
df_new['DOE_Label'] = df_new['Degree of Explanation'].map({'Under-explained': 0, 'Succinct': 1})
df_new.head()

Unnamed: 0,PID,Degree of Explanation,Cleaned_Transcript,15,20,ability,able,about,above,absolutely,...,yourself,youve,zoom,POS_Tags,Sentiment_Neg,Sentiment_Neu,Sentiment_Pos,Sentiment_Compound,Embeddings,DOE_Label
0,P001,Succinct,interviewer good i just uh i uh always have it...,0.0,0.0,0.0,0.0,0.086414,0.0,0.0,...,0.062281,0.0,0.0,"{'TO': 7, 'VBN': 2, 'WP': 1, 'CC': 14, 'DT': 1...",0.007,0.821,0.172,0.9936,"[-0.08247561, 0.009922183, 0.033036105, 0.0038...",1
1,P001,Succinct,interviewer okay good deal interviewer yep i u...,0.0,0.0,0.0,0.0,0.134805,0.0,0.0,...,0.0,0.0,0.0,"{'TO': 2, 'VBN': 3, 'WRB': 1, 'CC': 8, 'DT': 1...",0.036,0.919,0.045,-0.0889,"[-0.029192012, 0.056575354, 0.078753695, 0.020...",1
2,P001,Under-explained,interviewer okay so with all of your experienc...,0.0,0.0,0.0,0.0,0.072895,0.0,0.0,...,0.0,0.0,0.0,"{'TO': 7, 'WP': 2, 'WRB': 5, 'CC': 5, 'DT': 21...",0.011,0.904,0.085,0.9286,"[-0.042129416, 0.026093012, 0.020127049, 0.027...",0
3,P001,Under-explained,interviewer so was that an easy transition fro...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,"{'TO': 5, 'WP': 1, 'VBN': 3, 'WRB': 2, 'CC': 4...",0.036,0.91,0.054,0.5568,"[-0.067918584, 0.021701401, 0.03339226, -0.006...",0
4,P001,Succinct,interviewer so when were talking about strengt...,0.0,0.0,0.0,0.0,0.035298,0.0,0.0,...,0.0,0.0,0.0,"{'TO': 5, 'WP': 3, 'WRB': 3, 'CC': 9, 'DT': 14...",0.028,0.828,0.144,0.9653,"[0.017702455, 0.050047435, 0.011827803, -0.014...",1


In [24]:
df_new.drop(['Degree of Explanation'], axis=1, inplace=True)

In [25]:
df_new.shape

(130, 1009)

In [26]:
# checking for null values
df_new.isna().sum()

PID                   0
Cleaned_Transcript    0
15                    0
20                    0
ability               0
                     ..
Sentiment_Neu         0
Sentiment_Pos         0
Sentiment_Compound    0
Embeddings            0
DOE_Label             0
Length: 1009, dtype: int64

In [27]:
pos_tags_df = pd.json_normalize(df_new['POS_Tags'])
pos_tags_df.fillna(0, inplace=True)  # Replace NaN with 0

In [28]:
pos_tags_df.shape

(130, 33)

In [29]:
# checking for Null values
pos_tags_df.isna().sum().sum()

0

In [30]:
# Check indices of both DataFrames
print(df_new.index)
print(pos_tags_df.index)

Index([  0,   1,   2,   3,   4,   5,   7,   9,  10,  11,
       ...
       257, 266, 270, 271, 272, 279, 281, 284, 285, 286],
      dtype='int64', length=130)
RangeIndex(start=0, stop=130, step=1)


In [31]:
# Reset indices before concatenation
df_new = df_new.reset_index(drop=True)
pos_tags_df = pos_tags_df.reset_index(drop=True)

In [32]:
df_new = pd.concat([df_new, pos_tags_df], axis=1)
df_new.drop(columns=['POS_Tags'], inplace=True)

In [33]:
df_new.isna().sum()

PID                   0
Cleaned_Transcript    0
15                    0
20                    0
ability               0
                     ..
UH                    0
NNP                   0
''                    0
FW                    0
POS                   0
Length: 1041, dtype: int64

In order to filter the extracted features, we also need to handle the `Embeddings` column in such a way that each value corresponds to a single feature column.

In [34]:
# Expand Embeddings list into individual columns
embeddings_df = pd.DataFrame(df_new['Embeddings'].to_list(), index=df_new.index)
embeddings_df.columns = [f'Embedding_{i}' for i in range(embeddings_df.shape[1])]

In [35]:
embeddings_df.isna().sum()

Embedding_0      0
Embedding_1      0
Embedding_2      0
Embedding_3      0
Embedding_4      0
                ..
Embedding_379    0
Embedding_380    0
Embedding_381    0
Embedding_382    0
Embedding_383    0
Length: 384, dtype: int64

In [36]:
df_new = pd.concat([df_new, embeddings_df], axis=1)
df_new.drop(columns=['Embeddings'], inplace=True)
df_new.head()

Unnamed: 0,PID,Cleaned_Transcript,15,20,ability,able,about,above,absolutely,access,...,Embedding_374,Embedding_375,Embedding_376,Embedding_377,Embedding_378,Embedding_379,Embedding_380,Embedding_381,Embedding_382,Embedding_383
0,P001,interviewer good i just uh i uh always have it...,0.0,0.0,0.0,0.0,0.086414,0.0,0.0,0.0,...,0.016676,-0.014764,0.008555,0.033396,-0.043216,0.002666,0.038463,0.003999,-0.043158,0.0397
1,P001,interviewer okay good deal interviewer yep i u...,0.0,0.0,0.0,0.0,0.134805,0.0,0.0,0.0,...,-0.1151,-0.059945,-0.027655,0.005605,-0.002344,-0.084586,0.06714,-0.086758,-0.083039,0.033452
2,P001,interviewer okay so with all of your experienc...,0.0,0.0,0.0,0.0,0.072895,0.0,0.0,0.0,...,-0.044904,0.088308,0.04541,0.036618,-0.042384,-0.000323,0.05007,0.018606,-0.082964,0.005994
3,P001,interviewer so was that an easy transition fro...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.01346,-0.025747,0.005615,0.049796,-0.101872,0.001147,0.017912,-0.04954,-0.002439,0.002425
4,P001,interviewer so when were talking about strengt...,0.0,0.0,0.0,0.0,0.035298,0.0,0.0,0.0,...,0.048642,0.073907,0.025127,0.080058,0.021308,0.00408,0.038495,0.015446,-0.09864,-0.013695


In [37]:
df_new.shape

(130, 1424)

In [38]:
# creating a list of feature columns
feature_columns = (
    ['Sentiment_Neg', 'Sentiment_Pos', 'Sentiment_Neu', 'Sentiment_Compound'] + 
    pos_tags_df.columns.tolist() + 
    embeddings_df.columns.tolist()
)

In [39]:
from sklearn.feature_selection import mutual_info_classif

def select_features_with_mutual_info(df, feature_columns, target_column, k_values):
    """
    Selects top features for multiple values of k based on mutual information.
    
    Parameters:
        df (pd.DataFrame): Input dataframe.
        feature_columns (list): List of feature column names.
        target_column (str): Name of the target column.
        k_values (list): List of different k values to experiment with.
    
    Returns:
        dict: Dictionary where keys are k values and values are lists of top k features.
    """
    X = df[feature_columns]
    y = df[target_column]
    
    # Calculate mutual information scores once
    mi_scores = mutual_info_classif(X, y, random_state=42)
    feature_scores = pd.Series(mi_scores, index=feature_columns)
    
    # Sort features by mutual information scores in descending order
    sorted_features = feature_scores.sort_values(ascending=False).index.tolist()
    
    # Generate top k features for all k values
    results = {k: sorted_features[:k] for k in k_values}
    
    return results

In [40]:
filtered_features = select_features_with_mutual_info(
    df_new, 
    feature_columns=feature_columns, 
    target_column='DOE_Label', 
    k_values=[100, 200, 400, 800]
)

In [41]:
# Print results for each k
for k, features in filtered_features.items():
    print(f"Top {k} features: {features} \n")

Top 100 features: ['Embedding_332', 'Embedding_331', 'Embedding_230', 'IN', 'Embedding_383', 'Embedding_88', 'Embedding_133', 'Embedding_97', 'WRB', 'Embedding_155', 'Embedding_114', 'Embedding_330', 'Embedding_1', 'Embedding_247', 'Embedding_316', 'Embedding_62', 'Embedding_12', 'Embedding_182', 'Embedding_336', 'Embedding_170', 'Embedding_44', 'VBP', 'Embedding_30', 'Embedding_52', 'Embedding_128', 'Embedding_364', 'Embedding_94', 'Embedding_314', 'Embedding_68', 'Embedding_226', 'Embedding_272', 'Embedding_166', 'Embedding_80', 'Embedding_131', 'JJS', 'Embedding_0', 'Embedding_359', 'Embedding_278', 'NN', 'Embedding_76', 'Embedding_312', 'Embedding_346', 'Embedding_255', 'JJ', 'Embedding_129', 'Embedding_85', 'Embedding_291', 'Embedding_150', 'Embedding_117', 'Embedding_2', 'Embedding_242', 'Embedding_57', 'Embedding_326', 'Embedding_49', 'Embedding_307', 'Embedding_308', 'Embedding_32', 'Embedding_96', 'Embedding_327', 'Embedding_199', 'Embedding_379', 'Embedding_317', 'Embedding_1

## Splitting the Dataset

In [42]:
from sklearn.model_selection import GroupKFold

def participant_independent_split(df, feature_columns, target_column, group_column, n_splits=5):
    """
    Splits the dataset into participant-independent folds using GroupKFold.
    
    Parameters:
        df (pd.DataFrame): The input dataset containing features, target, and group columns.
        feature_columns (list): List of column names to be used as features.
        target_column (str): Name of the target column.
        group_column (str): Name of the column used for grouping (e.g., Participant ID).
        n_splits (int): Number of folds (default is 5).
    
    Returns:
        list of tuples: Each tuple contains (X_train, X_test, y_train, y_test) for one fold.
    """
    # Extract features, target, and groups
    X = df[feature_columns]
    y = df[target_column]
    groups = df[group_column]
    
    # Initialize GroupKFold
    gkf = GroupKFold(n_splits=n_splits)
    
    # Store splits
    splits = []
    for train_idx, test_idx in gkf.split(X, y, groups):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        splits.append((X_train, X_test, y_train, y_test))
    
    return splits

In [43]:
target_column = 'DOE_Label'  # Replace with the actual target column name
group_column = 'PID'  # Replace with the participant ID column name

# Call the function
folds = participant_independent_split(df_new, feature_columns, target_column, group_column, n_splits=5)

# Display train and test sets for each fold
for i, (X_train, X_test, y_train, y_test) in enumerate(folds):
    print(f"Fold {i+1}")
    print("Train PIDs:", df_new[group_column].iloc[X_train.index].unique())
    print("Test PIDs:", df_new[group_column].iloc[X_test.index].unique())
    print("-" * 50)

Fold 1
Train PIDs: ['P001' 'P002' 'P004' 'P006' 'P007' 'P008' 'P010' 'P011' 'P012' 'P013'
 'P014' 'P016' 'P017' 'P019' 'P023' 'P024' 'P027' 'P028' 'P029' 'P030'
 'P031' 'P032' 'P033' 'P037' 'P038' 'P039' 'P041']
Test PIDs: ['P003' 'P005' 'P009' 'P018' 'P025' 'P036']
--------------------------------------------------
Fold 2
Train PIDs: ['P003' 'P004' 'P005' 'P006' 'P007' 'P008' 'P009' 'P010' 'P011' 'P012'
 'P014' 'P016' 'P017' 'P018' 'P019' 'P023' 'P024' 'P025' 'P027' 'P028'
 'P030' 'P032' 'P036' 'P037' 'P038' 'P039' 'P041']
Test PIDs: ['P001' 'P002' 'P013' 'P029' 'P031' 'P033']
--------------------------------------------------
Fold 3
Train PIDs: ['P001' 'P002' 'P003' 'P005' 'P007' 'P008' 'P009' 'P011' 'P012' 'P013'
 'P014' 'P016' 'P018' 'P019' 'P023' 'P024' 'P025' 'P027' 'P028' 'P029'
 'P031' 'P032' 'P033' 'P036' 'P037' 'P039']
Test PIDs: ['P004' 'P006' 'P010' 'P017' 'P030' 'P038' 'P041']
--------------------------------------------------
Fold 4
Train PIDs: ['P001' 'P002' 'P003' 'P004

In [44]:
# check the number of samples in each fold
for i, (X_train, X_test, y_train, y_test) in enumerate(folds):
    print(f"Fold {i+1}:")
    print(f"  Train sample shapes: {X_train.shape}")
    print(f"  Test samples shapes: {X_test.shape}")
    print("-" * 50)

Fold 1:
  Train sample shapes: (104, 421)
  Test samples shapes: (26, 421)
--------------------------------------------------
Fold 2:
  Train sample shapes: (104, 421)
  Test samples shapes: (26, 421)
--------------------------------------------------
Fold 3:
  Train sample shapes: (104, 421)
  Test samples shapes: (26, 421)
--------------------------------------------------
Fold 4:
  Train sample shapes: (104, 421)
  Test samples shapes: (26, 421)
--------------------------------------------------
Fold 5:
  Train sample shapes: (104, 421)
  Test samples shapes: (26, 421)
--------------------------------------------------


## Running Tree-Based ML Models

In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def evaluate_tree_models(folds, k_features, feature_columns, target_column, top_features_dict):
    """
    Evaluates three tree-based models (Decision Tree, Random Forest, Gradient Boosting)
    with hyperparameter tuning for each k value and identifies the best model.

    Parameters:
        folds (list of tuples): Participant-independent splits (X_train, X_test, y_train, y_test).
        k_features (list): List of k values representing the number of top features to use.
        feature_columns (list): List of feature column names.
        target_column (str): Name of the target column.
        top_features_dict (dict): Dictionary mapping each k value to the list of top features.

    Returns:
        dict: Results for each k value containing the best model and its performance metrics.
    """
    # Define the models and their hyperparameter grids
    models = {
        "Decision Tree": {
            "model": DecisionTreeClassifier(random_state=42),
            "param_grid": {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5, 10]},
        },
        "Random Forest": {
            "model": RandomForestClassifier(random_state=42),
            "param_grid": {
                "n_estimators": [50, 100, 200],
                "max_depth": [3, 5, 10, None],
                "min_samples_split": [2, 5, 10],
            },
        },
        "Gradient Boosting": {
            "model": GradientBoostingClassifier(random_state=42),
            "param_grid": {
                "n_estimators": [50, 100, 200],
                "learning_rate": [0.01, 0.1, 0.2],
                "max_depth": [3, 5, 10],
            },
        },
    }

    results = {}

    # Iterate over each k value
    for k in k_features:
        print(f"Evaluating models for top {k} features...")
        top_features = top_features_dict[k]  # Get the top k features

        best_model_name = None
        best_model = None
        best_accuracy = 0
        best_balanced_accuracy = 0
        best_params = None

        # Iterate over models
        for model_name, model_info in models.items():
            print(f"Training {model_name}...")
            total_accuracy = 0
            total_balanced_accuracy = 0

            # Iterate over folds
            for X_train, X_test, y_train, y_test in folds:
                # Filter top k features for the current fold
                X_train_k = X_train[top_features]
                X_test_k = X_test[top_features]

                # Perform grid search
                grid_search = GridSearchCV(
                    model_info["model"],
                    model_info["param_grid"],
                    scoring="accuracy",
                    cv=3,
                    n_jobs=-1,
                )
                grid_search.fit(X_train_k, y_train)

                # Evaluate on test set
                best_estimator = grid_search.best_estimator_
                y_pred = best_estimator.predict(X_test_k)
                accuracy = accuracy_score(y_test, y_pred)
                balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

                # Accumulate scores
                total_accuracy += accuracy
                total_balanced_accuracy += balanced_accuracy

            # Average scores over all folds
            avg_accuracy = total_accuracy / len(folds)
            avg_balanced_accuracy = total_balanced_accuracy / len(folds)

            print(f"{model_name}: Accuracy={avg_accuracy:.4f}, Balanced Accuracy={avg_balanced_accuracy:.4f} \n")

            # Update best model if this model performs better
            if (avg_accuracy > best_accuracy) and (avg_balanced_accuracy > best_balanced_accuracy):
                best_model_name = model_name
                best_model = grid_search.best_estimator_
                best_accuracy = avg_accuracy
                best_balanced_accuracy = avg_balanced_accuracy
                best_params = grid_search.best_params_

        # Store results for this k value
        results[k] = {
            "Best Model": best_model_name,
            "Best Accuracy": best_accuracy,
            "Best Balanced Accuracy": best_balanced_accuracy,
            "Best Parameters": best_params,
            "Best Model Object": best_model,
        }

    return results

In [46]:
k_values_list = list(filtered_features.keys())
print(k_values_list)

[100, 200, 400, 800]


In [47]:
# Example call
results = evaluate_tree_models(
    folds=folds,
    k_features=k_values_list,
    feature_columns=feature_columns,
    target_column=target_column,
    top_features_dict=filtered_features
)

# Display the best model for each k
for k, result in results.items():
    print(f"Top {k} features:")
    print(f"Best Model: {result['Best Model']}")
    print(f"Best Accuracy: {result['Best Accuracy']:.4f}")
    print(f"Best Balanced Accuracy: {result['Best Balanced Accuracy']:.4f}")
    print(f"Best Parameters: {result['Best Parameters']}")
    print("\n")

Evaluating models for top 100 features...
Training Decision Tree...


  pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling par

Decision Tree: Accuracy=0.7769, Balanced Accuracy=0.5491 

Training Random Forest...
Random Forest: Accuracy=0.8231, Balanced Accuracy=0.5000 

Training Gradient Boosting...
Gradient Boosting: Accuracy=0.8231, Balanced Accuracy=0.5614 

Evaluating models for top 200 features...
Training Decision Tree...
Decision Tree: Accuracy=0.7692, Balanced Accuracy=0.5391 

Training Random Forest...
Random Forest: Accuracy=0.8231, Balanced Accuracy=0.5000 

Training Gradient Boosting...
Gradient Boosting: Accuracy=0.8154, Balanced Accuracy=0.4955 

Evaluating models for top 400 features...
Training Decision Tree...
Decision Tree: Accuracy=0.7308, Balanced Accuracy=0.4848 

Training Random Forest...
Random Forest: Accuracy=0.8231, Balanced Accuracy=0.5000 

Training Gradient Boosting...
Gradient Boosting: Accuracy=0.8231, Balanced Accuracy=0.5000 

Evaluating models for top 800 features...
Training Decision Tree...
Decision Tree: Accuracy=0.7385, Balanced Accuracy=0.4698 

Training Random Forest...


In [48]:
import joblib

# Iterate over results to save the best model for each k
for k, result in results.items():
    best_model = result["Best Model Object"]  # Get the best model object
    filename = f"partc_best_model_top_{k}_features.pkl"  # Define a filename
    joblib.dump(best_model, filename)  # Save the model
    print(f"Saved best model for top {k} features as '{filename}'.")

Saved best model for top 100 features as 'partc_best_model_top_100_features.pkl'.
Saved best model for top 200 features as 'partc_best_model_top_200_features.pkl'.
Saved best model for top 400 features as 'partc_best_model_top_400_features.pkl'.
Saved best model for top 800 features as 'partc_best_model_top_800_features.pkl'.


## Running Deep Learning Models

In order to run deep learning models such as `Conv1D` and `LSTM`, we first need to scale and reshape the input dataset because these models require their inputs to be in a specific format and shape.

In [49]:
from sklearn.preprocessing import MinMaxScaler

def prepare_data_for_dl(X, top_features):
    """
    Scales and reshapes data for Conv1D and LSTM models.

    Parameters:
        X (pd.DataFrame): Input features.
        top_features (list): List of selected top features.

    Returns:
        np.ndarray: Scaled and reshaped data.
    """
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X[top_features])
    
    # Reshape for Conv1D and LSTM: (samples, timesteps, features)
    X_reshaped = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)
    return X_reshaped

The above function will be used before training and evaluating the models on each participant-independent fold.

In [50]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def train_and_evaluate_dl_models(folds, top_features_dict, model_type="Conv1D"):
    """
    Trains and evaluates Conv1D or LSTM models on participant-independent folds without hyperparameter tuning.
    Dataset loading is handled via TensorFlow tensor slices.

    Parameters:
        folds (list of tuples): Train-test splits from GroupKFold.
        top_features_dict (dict): Dictionary of top features.
        model_type (str): "Conv1D" or "LSTM".

    Returns:
        dict: Results for each k value containing the model and its performance metrics.
    """
    results = {}

    for k, top_features in top_features_dict.items():
        print(f"Evaluating models for top {k} features...")
        
        best_model = None
        best_accuracy = 0
        best_balanced_accuracy = 0

        # Iterate over folds
        for fold, (X_train, X_test, y_train, y_test) in enumerate(folds):
            print(f"Training on Fold {fold + 1}...")

            # Filter dataset for top features
            X_train = X_train[top_features]
            X_test = X_test[top_features]

            # Create TensorFlow datasets
            train_ds = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values)).batch(32).prefetch(tf.data.AUTOTUNE)
            test_ds = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values)).batch(32).prefetch(tf.data.AUTOTUNE)

            # Prepare input shape
            input_shape = (len(top_features), 1)  # (timesteps, features)

            # Build model
            if model_type == "Conv1D":
                model = Sequential([
                    Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape),
                    MaxPooling1D(pool_size=2),
                    Dropout(0.2),
                    Flatten(),
                    Dense(32, activation='relu'),
                    Dense(1, activation='sigmoid')
                ])
            elif model_type == "LSTM":
                model = Sequential([
                    LSTM(64, return_sequences=True, input_shape=input_shape),
                    Dropout(0.2),
                    LSTM(32),
                    Dense(1, activation='sigmoid')
                ])
            else:
                raise ValueError("Invalid model_type. Choose 'Conv1D' or 'LSTM'.")

            # Compile model
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

            # Train model
            model.fit(train_ds, epochs=20, verbose=0)
            print("Evaluating on the test dataset....")

            # Evaluate on test set
            y_pred = (model.predict(test_ds) > 0.5).astype(int)
            acc = accuracy_score(y_test, y_pred)
            bal_acc = balanced_accuracy_score(y_test, y_pred)

            print(f"Fold {fold + 1}: Accuracy={acc:.4f}, Balanced Accuracy={bal_acc:.4f}\n")

            # Update best model if this model performs better
            if acc > best_accuracy and bal_acc > best_balanced_accuracy:
                best_model = model
                best_accuracy = acc
                best_balanced_accuracy = bal_acc

        # Store results for this k value
        results[k] = {
            "Best Model": best_model,
            "Best Accuracy": best_accuracy,
            "Best Balanced Accuracy": best_balanced_accuracy,
        }
        print(f"Best Model for top {k} features: Accuracy={best_accuracy:.4f}, Balanced Accuracy={best_balanced_accuracy:.4f}\n")

    return results

In [51]:
# Call the function for Conv1D
conv1d_results = train_and_evaluate_dl_models(
    folds=folds,
    top_features_dict=filtered_features,
    model_type="Conv1D"
)

# Display results
for k, result in conv1d_results.items():
    print(f"Top {k} features:")
    print(f"Best Model: {result['Best Model']}")
    print(f"Best Accuracy: {result['Best Accuracy']:.4f}")
    print(f"Best Balanced Accuracy: {result['Best Balanced Accuracy']:.4f}\n")

Evaluating models for top 100 features...
Training on Fold 1...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1733247660.426472     207 service.cc:145] XLA service 0x77fd38007750 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733247660.426523     207 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1733247663.000624     207 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step
Fold 1: Accuracy=0.7308, Balanced Accuracy=0.4318

Training on Fold 2...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Fold 2: Accuracy=0.8077, Balanced Accuracy=0.4773

Training on Fold 3...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
Fold 3: Accuracy=0.8462, Balanced Accuracy=0.6667

Training on Fold 4...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Fold 4: Accuracy=0.8077, Balanced Accuracy=0.4773

Training on Fold 5...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
Fold 5: Accuracy=0.8077, Balanced Accuracy=0.5000

Best Model for top 100 features: Accuracy=0.8462, Balanced Accuracy=0.6667

Evaluating models for top 200 features...
Training on Fold 1...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
Fold 1: Accuracy=0.7308, Balanced Accuracy=0.4318

Training on Fold 2...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
Fold 2: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 3...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Fold 3: Accuracy=0.8077, Balanced Accuracy=0.5833

Training on Fold 4...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Fold 4: Accuracy=0.8077, Balanced Accuracy=0.5795

Training on Fold 5...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
Fold 5: Accuracy=0.8077, Balanced Accuracy=0.5000

Best Model for top 200 features: Accuracy=0.8462, Balanced Accuracy=0.5000

Evaluating models for top 400 features...
Training on Fold 1...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step
Fold 1: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 2...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
Fold 2: Accuracy=0.8077, Balanced Accuracy=0.4773

Training on Fold 3...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Fold 3: Accuracy=0.7692, Balanced Accuracy=0.5000

Training on Fold 4...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Fold 4: Accuracy=0.7692, Balanced Accuracy=0.4545

Training on Fold 5...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Fold 5: Accuracy=0.7692, Balanced Accuracy=0.4762

Best Model for top 400 features: Accuracy=0.8462, Balanced Accuracy=0.5000

Evaluating models for top 800 features...
Training on Fold 1...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step
Fold 1: Accuracy=0.8846, Balanced Accuracy=0.6250

Training on Fold 2...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
Fold 2: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 3...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
Fold 3: Accuracy=0.8462, Balanced Accuracy=0.6667

Training on Fold 4...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
Fold 4: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 5...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Fold 5: Accuracy=0.8077, Balanced Accuracy=0.5000

Best Model for top 800 features: Accuracy=0.8846, Balanced Accuracy=0.6250

Top 100 features:
Best Model: <Sequential name=sequential_2, built=True>
Best Accuracy: 0.8462
Best Balanced Accuracy: 0.6667

Top 200 features:
Best Model: <Sequential name=sequential_6, built=True>
Best Accuracy: 0.8462
Best Balanced Accuracy: 0.5000

Top 400 features:
Best Model: <Sequential name=sequential_10, built=True>
Best Accuracy: 0.8462
Best Balanced Accuracy: 0.5000

Top 800 features:
Best Model: <Sequential name=sequential_15, built=True>
Best Accuracy: 0.8846
Best Balanced Accuracy: 0.6250



In [52]:
# Call the function for LSTM
LSTM_results = train_and_evaluate_dl_models(
    folds=folds,
    top_features_dict=filtered_features,
    model_type="LSTM"
)

# Display results
for k, result in LSTM_results.items():
    print(f"Top {k} features:")
    print(f"Best LSTM Model: {result['Best Model']}")
    print(f"Best Accuracy: {result['Best Accuracy']:.4f}")
    print(f"Best Balanced Accuracy: {result['Best Balanced Accuracy']:.4f}\n")

Evaluating models for top 100 features...
Training on Fold 1...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
Fold 1: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 2...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
Fold 2: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 3...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
Fold 3: Accuracy=0.7692, Balanced Accuracy=0.5000

Training on Fold 4...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
Fold 4: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 5...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
Fold 5: Accuracy=0.8077, Balanced Accuracy=0.5000

Best Model for top 100 features: Accuracy=0.8462, Balanced Accuracy=0.5000

Evaluating models for top 200 features...
Training on Fold 1...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
Fold 1: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 2...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
Fold 2: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 3...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
Fold 3: Accuracy=0.7692, Balanced Accuracy=0.5000

Training on Fold 4...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
Fold 4: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 5...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
Fold 5: Accuracy=0.8077, Balanced Accuracy=0.5000

Best Model for top 200 features: Accuracy=0.8462, Balanced Accuracy=0.5000

Evaluating models for top 400 features...
Training on Fold 1...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
Fold 1: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 2...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
Fold 2: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 3...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
Fold 3: Accuracy=0.7692, Balanced Accuracy=0.5000

Training on Fold 4...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
Fold 4: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 5...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
Fold 5: Accuracy=0.8077, Balanced Accuracy=0.5000

Best Model for top 400 features: Accuracy=0.8462, Balanced Accuracy=0.5000

Evaluating models for top 800 features...
Training on Fold 1...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
Fold 1: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 2...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
Fold 2: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 3...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
Fold 3: Accuracy=0.7692, Balanced Accuracy=0.5000

Training on Fold 4...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step
Fold 4: Accuracy=0.8462, Balanced Accuracy=0.5000

Training on Fold 5...


  super().__init__(**kwargs)


Evaluating on the test dataset....
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
Fold 5: Accuracy=0.8077, Balanced Accuracy=0.5000

Best Model for top 800 features: Accuracy=0.8462, Balanced Accuracy=0.5000

Top 100 features:
Best LSTM Model: <Sequential name=sequential_20, built=True>
Best Accuracy: 0.8462
Best Balanced Accuracy: 0.5000

Top 200 features:
Best LSTM Model: <Sequential name=sequential_25, built=True>
Best Accuracy: 0.8462
Best Balanced Accuracy: 0.5000

Top 400 features:
Best LSTM Model: <Sequential name=sequential_30, built=True>
Best Accuracy: 0.8462
Best Balanced Accuracy: 0.5000

Top 800 features:
Best LSTM Model: <Sequential name=sequential_35, built=True>
Best Accuracy: 0.8462
Best Balanced Accuracy: 0.5000



In [53]:
# Directory to save models
save_dir_conv1d = "/kaggle/working/saved_models_conv1D"
os.makedirs(save_dir_conv1d, exist_ok=True)

# Iterate through the results dictionary
for k, result in conv1d_results.items():
    best_model = result["Best Model"]
    model_path = os.path.join(save_dir_conv1d, f"partc_best_dl_conv1d_model_top_{k}_features.pkl")
    joblib.dump(best_model, model_path)  # Save the model
    print(f"Saved best model for top {k} features at: {model_path}")

Saved best model for top 100 features at: /kaggle/working/saved_models_conv1D/partc_best_dl_conv1d_model_top_100_features.pkl
Saved best model for top 200 features at: /kaggle/working/saved_models_conv1D/partc_best_dl_conv1d_model_top_200_features.pkl
Saved best model for top 400 features at: /kaggle/working/saved_models_conv1D/partc_best_dl_conv1d_model_top_400_features.pkl
Saved best model for top 800 features at: /kaggle/working/saved_models_conv1D/partc_best_dl_conv1d_model_top_800_features.pkl


In [54]:
# Directory to save models
save_dir_lstm = "/kaggle/working/saved_models_LSTM"
os.makedirs(save_dir_lstm, exist_ok=True)

# Iterate through the results dictionary
for k, result in LSTM_results.items():
    best_model = result["Best Model"]
    model_path = os.path.join(save_dir_lstm, f"partc_best_dl_lstm_model_top_{k}_features.pkl")
    joblib.dump(best_model, model_path)  # Save the model
    print(f"Saved best model for top {k} features at: {model_path}")

Saved best model for top 100 features at: /kaggle/working/saved_models_LSTM/partc_best_dl_lstm_model_top_100_features.pkl
Saved best model for top 200 features at: /kaggle/working/saved_models_LSTM/partc_best_dl_lstm_model_top_200_features.pkl
Saved best model for top 400 features at: /kaggle/working/saved_models_LSTM/partc_best_dl_lstm_model_top_400_features.pkl
Saved best model for top 800 features at: /kaggle/working/saved_models_LSTM/partc_best_dl_lstm_model_top_800_features.pkl
