### Preprocessing

In [1]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk import word_tokenize, pos_tag
from collections import Counter
import numpy as np

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

!pip install vaderSentiment

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
# Load transcripts

# Get Participant IDs
# P2 is missing
transcripts = pd.read_csv("transcripts.csv", header=None)
participant_ids = transcripts[0]
print("Participant IDs:", participant_ids.tolist())

Participant IDs: ['p1', 'p10', 'p11', 'p12', 'p13', 'p14', 'p15', 'p16', 'p17', 'p20', 'p21', 'p22', 'p24', 'p25', 'p27', 'p29', 'p3', 'p30', 'p31', 'p32', 'p33', 'p34', 'p35', 'p37', 'p4', 'p42', 'p43', 'p44', 'p45', 'p47', 'p48', 'p49', 'p5', 'p50', 'p52', 'p53', 'p55', 'p56', 'p57', 'p58', 'p59', 'p6', 'p60', 'p61', 'p62', 'p63', 'p64', 'p65', 'p66', 'p67', 'p69', 'p7', 'p70', 'p71', 'p72', 'p73', 'p74', 'p76', 'p77', 'p78', 'p79', 'p8', 'p80', 'p81', 'p83', 'p84', 'p85', 'p86', 'p89', 'pp1', 'pp10', 'pp11', 'pp12', 'pp13', 'pp14', 'pp15', 'pp16', 'pp17', 'pp20', 'pp21', 'pp22', 'pp24', 'pp25', 'pp27', 'pp29', 'pp3', 'pp30', 'pp31', 'pp32', 'pp33', 'pp34', 'pp35', 'pp37', 'pp4', 'pp42', 'pp43', 'pp44', 'pp45', 'pp47', 'pp48', 'pp49', 'pp5', 'pp50', 'pp52', 'pp53', 'pp55', 'pp56', 'pp57', 'pp58', 'pp59', 'pp6', 'pp60', 'pp61', 'pp62', 'pp63', 'pp64', 'pp65', 'pp66', 'pp67', 'pp69', 'pp7', 'pp70', 'pp71', 'pp72', 'pp73', 'pp74', 'pp76', 'pp77', 'pp78', 'pp79', 'pp8', 'pp80', 'pp81', '

In [3]:
# Function to extract only interviewee text
def extract_text(text, sep='Interviewee:'):
    segments = text.split('|')
    interviewee_lines = [seg.replace(sep, '').strip()
                         for seg in segments if seg.startswith(sep)]
    return ' '.join(interviewee_lines)

# Apply extraction to all transcripts
interviewee_texts = transcripts[1].astype(str).apply(lambda x: extract_text(x, sep='Interviewee:'))
print("Extracted interviewee texts:", interviewee_texts.head())

Extracted interviewee texts: 0    Im pretty good. ok  uhm  so have you looked at...
1    Great  how about you? I'm a little [???] by th...
2    Uhh  I’m a junior at MIT  uhh I’m double major...
3    I'm good  how are you? Ok  so  I'm a Junior at...
4    Good. Ok  umm  I'm currently a junior at M.I.T...
Name: 1, dtype: object


In [4]:
# Combine IDs and extracted text into a single DataFrame
extracted_df = pd.DataFrame({
    'participant_id': participant_ids,
    'interviewee_text': interviewee_texts
})

print(extracted_df.head())

  participant_id                                   interviewee_text
0             p1  Im pretty good. ok  uhm  so have you looked at...
1            p10  Great  how about you? I'm a little [???] by th...
2            p11  Uhh  I’m a junior at MIT  uhh I’m double major...
3            p12  I'm good  how are you? Ok  so  I'm a Junior at...
4            p13  Good. Ok  umm  I'm currently a junior at M.I.T...


In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
import nltk

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
#filler_words = {'uh', 'uhh',' uhm', 'umm', 'like', 'ah', 'oh', 'hmm', 'mhhmm', 'ok', 'okay'}
#stop_words.update(filler_words)

lemmatizer = WordNetLemmatizer()
# Helper: map nltk POS tags to wordnet POS
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    tokens = [t for t in tokens if t not in stop_words]
    # POS tagging
    pos_tags = nltk.pos_tag(tokens)
    # Lemmatize with POS
    tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    return ' '.join(tokens)

In [6]:
# Apply preprocessing for texts
processed_interviewee_texts = interviewee_texts.apply(preprocess_text)
print("Processed interviewee texts:", processed_interviewee_texts.head())

Processed interviewee texts: 0    im pretty good ok uhm look resume alright gues...
1    great little resurgence hurricane yeah half wa...
2    uhh junior mit uhh double major management bio...
3    good ok junior mit major mechanical engineerin...
4    good ok umm currently junior study electrical ...
Name: 1, dtype: object


In [7]:
# Add to DataFrame
extracted_df["processed_interviewee_text"] = processed_interviewee_texts
extracted_df.head()

Unnamed: 0,participant_id,interviewee_text,processed_interviewee_text
0,p1,Im pretty good. ok uhm so have you looked at...,im pretty good ok uhm look resume alright gues...
1,p10,Great how about you? I'm a little [???] by th...,great little resurgence hurricane yeah half wa...
2,p11,Uhh I’m a junior at MIT uhh I’m double major...,uhh junior mit uhh double major management bio...
3,p12,I'm good how are you? Ok so I'm a Junior at...,good ok junior mit major mechanical engineerin...
4,p13,Good. Ok umm I'm currently a junior at M.I.T...,good ok umm currently junior study electrical ...


In [8]:
import re

#Order Dataframe -->  p1,pp1,p2,pp2.....
def split_id_interleaved(pid):
    pid = str(pid)
    match = re.match(r'([A-Za-z]+)(\d+)$', pid)
    if match:
        prefix = match.group(1)
        number = int(match.group(2))
        return number, prefix  # number FIRST, prefix SECOND
    else:
        return float('inf'), pid  # fallback

# Apply new ordering
extracted_df = extracted_df.sort_values(
    by="participant_id",
    key=lambda col: col.apply(split_id_interleaved)
).reset_index(drop=True)

print(extracted_df.head())


  participant_id                                   interviewee_text  \
0             p1  Im pretty good. ok  uhm  so have you looked at...   
1            pp1  Umm  so I’m a computer science and biology maj...   
2             p3  Good. You? Yeah  sorry  I was- think I was lat...   
3            pp3  Hah.. uh yeah..so.. uhm I'm chase and I guess ...   
4             p4  Pretty good. Uhh  so I'm a junior uhh studying...   

                          processed_interviewee_text  
0  im pretty good ok uhm look resume alright gues...  
1  umm computer science biology major mit junior ...  
2  good yeah sorry think late totally like mess t...  
3  hah uh yeah uhm chase guess uh aww man okay do...  
4  pretty good uhh junior uhh study aeronautics a...  


In [9]:
scores = pd.read_csv("scores.csv")
scores.rename(columns={'Participant': 'participant_id'}, inplace=True)

# Add Score Columns to Dataframe
score_cols = ["Overall", "Excited"]

#Merged Dataframe has scores appended
merged_df = extracted_df.merge(scores, on="participant_id", how="inner")
merged_df = merged_df[[col for col in merged_df.columns if col not in score_cols] + score_cols]

print(merged_df.head())


  participant_id                                   interviewee_text  \
0             p1  Im pretty good. ok  uhm  so have you looked at...   
1            pp1  Umm  so I’m a computer science and biology maj...   
2             p3  Good. You? Yeah  sorry  I was- think I was lat...   
3            pp3  Hah.. uh yeah..so.. uhm I'm chase and I guess ...   
4             p4  Pretty good. Uhh  so I'm a junior uhh studying...   

                          processed_interviewee_text   Overall   Excited  
0  im pretty good ok uhm look resume alright gues...  5.297316  5.043890  
1  umm computer science biology major mit junior ...  4.951525  4.223099  
2  good yeah sorry think late totally like mess t...  4.414892  5.601586  
3  hah uh yeah uhm chase guess uh aww man okay do...  5.011269  5.894389  
4  pretty good uhh junior uhh study aeronautics a...  4.494494  4.261343  


In [10]:
from sklearn.model_selection import train_test_split

# Data Split
merged_df_copy = merged_df.copy()

# Labels
y = merged_df_copy[['Overall', 'Excited']]

# Features
X = merged_df_copy.drop(['interviewee_text', 'Overall', 'Excited'], axis=1)

# 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

### Feature Engineering

In [11]:
# 1️. Extract TF-IDF Features
tfidf_vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1,3))

# Fit on training set, transform both
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train["processed_interviewee_text"])
X_test_tfidf  = tfidf_vectorizer.transform(X_test["processed_interviewee_text"])

print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Test  TF-IDF shape:", X_test_tfidf.shape)

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

Train TF-IDF shape: (110, 500)
Test  TF-IDF shape: (28, 500)


In [12]:
from nltk import word_tokenize, pos_tag
from collections import Counter

# 2️. Part-of-Speech (POS) distribution features
def pos_distribution(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    counts = Counter(tag for word, tag in pos_tags)
    total = sum(counts.values())
    # Normalize counts to get distribution
    for tag in counts:
        counts[tag] /= total
    return counts

# Training
pos_features_train_df = X_train["processed_interviewee_text"].apply(pos_distribution).apply(pd.Series).fillna(0)

# Test
pos_features_test_df = X_test["processed_interviewee_text"].apply(pos_distribution).apply(pd.Series).fillna(0)
pos_features_test_df = pos_features_test_df.reindex(columns=pos_features_train_df.columns, fill_value=0)

print("POS train feature shape:", pos_features_train_df.shape)
print("POS train feature columns:", pos_features_train_df.columns.tolist())


POS train feature shape: (110, 31)
POS train feature columns: ['NN', 'JJ', 'VBD', 'RB', 'CD', 'VBN', 'VB', 'NNS', 'VBP', 'CC', 'IN', 'MD', 'VBZ', 'UH', 'VBG', 'NNP', 'JJS', 'WDT', 'RBR', 'DT', 'FW', 'RBS', 'RP', 'WP$', 'JJR', 'TO', 'PRP', 'WRB', 'WP', 'EX', 'NNPS']


In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import BertTokenizer, BertModel
import torch

#3 VaderSentiment Features
analyzer = SentimentIntensityAnalyzer()

def compute_sentiment_scores(text):
    """
    Compute VADER sentiment scores:
      - pos
      - neu
      - neg
      - compound (interpretable overall sentiment)
    """
    scores = analyzer.polarity_scores(text)
    return [
        scores["pos"],
        scores["neu"],
        scores["neg"],
        scores["compound"]
    ]

# Apply to training set
sentiment_train_list = X_train["processed_interviewee_text"].apply(compute_sentiment_scores)
sentiment_train_df = pd.DataFrame(
    sentiment_train_list.tolist(),
    columns=["sent_pos", "sent_neu", "sent_neg", "sent_compound"]
)

# Apply to test set
sentiment_test_list = X_test["processed_interviewee_text"].apply(compute_sentiment_scores)
sentiment_test_df = pd.DataFrame(
    sentiment_test_list.tolist(),
    columns=["sent_pos", "sent_neu", "sent_neg", "sent_compound"]
)

print("Train sentiment shape:", sentiment_train_df.shape)
print("Test  sentiment shape:", sentiment_test_df.shape)

Train sentiment shape: (110, 4)
Test  sentiment shape: (28, 4)


In [14]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# 4 BERT features
def get_bert_embedding(text):
    """
    Returns the mean-pooled BERT embedding (768-dim vector).
    """
    tokens = tokenizer(
        text,
        max_length=256,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = bert_model(**tokens)

    # Extract last hidden states and mean-pool over tokens
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

# BERT embeddings
bert_train_list = X_train["processed_interviewee_text"].apply(get_bert_embedding)
bert_train_df = pd.DataFrame(bert_train_list.tolist(), columns=[f"bert_{i}" for i in range(768)])
bert_test_list = X_test["processed_interviewee_text"].apply(get_bert_embedding)
bert_test_df = pd.DataFrame(bert_test_list.tolist(), columns=bert_train_df.columns)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [15]:
bert_train_df.head()

Unnamed: 0,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,-0.17006,-0.021468,0.612173,-0.022811,0.525601,-0.125874,0.210311,0.151814,-0.075346,-0.472012,...,-0.228448,-0.48336,0.211241,0.142209,-0.249702,0.151865,-0.366414,-0.207798,-0.060547,-0.166272
1,0.17197,0.034811,0.552739,0.043057,0.349407,-0.186388,0.161451,0.235328,-0.067839,-0.390464,...,-0.155009,-0.260521,0.212724,0.002184,-0.017616,0.142934,-0.323157,-0.264188,-0.039839,-0.226649
2,-0.32417,-0.065714,0.613833,-0.102116,0.31481,-0.24113,0.03939,0.182734,-0.069243,-0.379956,...,-0.318223,-0.504128,0.378146,0.264115,-0.192662,0.026738,-0.399988,-0.34645,-0.117779,-0.27039
3,-0.217727,0.100089,0.518102,-0.057981,0.367647,-0.205912,0.17798,0.128972,-0.090992,-0.43633,...,-0.202131,-0.566932,0.336202,0.152205,-0.244209,0.007454,-0.447997,-0.305615,-0.118987,-0.251138
4,-0.015781,-0.022592,0.548214,-0.131978,0.459008,-0.240705,0.226786,0.222311,-0.044434,-0.502745,...,-0.192404,-0.439915,0.238232,0.110655,-0.135923,0.097959,-0.325973,-0.343204,-0.042686,-0.198769


In [16]:
from scipy.sparse import hstack, csr_matrix

# Merge all features together

# Training features
X_train_combined = hstack([
    X_train_tfidf,
    csr_matrix(pos_features_train_df.values),
    csr_matrix(sentiment_train_df.values),
    csr_matrix(bert_train_df.values)
])

# Test features
X_test_combined = hstack([
    X_test_tfidf,
    csr_matrix(pos_features_test_df.reindex(columns=pos_features_train_df.columns, fill_value=0).values),
    csr_matrix(sentiment_test_df.values),
    csr_matrix(bert_test_df.values)
])

# For reverse feature name mapping (BERT)
feature_names = list(tfidf_feature_names) + list(pos_features_train_df.columns) + list(sentiment_train_df.columns) + list(bert_train_df.columns)

print("Train combined shape:", X_train_combined.shape)
print("Test  combined shape:", X_test_combined.shape)


Train combined shape: (110, 1303)
Test  combined shape: (28, 1303)


### Feature Selection (FS)

In [17]:
from sklearn.feature_selection import mutual_info_regression
import numpy as np

def select_top_k_mutual_info(X_train, X_test, y_train, feature_names, k=20, random_state=42):
    """
    Select top-k features by mutual information with y_train.

    Args:
        X_train: scipy sparse or dense matrix of training features
        X_test: scipy sparse or dense matrix of test features
        y_train: array-like target values for training set
        feature_names: list of feature names corresponding to columns in X_train
        k: number of top features to select
        random_state: random seed for mutual_info_regression

    Returns:
        X_train_top: training features (top-k)
        X_test_top: test features (top-k)
        top_features: list of tuples (feature_name, mutual_info_score)
    """
    # Convert sparse matrix to dense if necessary
    if hasattr(X_train, "toarray"):
        X_train_dense = X_train.toarray()
    else:
        X_train_dense = X_train

    mi_scores = mutual_info_regression(X_train_dense, y_train, random_state=random_state)
    mi_scores = np.array(mi_scores)

    top_k_idx = np.argsort(mi_scores)[-k:][::-1]

    # Select columns in train and test
    X_train_top = X_train[:, top_k_idx]
    X_test_top = X_test[:, top_k_idx]

    # Prepare feature info
    top_features = [(feature_names[i], mi_scores[i]) for i in top_k_idx]

    print(f"\nTop {k} features by mutual information:")
    for name, score in top_features:
        print(f"{name}: MI score={score:.4f}")

    return X_train_top, X_test_top, top_features


In [18]:
from scipy.stats import pearsonr
import numpy as np

def select_top_k_correlation(X_train, X_test, y_train, feature_names, k=20):
    """
    Select top-k features by absolute Pearson correlation with y_train.

    Args:
        X_train: scipy sparse or dense matrix of training features
        X_test: scipy sparse or dense matrix of test features
        y_train: array-like target values for training set
        feature_names: list of feature names corresponding to columns in X_train
        k: number of top features to select

    Returns:
        X_train_top: training features (top-k)
        X_test_top: test features (top-k)
        top_features: list of tuples (feature_name, correlation, direction)
    """
    correlations = []
    for i in range(X_train.shape[1]):
        col = X_train[:, i]
        # Convert sparse column to dense
        if hasattr(col, "toarray"):
            col = col.toarray().ravel()
        else:
            col = col.ravel()
        corr, _ = pearsonr(col, y_train)
        correlations.append(corr)

    correlations = np.array(correlations)
    top_k_idx = np.argsort(np.abs(correlations))[-k:][::-1]

    # Select columns in train and test
    X_train_top = X_train[:, top_k_idx]
    X_test_top = X_test[:, top_k_idx]

    # Prepare feature info
    top_features = [
        (feature_names[i], correlations[i], "positive" if correlations[i] > 0 else "negative")
        for i in top_k_idx
    ]

    print(f"\nTop {k} features by correlation:")
    for name, corr, direction in top_features:
        print(f"{name}: corr={corr:.3f}, {direction}")

    return X_train_top, X_test_top, top_features


Naming Convention (Feel free to rename this) :

'Default' features = Top k features selected based on Correlation \
MI features = Top k features selected based on MI

Each Feature Engineering Technique has 4 splits

Ex : \
X_train_tfidf_top_overall
 - Top TF-IDF features selected by correlation with the Overall score.

X_train_tfidf_topMI_overall
 - Top TF-IDF features selected using Mutual Information (MI) with the Overall score.

X_train_tfidf_top_excite:
 - Top TF-IDF features selected by correlation with the Excited score.

X_train_tfidf_topMI_excite:
 - Top TF-IDF features selected using Mutual Information (MI) with the Excited score.


#### FS - TFIDF

In [19]:
X_train_tfidf_top_overall, X_test_tfidf_top_overall, tfidf_top_features_overall = select_top_k_correlation(
    X_train_tfidf,
    X_test_tfidf,
    y_train["Overall"].values,
    list(tfidf_feature_names),
    k=30
)


Top 30 features by correlation:
think: corr=0.395, positive
bit: corr=0.389, positive
also like: corr=-0.337, negative
aspect: corr=0.336, positive
chemical: corr=0.331, positive
like um: corr=-0.318, negative
team player: corr=0.310, positive
student: corr=0.309, positive
little bit: corr=0.302, positive
player: corr=0.299, positive
try: corr=0.288, positive
person: corr=0.280, positive
company: corr=0.279, positive
relate: corr=0.278, positive
kind: corr=0.274, positive
lot: corr=0.268, positive
tell: corr=-0.266, negative
different: corr=0.264, positive
consult: corr=0.263, positive
come: corr=0.258, positive
like: corr=-0.256, negative
little: corr=0.255, positive
find: corr=0.254, positive
speak: corr=-0.253, negative
program: corr=0.251, positive
mention: corr=0.251, positive
leadership: corr=0.246, positive
like really: corr=-0.245, negative
back: corr=0.240, positive
like like: corr=-0.238, negative


In [20]:
X_train_tfidf_topMI_overall, X_test_tfidf_topMI_overall, tfidf_topMI_features_overall = select_top_k_mutual_info(
    X_train_tfidf,
    X_test_tfidf,
    y_train["Overall"].values,
    list(tfidf_feature_names),
    k=30
)


Top 30 features by mutual information:
bit: MI score=0.1272
back: MI score=0.1196
need: MI score=0.1119
another: MI score=0.1101
well: MI score=0.1021
year: MI score=0.1006
yeah: MI score=0.0927
plan: MI score=0.0926
question: MI score=0.0923
hmm: MI score=0.0912
dorm: MI score=0.0899
think get: MI score=0.0813
company: MI score=0.0789
use: MI score=0.0788
get: MI score=0.0787
thank: MI score=0.0781
really like: MI score=0.0781
team work: MI score=0.0775
generally: MI score=0.0768
future: MI score=0.0767
really: MI score=0.0747
happen: MI score=0.0724
communicate: MI score=0.0720
um junior: MI score=0.0717
think leader: MI score=0.0684
new: MI score=0.0680
leadership: MI score=0.0678
whole: MI score=0.0678
lot: MI score=0.0671
think: MI score=0.0668


In [21]:
X_train_tfidf_top_excite, X_test_tfidf_top_excite, tfidf_top_features_excite = select_top_k_correlation(
    X_train_tfidf,
    X_test_tfidf,
    y_train["Excited"].values,
    list(tfidf_feature_names),
    k=30
)


Top 30 features by correlation:
chemical: corr=0.315, positive
find: corr=0.303, positive
tell: corr=-0.293, negative
big: corr=-0.264, negative
back: corr=0.263, positive
sport: corr=-0.261, negative
try: corr=0.261, positive
player: corr=0.259, positive
aspect: corr=0.253, positive
quickly: corr=-0.251, negative
love: corr=0.250, positive
definitely: corr=0.248, positive
team player: corr=0.244, positive
really: corr=0.240, positive
aerospace: corr=-0.236, negative
important: corr=0.235, positive
know: corr=0.235, positive
year: corr=-0.235, negative
lot: corr=0.234, positive
may: corr=0.230, positive
bit: corr=0.229, positive
person: corr=0.229, positive
small: corr=-0.222, negative
people: corr=0.221, positive
excite: corr=0.216, positive
cool: corr=0.215, positive
last year: corr=-0.215, negative
least: corr=0.214, positive
school: corr=-0.213, negative
weaknesses: corr=-0.211, negative


In [22]:
X_train_tfidf_topMI_excite, X_test_tfidf_topMI_excite, tfidf_topMI_features_excite = select_top_k_mutual_info(
    X_train_tfidf,
    X_test_tfidf,
    y_train["Excited"].values,
    list(tfidf_feature_names),
    k=30
)


Top 30 features by mutual information:
junior: MI score=0.2001
much: MI score=0.1645
come: MI score=0.1255
actually: MI score=0.1237
learn: MI score=0.1154
look: MI score=0.1145
class: MI score=0.1142
probably: MI score=0.1133
um like: MI score=0.1111
bit: MI score=0.1088
mean: MI score=0.1025
organize: MI score=0.1007
really: MI score=0.0985
make sure: MI score=0.0960
sometimes: MI score=0.0932
definitely: MI score=0.0916
basically: MI score=0.0855
new: MI score=0.0843
campus: MI score=0.0841
towards: MI score=0.0768
interested: MI score=0.0763
team work: MI score=0.0738
set: MI score=0.0737
feel like: MI score=0.0735
love: MI score=0.0725
back: MI score=0.0704
school: MI score=0.0694
junior mit: MI score=0.0694
experience: MI score=0.0688
role: MI score=0.0686


#### FS - POS

In [23]:
X_train_pos_top_overall, X_test_pos_top_overall, pos_top_features_overall = select_top_k_correlation(
    pos_features_train_df.values,
    pos_features_test_df.values,
    y_train["Overall"].values,
    list(pos_features_train_df.columns),
    k=30
)


Top 30 features by correlation:
IN: corr=-0.345, negative
NNP: corr=-0.221, negative
NN: corr=0.199, positive
EX: corr=0.174, positive
VB: corr=0.159, positive
VBZ: corr=-0.158, negative
VBP: corr=0.155, positive
JJ: corr=-0.152, negative
DT: corr=0.149, positive
CC: corr=0.146, positive
RBR: corr=0.145, positive
JJS: corr=0.136, positive
WDT: corr=-0.123, negative
JJR: corr=-0.117, negative
FW: corr=-0.117, negative
WP: corr=-0.102, negative
VBN: corr=0.092, positive
UH: corr=0.078, positive
CD: corr=-0.065, negative
RB: corr=0.057, positive
VBD: corr=-0.057, negative
WP$: corr=0.048, positive
TO: corr=-0.047, negative
NNPS: corr=-0.046, negative
RBS: corr=0.042, positive
VBG: corr=0.038, positive
RP: corr=0.026, positive
MD: corr=0.025, positive
WRB: corr=0.019, positive
PRP: corr=-0.017, negative


In [24]:
X_train_pos_topMI_overall, X_test_pos_topMI_overall, pos_topMI_features_overall = select_top_k_mutual_info(
    pos_features_train_df.values,
    pos_features_test_df.values,
    y_train["Overall"].values,
    list(pos_features_train_df.columns),
    k=30
)



Top 30 features by mutual information:
DT: MI score=0.0934
NN: MI score=0.0810
TO: MI score=0.0695
CD: MI score=0.0592
FW: MI score=0.0587
JJS: MI score=0.0558
VBZ: MI score=0.0328
NNP: MI score=0.0322
WDT: MI score=0.0206
RP: MI score=0.0179
IN: MI score=0.0179
PRP: MI score=0.0148
NNPS: MI score=0.0061
RBS: MI score=0.0000
EX: MI score=0.0000
RBR: MI score=0.0000
WP$: MI score=0.0000
WRB: MI score=0.0000
JJR: MI score=0.0000
WP: MI score=0.0000
MD: MI score=0.0000
UH: MI score=0.0000
VBG: MI score=0.0000
VBP: MI score=0.0000
CC: MI score=0.0000
VB: MI score=0.0000
NNS: MI score=0.0000
VBN: MI score=0.0000
RB: MI score=0.0000
VBD: MI score=0.0000


In [25]:
X_train_pos_top_excite, X_test_pos_top_excite, pos_top_features_excite = select_top_k_correlation(
    pos_features_train_df.values,
    pos_features_test_df.values,
    y_train["Excited"].values,
    list(pos_features_train_df.columns),
    k=30
)



Top 30 features by correlation:
NNP: corr=-0.318, negative
VBP: corr=0.262, positive
VBZ: corr=-0.245, negative
CC: corr=0.221, positive
RB: corr=0.181, positive
WP: corr=-0.164, negative
CD: corr=-0.152, negative
DT: corr=0.151, positive
JJS: corr=0.149, positive
EX: corr=0.139, positive
RBR: corr=0.132, positive
VBD: corr=-0.118, negative
WRB: corr=0.103, positive
VB: corr=0.097, positive
UH: corr=0.097, positive
RBS: corr=0.096, positive
JJR: corr=-0.093, negative
IN: corr=-0.090, negative
VBG: corr=0.084, positive
NN: corr=-0.083, negative
JJ: corr=-0.078, negative
NNS: corr=0.076, positive
TO: corr=0.075, positive
NNPS: corr=0.072, positive
FW: corr=0.048, positive
WP$: corr=-0.034, negative
RP: corr=0.034, positive
VBN: corr=-0.026, negative
PRP: corr=0.025, positive
MD: corr=0.025, positive


In [26]:
X_train_pos_topMI_excite, X_test_pos_topMI_excite, pos_topMI_features_excite = select_top_k_mutual_info(
    pos_features_train_df.values,
    pos_features_test_df.values,
    y_train["Excited"].values,
    list(pos_features_train_df.columns),
    k=30
)



Top 30 features by mutual information:
RB: MI score=0.1710
VBZ: MI score=0.1479
IN: MI score=0.1067
VBD: MI score=0.0888
VBN: MI score=0.0868
UH: MI score=0.0841
VBG: MI score=0.0714
WDT: MI score=0.0633
VB: MI score=0.0358
CC: MI score=0.0345
RBR: MI score=0.0330
NNP: MI score=0.0221
DT: MI score=0.0190
PRP: MI score=0.0152
RBS: MI score=0.0122
JJ: MI score=0.0067
NNPS: MI score=0.0061
WRB: MI score=0.0059
FW: MI score=0.0000
WP: MI score=0.0000
EX: MI score=0.0000
TO: MI score=0.0000
JJR: MI score=0.0000
JJS: MI score=0.0000
WP$: MI score=0.0000
RP: MI score=0.0000
MD: MI score=0.0000
VBP: MI score=0.0000
NNS: MI score=0.0000
CD: MI score=0.0000


#### FS - VADERSentiment

k = 4 (only 4 classes exist for VaderSentiment)

In [27]:
X_train_sen_top_overall, X_test_sen_top_overall, sen_top_features_overall = select_top_k_correlation(
    sentiment_train_df.values,
    sentiment_test_df.values,
    y_train["Overall"].values,
    list(sentiment_train_df.columns),
    k=4
)


Top 4 features by correlation:
sent_neg: corr=-0.160, negative
sent_compound: corr=0.156, positive
sent_neu: corr=0.128, positive
sent_pos: corr=-0.059, negative


In [28]:
X_train_sen_topMI_overall, X_test_sen_topMI_overall, sen_topMI_features_overall = select_top_k_mutual_info(
    sentiment_train_df.values,
    sentiment_test_df.values,
    y_train["Overall"].values,
    list(sentiment_train_df.columns),
    k=4
)


Top 4 features by mutual information:
sent_pos: MI score=0.0753
sent_neg: MI score=0.0219
sent_compound: MI score=0.0000
sent_neu: MI score=0.0000


In [29]:
X_train_sen_top_excite, X_test_sen_top_excite, sen_top_features_excite = select_top_k_correlation(
    sentiment_train_df.values,
    sentiment_test_df.values,
    y_train["Excited"].values,
    list(sentiment_train_df.columns),
    k=4
)


Top 4 features by correlation:
sent_compound: corr=0.255, positive
sent_pos: corr=0.210, positive
sent_neg: corr=-0.189, negative
sent_neu: corr=-0.141, negative


In [30]:
X_train_sen_topMI_excite, X_test_sen_topMI_excite, sen_topMI_features_excite = select_top_k_mutual_info(
    sentiment_train_df.values,
    sentiment_test_df.values,
    y_train["Excited"].values,
    list(sentiment_train_df.columns),
    k=4
)


Top 4 features by mutual information:
sent_compound: MI score=0.0800
sent_neg: MI score=0.0323
sent_pos: MI score=0.0132
sent_neu: MI score=0.0000


#### FS - BERT

In [31]:
X_train_bert_top_overall, X_test_bert_top_overall,bert_top_features_overall = select_top_k_correlation(
    bert_train_df.values,
    bert_test_df.values,
    y_train["Overall"].values,
    list(bert_train_df.columns),
    k=30
)


Top 30 features by correlation:
bert_589: corr=-0.445, negative
bert_86: corr=0.440, positive
bert_725: corr=0.404, positive
bert_751: corr=0.393, positive
bert_340: corr=-0.375, negative
bert_672: corr=-0.369, negative
bert_512: corr=0.365, positive
bert_126: corr=-0.357, negative
bert_452: corr=-0.357, negative
bert_657: corr=0.356, positive
bert_451: corr=-0.354, negative
bert_361: corr=-0.354, negative
bert_446: corr=-0.353, negative
bert_345: corr=-0.345, negative
bert_658: corr=0.343, positive
bert_497: corr=-0.342, negative
bert_540: corr=0.334, positive
bert_97: corr=0.333, positive
bert_268: corr=0.327, positive
bert_134: corr=-0.327, negative
bert_575: corr=-0.326, negative
bert_650: corr=0.326, positive
bert_721: corr=-0.321, negative
bert_396: corr=-0.321, negative
bert_666: corr=-0.320, negative
bert_28: corr=0.316, positive
bert_217: corr=0.312, positive
bert_638: corr=0.311, positive
bert_576: corr=0.311, positive
bert_539: corr=-0.310, negative


In [32]:
X_train_bert_topMI_overall, X_test_bert_topMI_overall,bert_topMI_features_overall = select_top_k_mutual_info(
    bert_train_df.values,
    bert_test_df.values,
    y_train["Overall"].values,
    list(bert_train_df.columns),
    k=30
)


Top 30 features by mutual information:
bert_454: MI score=0.2190
bert_476: MI score=0.2096
bert_396: MI score=0.1925
bert_427: MI score=0.1906
bert_663: MI score=0.1843
bert_393: MI score=0.1720
bert_67: MI score=0.1719
bert_688: MI score=0.1684
bert_269: MI score=0.1676
bert_152: MI score=0.1669
bert_594: MI score=0.1662
bert_205: MI score=0.1623
bert_491: MI score=0.1609
bert_458: MI score=0.1588
bert_33: MI score=0.1562
bert_504: MI score=0.1544
bert_226: MI score=0.1541
bert_705: MI score=0.1513
bert_698: MI score=0.1488
bert_94: MI score=0.1482
bert_41: MI score=0.1468
bert_699: MI score=0.1446
bert_97: MI score=0.1437
bert_233: MI score=0.1424
bert_378: MI score=0.1406
bert_351: MI score=0.1375
bert_542: MI score=0.1354
bert_100: MI score=0.1352
bert_471: MI score=0.1352
bert_416: MI score=0.1347


In [33]:
X_train_bert_top_excite, X_test_bert_top_excite,bert_top_features_excite = select_top_k_correlation(
    bert_train_df.values,
    bert_test_df.values,
    y_train["Excited"].values,
    list(bert_train_df.columns),
    k=30
)


Top 30 features by correlation:
bert_589: corr=-0.394, negative
bert_653: corr=0.376, positive
bert_167: corr=0.359, positive
bert_217: corr=0.355, positive
bert_249: corr=-0.349, negative
bert_758: corr=-0.345, negative
bert_608: corr=0.341, positive
bert_486: corr=-0.340, negative
bert_381: corr=0.333, positive
bert_458: corr=-0.326, negative
bert_513: corr=-0.320, negative
bert_132: corr=-0.318, negative
bert_701: corr=-0.316, negative
bert_445: corr=-0.311, negative
bert_215: corr=-0.311, negative
bert_576: corr=0.311, positive
bert_30: corr=0.311, positive
bert_686: corr=-0.309, negative
bert_638: corr=0.309, positive
bert_697: corr=-0.300, negative
bert_534: corr=0.297, positive
bert_620: corr=0.296, positive
bert_426: corr=-0.295, negative
bert_578: corr=0.295, positive
bert_232: corr=0.295, positive
bert_693: corr=-0.294, negative
bert_704: corr=0.294, positive
bert_648: corr=0.292, positive
bert_273: corr=0.289, positive
bert_539: corr=-0.288, negative


In [34]:
X_train_bert_topMI_excite, X_test_bert_topMI_excite,bert_topMI_features_excite = select_top_k_mutual_info(
    bert_train_df.values,
    bert_test_df.values,
    y_train["Excited"].values,
    list(bert_train_df.columns),
    k=30
)


Top 30 features by mutual information:
bert_608: MI score=0.2559
bert_337: MI score=0.2551
bert_309: MI score=0.2338
bert_257: MI score=0.2096
bert_426: MI score=0.1897
bert_742: MI score=0.1889
bert_629: MI score=0.1801
bert_758: MI score=0.1800
bert_181: MI score=0.1718
bert_99: MI score=0.1666
bert_216: MI score=0.1630
bert_485: MI score=0.1629
bert_49: MI score=0.1621
bert_505: MI score=0.1618
bert_495: MI score=0.1613
bert_147: MI score=0.1604
bert_467: MI score=0.1586
bert_158: MI score=0.1578
bert_366: MI score=0.1552
bert_213: MI score=0.1544
bert_325: MI score=0.1522
bert_513: MI score=0.1520
bert_728: MI score=0.1493
bert_549: MI score=0.1481
bert_447: MI score=0.1456
bert_137: MI score=0.1448
bert_230: MI score=0.1431
bert_596: MI score=0.1414
bert_673: MI score=0.1413
bert_341: MI score=0.1411


#### Combined df Feature Selection

In [35]:
X_train_comb_top_overall, X_test_comb_top_overall,comb_top_features_overall = select_top_k_correlation(
    X_train_combined,
    X_test_combined,
    y_train["Overall"].values,
    feature_names,
    k=30
)


Top 30 features by correlation:
bert_589: corr=-0.445, negative
bert_86: corr=0.440, positive
bert_725: corr=0.404, positive
think: corr=0.395, positive
bert_751: corr=0.393, positive
bit: corr=0.389, positive
bert_340: corr=-0.375, negative
bert_672: corr=-0.369, negative
bert_512: corr=0.365, positive
bert_126: corr=-0.357, negative
bert_452: corr=-0.357, negative
bert_657: corr=0.356, positive
bert_451: corr=-0.354, negative
bert_361: corr=-0.354, negative
bert_446: corr=-0.353, negative
IN: corr=-0.345, negative
bert_345: corr=-0.345, negative
bert_658: corr=0.343, positive
bert_497: corr=-0.342, negative
also like: corr=-0.337, negative
aspect: corr=0.336, positive
bert_540: corr=0.334, positive
bert_97: corr=0.333, positive
chemical: corr=0.331, positive
bert_268: corr=0.327, positive
bert_134: corr=-0.327, negative
bert_575: corr=-0.326, negative
bert_650: corr=0.326, positive
bert_721: corr=-0.321, negative
bert_396: corr=-0.321, negative


In [36]:
X_train_comb_topMI_overall, X_test_comb_topMI_overall,comb_topMI_features_overall = select_top_k_mutual_info(
    X_train_combined,
    X_test_combined,
    y_train["Overall"].values,
    feature_names,
    k=30
)


Top 30 features by mutual information:
bert_454: MI score=0.2190
bert_476: MI score=0.2096
bert_396: MI score=0.1925
bert_427: MI score=0.1906
bert_663: MI score=0.1843
bert_67: MI score=0.1719
bert_393: MI score=0.1718
bert_688: MI score=0.1680
bert_269: MI score=0.1676
bert_152: MI score=0.1666
bert_594: MI score=0.1656
bert_205: MI score=0.1616
bert_491: MI score=0.1609
bert_458: MI score=0.1598
bert_33: MI score=0.1556
bert_226: MI score=0.1541
bert_504: MI score=0.1538
bert_705: MI score=0.1513
bert_698: MI score=0.1480
bert_94: MI score=0.1476
bert_41: MI score=0.1468
bert_699: MI score=0.1446
bert_97: MI score=0.1442
bert_233: MI score=0.1424
bert_378: MI score=0.1402
bert_351: MI score=0.1369
bert_416: MI score=0.1351
bert_471: MI score=0.1338
bert_542: MI score=0.1333
bit: MI score=0.1329


In [37]:
X_train_comb_top_excite, X_test_comb_top_excite,comb_top_features_excite = select_top_k_correlation(
    X_train_combined,
    X_test_combined,
    y_train["Excited"].values,
    feature_names,
    k=30
)


Top 30 features by correlation:
bert_589: corr=-0.394, negative
bert_653: corr=0.376, positive
bert_167: corr=0.359, positive
bert_217: corr=0.355, positive
bert_249: corr=-0.349, negative
bert_758: corr=-0.345, negative
bert_608: corr=0.341, positive
bert_486: corr=-0.340, negative
bert_381: corr=0.333, positive
bert_458: corr=-0.326, negative
bert_513: corr=-0.320, negative
NNP: corr=-0.318, negative
bert_132: corr=-0.318, negative
bert_701: corr=-0.316, negative
chemical: corr=0.315, positive
bert_445: corr=-0.311, negative
bert_215: corr=-0.311, negative
bert_576: corr=0.311, positive
bert_30: corr=0.311, positive
bert_686: corr=-0.309, negative
bert_638: corr=0.309, positive
find: corr=0.303, positive
bert_697: corr=-0.300, negative
bert_534: corr=0.297, positive
bert_620: corr=0.296, positive
bert_426: corr=-0.295, negative
bert_578: corr=0.295, positive
bert_232: corr=0.295, positive
bert_693: corr=-0.294, negative
bert_704: corr=0.294, positive


In [38]:
X_train_comb_topMI_excite, X_test_comb_topMI_excite,comb_topMI_features_excite = select_top_k_mutual_info(
    X_train_combined,
    X_test_combined,
    y_train["Excited"].values,
    feature_names,
    k=30
)


Top 30 features by mutual information:
bert_608: MI score=0.2559
bert_337: MI score=0.2551
bert_309: MI score=0.2338
bert_257: MI score=0.2096
junior: MI score=0.2001
bert_426: MI score=0.1897
bert_742: MI score=0.1889
bert_629: MI score=0.1801
bert_758: MI score=0.1800
bert_181: MI score=0.1718
RB: MI score=0.1707
bert_99: MI score=0.1666
much: MI score=0.1645
bert_216: MI score=0.1630
bert_485: MI score=0.1629
bert_49: MI score=0.1621
bert_505: MI score=0.1618
bert_495: MI score=0.1613
bert_147: MI score=0.1604
bert_467: MI score=0.1586
bert_158: MI score=0.1578
bert_366: MI score=0.1552
bert_213: MI score=0.1544
bert_325: MI score=0.1522
bert_513: MI score=0.1520
VBZ: MI score=0.1504
bert_728: MI score=0.1493
bert_549: MI score=0.1481
bert_447: MI score=0.1456
bert_137: MI score=0.1448


### Model Training and Evaluation

#### Tree Based Method - RandomForest

In [39]:
from sklearn.model_selection import KFold, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
import numpy as np

def evaluate_rf_models_with_gridsearch(X_perf_selected, X_excite_selected, y_performance, y_excitement,
                                       X_perf_test, X_excite_test, y_perf_test, y_excite_test,
                                       k_folds=5, param_grid=None, random_state=42):
    """
    Evaluate Random Forest regressors using GridSearchCV on training data with K-Fold CV
    and then evaluate on separate test sets.
    """

    if param_grid is None:
        param_grid = {
            "n_estimators": [20, 30, 40],
            "max_depth": [3, 5, 10],
            "min_samples_split": [5, 10, 15],
            "min_samples_leaf": [2, 5, 10],
            "max_features": ['sqrt', 'log2', 0.5]
        }

    # --- Performance RF with GridSearchCV ---
    rf_perf_grid = GridSearchCV(
        RandomForestRegressor(random_state=random_state),
        param_grid=param_grid,
        cv=k_folds,
        n_jobs=-1
    )
    rf_perf_grid.fit(X_perf_selected, y_performance)
    best_perf_rf = rf_perf_grid.best_estimator_

    # CV metrics using out-of-fold predictions
    y_cv_pred_perf = cross_val_predict(best_perf_rf, X_perf_selected, y_performance, cv=k_folds)
    cv_perf_r = pearsonr(y_performance, y_cv_pred_perf)[0]
    # cv_perf_re = np.mean(np.abs(y_cv_pred_perf - y_performance) / y_performance)
    cv_perf_re = np.mean(np.abs(y_cv_pred_perf - y_performance) / np.max(y_performance))

    # Test metrics
    y_test_pred_perf = best_perf_rf.predict(X_perf_test)
    test_perf_r = pearsonr(y_perf_test, y_test_pred_perf)[0]
    # test_perf_re = np.mean(np.abs(y_test_pred_perf - y_perf_test) / y_perf_test)
    test_perf_re = np.mean(np.abs(y_test_pred_perf - y_perf_test) / np.max(y_perf_test))

    # --- Excitement RF with GridSearchCV ---
    rf_excite_grid = GridSearchCV(
        RandomForestRegressor(random_state=random_state),
        param_grid=param_grid,
        cv=k_folds,
        n_jobs=-1
    )
    rf_excite_grid.fit(X_excite_selected, y_excitement)
    best_excite_rf = rf_excite_grid.best_estimator_

    # CV metrics using out-of-fold predictions
    y_cv_pred_excite = cross_val_predict(best_excite_rf, X_excite_selected, y_excitement, cv=k_folds)
    cv_excite_r = pearsonr(y_excitement, y_cv_pred_excite)[0]
    # cv_excite_re = np.mean(np.abs(y_cv_pred_excite - y_excitement) / y_excitement)
    cv_excite_re = np.mean(np.abs(y_cv_pred_excite - y_excitement) / np.max(y_excitement))




    # Test metrics
    y_test_pred_excite = best_excite_rf.predict(X_excite_test)
    test_excite_r = pearsonr(y_excite_test, y_test_pred_excite)[0]
    # test_excite_re = np.mean(np.abs(y_test_pred_excite - y_excite_test) / y_excite_test)
    test_excite_re = np.mean(np.abs(y_test_pred_excite - y_excite_test) / np.max(y_excite_test))


    metrics = {
        "CV_Performance_r": cv_perf_r,
        "CV_Performance_RE": cv_perf_re,
        "Test_Performance_r": test_perf_r,
        "Test_Performance_RE": test_perf_re,
        "Perf_Best_Params": rf_perf_grid.best_params_,
        "CV_Excitement_r": cv_excite_r,
        "CV_Excitement_RE": cv_excite_re,
        "Test_Excitement_r": test_excite_r,
        "Test_Excitement_RE": test_excite_re,
        "Excite_Best_Params": rf_excite_grid.best_params_
    }

    print("=== Performance ===")
    print(f"CV r = {cv_perf_r:.3f}, RE = {cv_perf_re:.3f}")
    print(f"Test r = {test_perf_r:.3f}, RE = {test_perf_re:.3f}")
    print(f"Best Params: {rf_perf_grid.best_params_}\n")

    print("=== Excitement ===")
    print(f"CV r = {cv_excite_r:.3f}, RE = {cv_excite_re:.3f}")
    print(f"Test r = {test_excite_r:.3f}, RE = {test_excite_re:.3f}")
    print(f"Best Params: {rf_excite_grid.best_params_}")

    return metrics


In [40]:
metrics_rf_kfold_tfidf = evaluate_rf_models_with_gridsearch(
    X_train_tfidf_top_overall,
    X_train_tfidf_top_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_tfidf_top_overall,
    X_test_tfidf_top_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.542, RE = 0.067
Test r = 0.097, RE = 0.087
Best Params: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}

=== Excitement ===
CV r = 0.578, RE = 0.085
Test r = -0.099, RE = 0.114
Best Params: {'max_depth': 5, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 20}


In [41]:
metrics_rf_kfold_tfidf_MI = evaluate_rf_models_with_gridsearch(
    X_train_tfidf_topMI_overall,
    X_train_tfidf_topMI_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_tfidf_topMI_overall,
    X_test_tfidf_topMI_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.437, RE = 0.070
Test r = 0.214, RE = 0.082
Best Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 30}

=== Excitement ===
CV r = 0.442, RE = 0.090
Test r = 0.302, RE = 0.103
Best Params: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 20}


In [42]:
metrics_rf_kfold_pos = evaluate_rf_models_with_gridsearch(
    X_train_pos_top_overall,
    X_train_pos_top_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_pos_top_overall,
    X_test_pos_top_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.249, RE = 0.078
Test r = -0.042, RE = 0.082
Best Params: {'max_depth': 10, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 20}

=== Excitement ===
CV r = 0.249, RE = 0.096
Test r = 0.182, RE = 0.109
Best Params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 30}


In [43]:
metrics_rf_kfold_pos_MI = evaluate_rf_models_with_gridsearch(
    X_train_pos_topMI_overall,
    X_train_pos_topMI_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_pos_topMI_overall,
    X_test_pos_topMI_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.234, RE = 0.078
Test r = -0.080, RE = 0.085
Best Params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 20}

=== Excitement ===
CV r = 0.242, RE = 0.096
Test r = 0.306, RE = 0.105
Best Params: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 40}


In [44]:
metrics_rf_kfold_sen = evaluate_rf_models_with_gridsearch(
    X_train_sen_top_overall,
    X_train_sen_top_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_sen_top_overall,
    X_test_sen_top_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.103, RE = 0.082
Test r = 0.159, RE = 0.077
Best Params: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 40}

=== Excitement ===
CV r = 0.239, RE = 0.099
Test r = -0.006, RE = 0.107
Best Params: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 40}


In [45]:
metrics_rf_kfold_sen_MI = evaluate_rf_models_with_gridsearch(
    X_train_sen_topMI_overall,
    X_train_sen_topMI_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_sen_topMI_overall,
    X_test_sen_topMI_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.124, RE = 0.081
Test r = 0.148, RE = 0.078
Best Params: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 40}

=== Excitement ===
CV r = 0.236, RE = 0.101
Test r = 0.064, RE = 0.102
Best Params: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 40}


In [46]:
metrics_rf_kfold_bert = evaluate_rf_models_with_gridsearch(
    X_train_bert_top_overall,
    X_train_bert_top_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_bert_top_overall,
    X_test_bert_top_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.583, RE = 0.064
Test r = 0.259, RE = 0.079
Best Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 40}

=== Excitement ===
CV r = 0.487, RE = 0.088
Test r = 0.142, RE = 0.114
Best Params: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 40}


In [47]:
metrics_rf_kfold_bertMI = evaluate_rf_models_with_gridsearch(
    X_train_bert_topMI_overall,
    X_train_bert_topMI_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_bert_topMI_overall,
    X_test_bert_topMI_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.463, RE = 0.068
Test r = 0.135, RE = 0.082
Best Params: {'max_depth': 5, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 30}

=== Excitement ===
CV r = 0.451, RE = 0.089
Test r = 0.331, RE = 0.101
Best Params: {'max_depth': 10, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 30}


In [65]:
metrics_rf_kfold_comb = evaluate_rf_models_with_gridsearch(
    X_train_comb_top_overall,
    X_train_comb_top_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_comb_top_overall,
    X_test_comb_top_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.560, RE = 0.065
Test r = 0.186, RE = 0.080
Best Params: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 20}

=== Excitement ===
CV r = 0.486, RE = 0.088
Test r = 0.124, RE = 0.106
Best Params: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 40}


In [66]:
metrics_rf_kfold_combMI = evaluate_rf_models_with_gridsearch(
    X_train_comb_topMI_overall,
    X_train_comb_topMI_excite,
    y_train["Overall"].values,
    y_train["Excited"].values,
    X_test_comb_topMI_overall,
    X_test_comb_topMI_excite,
    y_test["Overall"].values,
    y_test["Excited"].values
)

=== Performance ===
CV r = 0.460, RE = 0.067
Test r = 0.039, RE = 0.085
Best Params: {'max_depth': 5, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 30}

=== Excitement ===
CV r = 0.450, RE = 0.088
Test r = 0.574, RE = 0.096
Best Params: {'max_depth': 3, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}


### NN

In [73]:
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import numpy as np
import tensorflow as tf
import itertools

def evaluate_nn_models(
    X_perf, X_excite, y_perf, y_excite,
    X_perf_test, X_excite_test, y_perf_test, y_excite_test,
    k_folds=5, epochs=5, batch_size=8, random_state=42
):
    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    # --- Hyperparameter grid ---
    hidden_units_options = [(16,32), (8,16)]
    dropout_options = [0.2, 0.3]
    lr_options = [1e-3, 5e-4]

    param_grid = list(itertools.product(hidden_units_options, dropout_options, lr_options))

    def cross_val_score_nn(X, y, units, dropout_rate, lr):
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=random_state)
        r_list = []
        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            model = Sequential([
                Dense(units[0], activation='relu', input_dim=X_train.shape[1]),
                Dropout(dropout_rate),
                Dense(units[1], activation='relu'),
                Dense(1)
            ])
            model.compile(optimizer=Adam(lr), loss='mse')
            model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

            y_pred_val = model.predict(X_val).flatten()
            r_list.append(np.corrcoef(y_val, y_pred_val)[0,1])
        return np.mean(r_list)

    # --- Performance hyperparameter tuning ---
    best_r_perf = -np.inf
    best_params_perf = None
    for units, dropout_rate, lr in param_grid:
        r = cross_val_score_nn(X_perf, y_perf, units, dropout_rate, lr)
        if r > best_r_perf:
            best_r_perf = r
            best_params_perf = (units, dropout_rate, lr)

    # Train final model on all training data with best hyperparameters
    units, dropout_rate, lr = best_params_perf
    model_perf = Sequential([
        Dense(units[0], activation='relu', input_dim=X_perf.shape[1]),
        Dropout(dropout_rate),
        Dense(units[1], activation='relu'),
        Dense(1)
    ])
    model_perf.compile(optimizer=Adam(lr), loss='mse')
    model_perf.fit(X_perf, y_perf, epochs=epochs, batch_size=batch_size, verbose=0)

    # CV metrics
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=random_state)
    r_perf_cv, re_perf_cv = [], []
    for train_idx, val_idx in kf.split(X_perf):
        y_pred_val = model_perf.predict(X_perf[val_idx]).flatten()
        r_perf_cv.append(np.corrcoef(y_perf[val_idx], y_pred_val)[0,1])
        re_perf_cv.append(np.mean(np.abs(y_pred_val - y_perf[val_idx]) / np.max(y_perf[val_idx])))

    # Test metrics
    y_test_pred_perf = model_perf.predict(X_perf_test).flatten()
    r_perf_test = np.corrcoef(y_perf_test, y_test_pred_perf)[0,1]
    re_perf_test = np.mean(np.abs(y_test_pred_perf - y_perf_test) / np.max(y_perf_test))

    # --- Excitement hyperparameter tuning ---
    best_r_excite = -np.inf
    best_params_excite = None
    for units, dropout_rate, lr in param_grid:
        r = cross_val_score_nn(X_excite, y_excite, units, dropout_rate, lr)
        if r > best_r_excite:
            best_r_excite = r
            best_params_excite = (units, dropout_rate, lr)

    units, dropout_rate, lr = best_params_excite
    model_excite = Sequential([
        Dense(units[0], activation='relu', input_dim=X_excite.shape[1]),
        Dropout(dropout_rate),
        Dense(units[1], activation='relu'),
        Dense(1)
    ])
    model_excite.compile(optimizer=Adam(lr), loss='mse')
    model_excite.fit(X_excite, y_excite, epochs=epochs, batch_size=batch_size, verbose=0)

    # CV metrics
    r_excite_cv, re_excite_cv = [], []
    for train_idx, val_idx in kf.split(X_excite):
        y_pred_val = model_excite.predict(X_excite[val_idx]).flatten()
        r_excite_cv.append(np.corrcoef(y_excite[val_idx], y_pred_val)[0,1])
        re_excite_cv.append(np.mean(np.abs(y_pred_val - y_excite[val_idx]) / np.max(y_excite[val_idx])))

    # Test metrics
    y_test_pred_excite = model_excite.predict(X_excite_test).flatten()
    r_excite_test = np.corrcoef(y_excite_test, y_test_pred_excite)[0,1]
    re_excite_test = np.mean(np.abs(y_test_pred_excite - y_excite_test) / np.max(y_excite_test))

    print("=== Performance ===")
    print(f"Best params: {best_params_perf}")
    print(f"CV r = {np.mean(r_perf_cv):.3f}, RE = {np.mean(re_perf_cv):.3f}")
    print(f"Test r = {r_perf_test:.3f}, RE = {re_perf_test:.3f}\n")

    print("=== Excitement ===")
    print(f"Best params: {best_params_excite}")
    print(f"CV r = {np.mean(r_excite_cv):.3f}, RE = {np.mean(re_excite_cv):.3f}")
    print(f"Test r = {r_excite_test:.3f}, RE = {re_excite_test:.3f}")

    metrics = {
        "CV_Performance_r": np.mean(r_perf_cv),
        "CV_Performance_RE": np.mean(re_perf_cv),
        "Test_Performance_r": r_perf_test,
        "Test_Performance_RE": re_perf_test,
        "Perf_Best_Params": best_params_perf,
        "CV_Excitement_r": np.mean(r_excite_cv),
        "CV_Excitement_RE": np.mean(re_excite_cv),
        "Test_Excitement_r": r_excite_test,
        "Test_Excitement_RE": re_excite_test,
        "Excite_Best_Params": best_params_excite
    }

    return metrics


In [74]:
metrics_nn_tfidf = evaluate_nn_models(
  X_train_tfidf_top_overall,
  X_train_tfidf_top_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_tfidf_top_overall,
  X_test_tfidf_top_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [75]:
metrics_nn_tfidf_MI = evaluate_nn_models(
  X_train_tfidf_topMI_overall,
  X_train_tfidf_topMI_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_tfidf_topMI_overall,
  X_test_tfidf_topMI_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [76]:
metrics_nn_pos = evaluate_nn_models(
  X_train_pos_top_overall,
  X_train_pos_top_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_pos_top_overall,
  X_test_pos_top_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [77]:
metrics_nn_pos_MI = evaluate_nn_models(
  X_train_pos_topMI_overall,
  X_train_pos_topMI_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_pos_topMI_overall,
  X_test_pos_topMI_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

In [78]:
metrics_nn_sen = evaluate_nn_models(
  X_train_sen_top_overall,
  X_train_sen_top_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_sen_top_overall,
  X_test_sen_top_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

In [79]:
metrics_nn_sen_MI = evaluate_nn_models(
  X_train_sen_topMI_overall,
  X_train_sen_topMI_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_sen_topMI_overall,
  X_test_sen_topMI_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [80]:
metrics_nn_bert = evaluate_nn_models(
  X_train_bert_top_overall,
  X_train_bert_top_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_bert_top_overall,
  X_test_bert_top_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

In [81]:
metrics_nn_bert_MI = evaluate_nn_models(
  X_train_bert_topMI_overall,
  X_train_bert_topMI_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_bert_topMI_overall,
  X_test_bert_topMI_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

In [82]:
metrics_nn_comb = evaluate_nn_models(
  X_train_comb_top_overall,
  X_train_comb_top_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_comb_top_overall,
  X_test_comb_top_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [83]:
metrics_nn_comb_MI = evaluate_nn_models(
  X_train_comb_topMI_overall,
  X_train_comb_topMI_excite,
  y_train["Overall"].values,
  y_train["Excited"].values,
  X_test_comb_topMI_overall,
  X_test_comb_topMI_excite,
  y_test["Overall"].values,
  y_test["Excited"].values
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 