In [None]:
# !pip install nltk
# !pip install gensim
import pandas as pd
import nltk
import re
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error




In [None]:
def simple_tokenize(text):
    """
    Split ``text`` into lowercase, purely alphabetic tokens.

    The text is tokenized with ``nltk.word_tokenize``. Tokens that are not
    made up entirely of ASCII letters (anything containing digits,
    punctuation, or other characters) are discarded, and the remaining
    tokens are lowercased.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lowercased alphabetic tokens, in their original order.
    """
    alphabetic_tokens = []
    for raw_token in nltk.word_tokenize(text):
        # Skip anything that is not letters-only.
        if re.match(r"^[a-zA-Z]+$", raw_token) is None:
            continue
        alphabetic_tokens.append(raw_token.lower())
    return alphabetic_tokens


def build_tagged_documents(df, text_col="transcript", id_cols=("Participant",)):
    """
    Convert rows of a DataFrame into TaggedDocument objects for Doc2Vec.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document.
    text_col : str
        Name of the column holding the raw text. Each value is passed through
        ``str()`` before tokenizing, so a missing (NaN) value is tokenized as
        the literal text ``"nan"`` instead of raising.
    id_cols : str or list/tuple of str
        Column(s) identifying a document. A single column name yields the tag
        ``str(row[col])``; several columns are stringified and joined with
        ``"_"``.

    Returns
    -------
    list of TaggedDocument
        One entry per row of ``df``, in row order, each carrying a single tag.
    """
    # Normalise to a tuple of column names so that a bare string and a
    # one-element sequence go through the same path and give the same tag.
    if isinstance(id_cols, (list, tuple)):
        tag_cols = tuple(id_cols)
    else:
        tag_cols = (id_cols,)

    tagged_docs = []
    for _, row in df.iterrows():
        tokens = simple_tokenize(str(row[text_col]))
        doc_tag = "_".join(str(row[c]) for c in tag_cols)
        tagged_docs.append(TaggedDocument(words=tokens, tags=[doc_tag]))
    return tagged_docs


In [10]:
# Fetch the 'punkt_tab' tokenizer data first; presumably nltk.word_tokenize
# (used by simple_tokenize) needs it - TODO confirm for the installed nltk.
nltk.download('punkt_tab')

# NOTE(review): absolute, machine-specific path - as written the notebook only
# runs where this exact directory exists.
path = "/Users/minghill/Desktop/BU/TalentTora/talentora-analysisbot/test/doc2vec/data/merged_id_scores.csv"
df = pd.read_csv(path)

# One TaggedDocument per CSV row, using the default text and id columns.
tagged_docs = build_tagged_documents(df)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/minghill/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [11]:
# Preview only the first document: printing the whole list floods the cell
# output with the tokens of every transcript.
print(tagged_docs[:1])

[TaggedDocument(words=['so', 'how', 'are', 'you', 'doing', 'today', 'pretty', 'well', 'good', 'good', 'tell', 'me', 'about', 'yourself', 'i', 'mickey', 'i', 'am', 'a', 'course', 'majoring', 'in', 'mit', 'and', 'junior', 'this', 'year', 'i', 'live', 'in', 'east', 'campus', 'i', 'think', 'i', 'was', 'a', 'bit', 'too', 'oh', 'great', 'i', 'from', 'minnesota', 'tell', 'me', 'about', 'a', 'time', 'you', 'demonstrated', 'leadership', 'i', 'was', 'in', 'apm', 'i', 'was', 'the', 'fellowship', 'as', 'personal', 'project', 'okay', 'which', 'one', 'of', 'those', 'was', 'responsible', 'for', 'organizing', 'a', 'previous', 'program', 'basically', 'it', 'split', 'into', 'three', 'sections', 'okay', 'so', 'it', 'was', 'organizing', 'events', 'that', 'helped', 'foster', 'friendship', 'among', 'those', 'three', 'okay', 'so', 'tell', 'me', 'about', 'a', 'time', 'you', 'were', 'working', 'in', 'a', 'team', 'and', 'faced', 'a', 'challenge', 'how', 'did', 'you', 'solve', 'that', 'problem', 'so', 'we', 'had

In [12]:
# Sanity check: build_tagged_documents appends one document per DataFrame row,
# so this is the number of rows that were read from the CSV.
print(len(tagged_docs))

77


In [None]:
def train_doc2vec_model(tagged_docs, vector_size=50, window=5, min_count=2, epochs=20, dm=1):
    """
    Train a Doc2Vec model on ``tagged_docs`` with a single training run of
    ``epochs`` passes.

    Parameters
    ----------
    tagged_docs : list of TaggedDocument
        Training corpus. It is iterated more than once (vocabulary scan, then
        every epoch), so it must be a re-iterable sequence, not a one-shot
        generator.
    vector_size : int
        Dimensionality of the learned document vectors.
    window : int
        Context window size passed to Doc2Vec.
    min_count : int
        Words occurring fewer than this many times are ignored.
    epochs : int
        Number of passes over the corpus. It is also stored on the model as
        ``model.epochs``, which gensim reuses as the default number of
        inference passes in ``infer_vector``.
    dm : int
        Doc2Vec training algorithm selector, forwarded unchanged.

    Returns
    -------
    Doc2Vec
        The trained model.
    """
    # Do NOT hand the corpus to the constructor: Doc2Vec(documents=...) builds
    # the vocabulary and trains immediately using the constructor's own epoch
    # setting, so a following train() call would train the model a second time
    # and leave model.epochs out of sync with the `epochs` argument.
    model = Doc2Vec(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
        dm=dm,
    )
    model.build_vocab(tagged_docs)
    model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.epochs)
    return model


def doc2vec_inference(df, model, text_col="transcript"):
    """
    Infer a Doc2Vec vector for every row of ``df`` and attach it as columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column. It is modified IN PLACE: one column
        per vector dimension, named ``d2v_0``, ``d2v_1``, ..., is added (or
        overwritten if it already exists).
    model : Doc2Vec
        Trained model; only its ``infer_vector`` method is used.
    text_col : str
        Name of the column holding the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the ``d2v_*`` columns
        added. An empty frame is returned unchanged.
    """
    vectors = []
    for text in df[text_col]:
        # str() mirrors build_tagged_documents, so inference sees text prepared
        # the same way as the training corpus and a missing (NaN) value does
        # not reach nltk.word_tokenize as a float.
        tokens = simple_tokenize(str(text))
        vectors.append(model.infer_vector(tokens))

    if not vectors:
        # No rows: there is no vector to take the dimensionality from, and
        # indexing shape[1] on an empty 1-D array would raise IndexError.
        return df

    vec_array = np.array(vectors)
    for dim in range(vec_array.shape[1]):
        df[f"d2v_{dim}"] = vec_array[:, dim]
    return df

def train_and_evaluate_regression(df, target_col="Overall", embed_prefix="d2v_",
                                  alpha=1.0, test_size=0.2, random_state=42):
    """
    Fit a Ridge regression from embedding columns to ``target_col`` and print
    its hold-out R^2 and MSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the embedding columns and the target column. It is not
        modified; rows with a missing target are dropped from a derived frame.
    target_col : str
        Name of the column to predict.
    embed_prefix : str
        Every column whose name starts with this prefix is used as a feature.
    alpha : float
        Ridge regularisation strength.
    test_size : float
        Fraction of rows held out for evaluation.
    random_state : int
        Seed for the train/test split, so the split is repeatable.

    Returns
    -------
    Ridge
        The regressor fitted on the training split.

    Raises
    ------
    ValueError
        If no column of ``df`` starts with ``embed_prefix``.
    """
    # Non-string column labels cannot carry the prefix; skip them instead of
    # failing on a missing .startswith attribute.
    feature_cols = [
        c for c in df.columns if isinstance(c, str) and c.startswith(embed_prefix)
    ]
    if not feature_cols:
        raise ValueError(
            f"No columns starting with {embed_prefix!r} found; "
            "run doc2vec_inference on the DataFrame first."
        )

    # Rows without a target value cannot be used for fitting or scoring.
    scored = df.dropna(subset=[target_col])
    X = scored[feature_cols].values
    y = scored[target_col].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    reg = Ridge(alpha=alpha)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{target_col} -- R^2: {r2:.3f}, MSE: {mse:.3f}")
    return reg

In [None]:
d2v_model = train_doc2vec_model(tagged_docs, vector_size=50, epochs=20)

# The final DataFrame is merged_id_scores. It is read again from disk so the
# d2v_* columns (which doc2vec_inference adds in place) land on a fresh frame
# rather than on `df`.
# NOTE(review): absolute, machine-specific path - as written the notebook only
# runs where this exact directory exists.
path = "/Users/minghill/Desktop/BU/TalentTora/talentora-analysisbot/test/doc2vec/data/merged_id_scores.csv"
final_df = pd.read_csv(path)
final_df = doc2vec_inference(final_df, d2v_model)

reg_model = train_and_evaluate_regression(final_df, target_col="Overall")

# train_and_evaluate_regression can be called the same way with other target
# columns, e.g. "RecommendHiring".