In [None]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Read the responses file into `df`. Later cells normalise its column
# headers, join the free-text answer columns, and compare rows to each other.
csv_file = "responses.csv"
df = pd.read_csv(csv_file)

In [None]:
# Corpus used to train the Word2Vec embeddings; the cells below read its
# 'Persona' and 'chat' columns.
df_personachat = pd.read_csv("personality.csv")

In [None]:
# Build one text string per row from the 'Persona' and 'chat' columns
# (missing values become empty strings), then whitespace-tokenise it.
df_personachat['combined_text'] = (
    df_personachat[['Persona', 'chat']]
    .fillna('')
    .agg(' '.join, axis=1)
)
df_personachat['tokens'] = df_personachat['combined_text'].map(str.split)

In [None]:
# Hold out 20% of the token lists; random_state=42 makes the shuffle repeatable.
# NOTE(review): only `train_data` is consumed by the cells visible here
# (Word2Vec training); `test_data` is never referenced afterwards.
train_data, test_data = train_test_split(df_personachat['tokens'], test_size=0.2, random_state=42)

In [None]:
# Normalise the headers: trim surrounding whitespace, lower-case, and
# collapse every run of whitespace into a single space.
normalized_columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
)
df.columns = normalized_columns
print("Cleaned Columns:", df.columns)

Cleaned Columns: Index(['timestamp', 'what's your instagram id ?', 'activity', 'conversation',
       'collaboration', 'conversation', 'combined_text', 'tokens', 'vector'],
      dtype='object')


In [None]:
# Map the verbose question headers to short labels.
# The keys are written in the lower-cased, whitespace-collapsed form produced
# by the header-cleaning cell and must match a header character for character
# (including the typographic apostrophes); `rename` leaves any header without
# a matching key unchanged rather than raising.
# NOTE(review): two different questions map to 'Conversation', so after the
# rename the frame carries two columns with that same label (visible in the
# printed output). Selecting 'Conversation' by label therefore returns both.
column_mapping = {
    'activity: what do you want to do on weekends, something crazy that you have the means to do or want to do but your friends are never up for it. (ex. trying some crazy shack in arambol, going on a trek, coding for 16 hrs straight, partying in a villa, visiting thalassa, pranking people in your hostel, renting a car and drifting, wine tasting, anything)': 'Activity',
    'conversation: what do you care a lot about but your friends don’t give a fuck about it, but you wanted someone who was as excited about it as you are. (ex. star trek, vogue fashion magazine, osama bin ladin , consumer psychology, perfume testing, actual meaning of life , anything)': 'Conversation',
    'activity: what do you wish to do that you hoped you’d find someone to collaborate on with? (ex. collabing on a music video, making a bizz plan, jamming session on arctic monkeys, going for daily morning jogs, getting high, learning hip hop, whatever tf you want)': 'Collaboration',
    'conversation: or you someone looking for just a good, soulful conversation with someone who has empathy, patience, and slight maturity? you don\'t really care about any particular thing, just want a wholesome person to talk to. answer "yes" only if you can be the same type of person to the other person.': 'Conversation',
}
# In-place rename: mutates `df` directly.
df.rename(columns=column_mapping, inplace=True)
print("Updated Columns:", df.columns)

Updated Columns: Index(['timestamp', 'what's your instagram id ?', 'Activity', 'Conversation',
       'Collaboration', 'Conversation'],
      dtype='object')


In [None]:
# Free-text answer columns that feed the similarity comparison. Selection is
# by label, so every column carrying one of these labels is included.
text_columns = ['Activity', 'Conversation', 'Collaboration']

# One space-joined string per response (blank for missing answers),
# followed by plain whitespace tokenisation.
df['combined_text'] = (
    df[text_columns]
    .fillna('')
    .agg(' '.join, axis=1)
)
df['tokens'] = df['combined_text'].map(str.split)

In [None]:
# Word2Vec hyper-parameters, gathered in one place so they are easy to tune.
word2vec_params = {
    "vector_size": 100,  # dimensionality of each word embedding
    "window": 5,         # context window size
    "min_count": 1,      # minimum word frequency (1 keeps every word)
    "workers": 4,        # number of worker threads used for training
    "epochs": 10,        # number of training passes over the corpus
}

# Train the embeddings on the training split of the token lists.
model = Word2Vec(sentences=train_data, **word2vec_params)

In [None]:
# Persist the trained model and read it straight back from the same path.
# NOTE(review): the cells visible here keep using `model`; `loaded_model`
# is not referenced again.
model_path = "personachat_word2vec.model"
model.save(model_path)
loaded_model = Word2Vec.load(model_path)

In [None]:
def get_sentence_vector(tokens, model):
    """Return the mean embedding of the tokens that ``model`` knows.

    Args:
        tokens: Iterable of word tokens.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            word) and ``vector_size``, e.g. a trained ``Word2Vec`` model.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        a zero vector of length ``model.vector_size`` when none of the tokens
        is in the vocabulary (including when ``tokens`` is empty).
    """
    embeddings = []
    for token in tokens:
        # Out-of-vocabulary tokens are skipped silently.
        if token in model.wv:
            embeddings.append(model.wv[token])

    if not embeddings:
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

In [None]:
def _embed_response(tokens):
    """Embed one response's token list with the trained `model`."""
    return get_sentence_vector(tokens, model)


# One averaged embedding per response row.
df['vector'] = df['tokens'].apply(_embed_response)

In [None]:
# Stack the per-response embeddings into a single array, one row per response.
vectors = np.array(list(df['vector']))

In [None]:
# Pairwise cosine similarity between all rows of `vectors`:
# similarity_matrix[i, j] compares response i with response j.
similarity_matrix = cosine_similarity(vectors)

In [None]:
# Number of suggestions to produce for each respondent.
top_n = 3

# Work with *positions* rather than index labels: `similarity_matrix` is
# ordered by row position, so using `iterrows()` labels to index it is only
# correct while `df` still has its default 0..n-1 index.
id_column = "what's your instagram id ?"
instagram_ids = df[id_column].tolist()

# Maps each respondent's Instagram ID to the IDs of their `top_n` most
# similar other respondents, best match first.
# NOTE: IDs are used as dict keys, so rows sharing an ID (or missing one)
# collapse into a single entry, with the last such row winning.
matches = {}
for pos, person_id in enumerate(instagram_ids):
    scores = similarity_matrix[pos]
    # A stable sort on the negated scores gives descending similarity while
    # keeping equal scores in row order (same tie-breaking as a stable
    # `sorted(..., reverse=True)`).
    ranked = np.argsort(-scores, kind="stable")
    # Never match a row with itself.
    ranked = ranked[ranked != pos][:top_n]
    top_matches = [instagram_ids[j] for j in ranked]
    # Pad so every entry has exactly `top_n` slots; otherwise building a
    # table with `top_n` columns fails when fewer candidates exist.
    top_matches += [None] * (top_n - len(top_matches))
    matches[person_id] = top_matches

In [None]:
# One row per Instagram ID, with columns "Match 1" .. "Match <top_n>".
match_columns = [f'Match {i+1}' for i in range(top_n)]
matches_df = pd.DataFrame.from_dict(
    matches,
    orient='index',
    columns=match_columns,
)

# Write the table to disk; the ID index becomes the 'Instagram' column.
matches_df.to_csv('matches.csv', index_label='Instagram')

In [None]:
# Show the final match table as this cell's output.
matches_df