# Clustering
# Unsupervised Learning
- https://projector.tensorflow.org/
- https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html


In [None]:
# Combine 'title', 'directions', and 'ner' (the ingredients column) into a single
# text field for clustering.
# fillna('') stops one missing field from turning the whole concatenation into NaN,
# and astype(str) guarantees the downstream tokenizer always receives a string.
whole_df['combined_text'] = (
    whole_df['title'].fillna('').astype(str)
    + ' '
    + whole_df['directions'].fillna('').astype(str)
    + ' '
    + whole_df['ner'].fillna('').astype(str)
)


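    # Planned pipeline (notes):
    #   1. Embed the recipe text here, e.g. with word2vec, or with Sentence-BERT from
    #      Hugging Face instead of TF-IDF, which gives back a 1024-length embedding
    #      for the text of each recipe.
    #   2. Apply a t-SNE transformation to the embeddings.
    #   3. This results in a length-3 vector for each recipe.
    #   4. Do the clustering on these 3 dimensions.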

# Load the pretrained BERT tokenizer and encoder once; get_tokens reuses them for every row.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


def get_tokens(text):
    """Return BERT's contextual token embeddings for ``text``.

    Args:
        text: The string to embed.

    Returns:
        ``outputs.last_hidden_state`` from the BERT forward pass, i.e. one
        embedding per input token (special tokens included). The sequence
        length therefore varies with the text; pool it before clustering if a
        fixed-length vector per recipe is needed.
    """
    # truncation=True caps the input at the tokenizer's maximum model length.
    # Without it, a long recipe produces more tokens than BERT has position
    # embeddings for and the forward pass fails.
    tokens = tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        return_tensors='pt',
    )
    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        outputs = model(tokens)
    return outputs.last_hidden_state


whole_df['embeddings'] = whole_df['combined_text'].apply(get_tokens)


In [None]:
# Combine 'title', 'directions', and 'ner' (the ingredients column) into a single
# text field for clustering.
# fillna('') stops one missing field from turning the whole concatenation into NaN,
# and astype(str) guarantees the downstream tokenizer always receives a string.
whole_df['combined_text'] = (
    whole_df['title'].fillna('').astype(str)
    + ' '
    + whole_df['directions'].fillna('').astype(str)
    + ' '
    + whole_df['ner'].fillna('').astype(str)
)

# Text preprocessing
def preprocess_text(text):
    """Tokenize ``text`` and lemmatize every token.

    Args:
        text: The string to split with ``word_tokenize``.

    Returns:
        A list holding the ``WordNetLemmatizer`` lemma of each token, in the
        order the tokens appear in ``text``.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(word) for word in word_tokenize(text)]

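    # Planned pipeline (notes):
    #   1. Embed the recipe text here, e.g. with word2vec, or with Sentence-BERT from
    #      Hugging Face instead of TF-IDF, which gives back a 1024-length embedding
    #      for the text of each recipe.
    #   2. Apply a t-SNE transformation to the embeddings.
    #   3. This results in a length-3 vector for each recipe.
    #   4. Do the clustering on these 3 dimensions.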


# Tokenize and lemmatize each recipe's combined text.
whole_df['combined_tokens'] = whole_df['combined_text'].apply(preprocess_text)

# Convert tokenized text into TF-IDF matrix.
# The documents are already token lists, so the tokenizer is the identity function
# and lowercasing (a string operation) is switched off. token_pattern=None tells
# scikit-learn the default regex is intentionally unused, which avoids its
# "token_pattern will not be used since tokenizer is not None" warning.
tfidf = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, token_pattern=None)
tfidf_matrix = tfidf.fit_transform(whole_df['combined_tokens'])

# K-Means clustering
num_clusters = 5  # You can adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # fixed seed for repeatable labels
kmeans.fit(tfidf_matrix)

# Assign cluster labels to the DataFrame (one label per row, in row order)
whole_df['cluster_label'] = kmeans.labels_

# Displaying cluster assignments
cluster_results = pd.DataFrame({
    'Recipe': whole_df['title'],
    'Cluster': whole_df['cluster_label']
})

print(cluster_results)

In [None]:
# Project the high-dimensional TF-IDF matrix onto two SVD components so the
# clusters can be drawn on a flat scatter plot.
svd = TruncatedSVD(n_components=2, random_state=42)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

# Store each component as its own DataFrame column; transposing yields one
# row per component, which unpacks directly into the two columns.
whole_df['X'], whole_df['Y'] = tfidf_matrix_reduced.T

# Scatter plot of the recipes, coloured by their K-Means cluster label.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=whole_df,
    x='X',
    y='Y',
    hue='cluster_label',
    palette='viridis',
    legend='full',
).set(
    title='Clustering of Recipes based on Titles, Directions, and Ingredients',
    xlabel='Dimension 1',
    ylabel='Dimension 2',
)
plt.legend(title='Cluster')
plt.show()

In [None]:
# Calculate TF-IDF matrix.
# The documents are already token lists, so the tokenizer is the identity function
# and lowercasing is switched off. token_pattern=None tells scikit-learn the default
# regex is intentionally unused, which avoids its
# "token_pattern will not be used since tokenizer is not None" warning.
tfidf = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, token_pattern=None)
tfidf_matrix = tfidf.fit_transform(whole_df['combined_tokens'])

# K-Means clustering
num_clusters = 5  # Adjust as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # fixed seed for repeatable labels
kmeans.fit(tfidf_matrix)

# Assign cluster labels to the DataFrame (one label per row, in row order)
whole_df['cluster_label'] = kmeans.labels_

# Cluster centroids: one row per cluster, in TF-IDF feature space
cluster_centroids = kmeans.cluster_centers_

# Number of recipes per cluster, ordered by cluster label
count_in_cluster = whole_df['cluster_label'].value_counts().sort_index()

# Assign Cluster ID
whole_df['cluster_id'] = whole_df['cluster_label'] + 1  # Adding 1 to start cluster ID from 1

# Add Cluster Centroid, Count in Cluster, and Cluster ID to a new DataFrame,
# one row per K-Means label.
# The IDs are generated in label order instead of taken from
# whole_df['cluster_id'].unique(): unique() returns values in order of first
# appearance, while count_in_cluster is sorted by label and the centroid rows
# are indexed by label, so the old columns could be paired with the wrong cluster.
cluster_summary = pd.DataFrame({
    'Cluster ID': list(range(1, len(cluster_centroids) + 1)),
    # Reindex on the full label range so a cluster with no members shows a
    # count of 0 rather than making the columns differ in length.
    'Count in Cluster': count_in_cluster.reindex(
        range(len(cluster_centroids)), fill_value=0
    ).values,
    'Cluster Centroid': cluster_centroids.tolist(),
})

print(cluster_summary)