In [None]:
import numpy as np
import pandas as pd

from statistics import mean
import gensim.downloader as api
from gensim.models import Word2Vec
import regex as re

from sklearn.metrics.pairwise import cosine_similarity

import spacy

In [None]:
def get_data(path='C:/Users/marke/DSCP/compost_data/MacPherson Feeding Logs.xlsx'):
    """Load the feeding-log workbook and return the sheets used by the pipeline.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the Excel workbook. Defaults to the path this notebook
        was originally written against, so ``get_data()`` keeps working.

    Returns
    -------
    tuple
        ``(carbon_index, df2, df3, tank1)`` where ``carbon_index`` is the
        'Emission Carbon Index' sheet with its first row promoted to the
        header and the 'S/N' column dropped, ``df2`` is the
        'Sep to Dec 2023' sheet, ``df3`` is the 'Jan to Jun 2024' sheet and
        ``tank1`` is the 'Tank 1' column of ``df2``.
    """
    # The context manager releases the workbook handle even if a sheet is
    # missing and read_excel raises.
    with pd.ExcelFile(path) as xls:
        carbon_index = pd.read_excel(xls, 'Emission Carbon Index')
        df2 = pd.read_excel(xls, 'Sep to Dec 2023')
        df3 = pd.read_excel(xls, 'Jan to Jun 2024')

    # The first data row of the sheet holds the real column names.
    new_header = carbon_index.iloc[0]
    # Copy the remaining rows so the edits below act on an independent frame
    # instead of a slice of the original (avoids SettingWithCopy issues).
    carbon_index = carbon_index.iloc[1:].copy()
    carbon_index.columns = new_header
    carbon_index = carbon_index.drop('S/N', axis=1)

    tank1 = df2['Tank 1']

    return carbon_index, df2, df3, tank1

In [None]:
# Function to remove 'distilled water' segment
def remove_distilled_water(text):
    """Return ``text`` with every '<digits>g of distilled water' entry removed.

    Distilled water cannot be given a carbon value, so its entries (plus an
    optional trailing comma and any following whitespace) are dropped before
    the quantities and food names are parsed.
    """
    water_entry = r'\d+g of distilled water,?\s*'
    cleaned = re.sub(water_entry, '', text)
    return cleaned

In [None]:
def quantity_dictionary(row):
    """Extract the numeric quantities from one feeding-log entry.

    The quantities are collected before the numbers are stripped from the
    text, so they can later be paired with the food items.

    Parameters
    ----------
    row : str or missing value
        A single cell of the feeding log.

    Returns
    -------
    list of str
        The numbers found in ``row`` in order of appearance (despite the
        function's name, a list and not a dict). Decimal values such as
        ``'2.5'`` are kept as one quantity. Missing or non-text cells give
        an empty list.
    """
    if not isinstance(row, str):
        # NaN / None / other non-text cells carry no parsable quantities.
        return []

    # The optional fractional part keeps "2.5g" as a single quantity; a bare
    # r'\d+' would split it into "2" and "5" and shift every later quantity
    # onto the wrong food item.
    return re.findall(r'\d+(?:\.\d+)?', row)

In [None]:
def split_text(row):
    """Split a feeding-log entry into candidate food words.

    Parameters
    ----------
    row : str, list of str, or missing value
        One log entry. A list of strings is joined with single spaces first.

    Returns
    -------
    list of str
        The space-separated words of ``row`` with any word containing a digit
        removed, descriptor words (see ``unwanted_words``) removed, and
        surrounding commas stripped. Missing values give an empty list.
    """
    unwanted_words = {'of', 'dry', 'dried', 'wet', 'crushed', 'tops', 'tops,', 'skins', 'skins,', 'coffee'}

    # The list check has to come before the null check: pd.notna()/pd.isna()
    # on a list return an array, and using a multi-element array as an `if`
    # condition raises "truth value of an array is ambiguous".
    if isinstance(row, list):
        row = ' '.join(row)
    elif pd.isna(row):
        return []

    # Drop quantity tokens (anything containing a digit) and descriptor words.
    # The unwanted-word test runs on the raw token, commas included, which is
    # why the set lists both 'tops' and 'tops,'.
    filtered_words = [
        word
        for word in row.split(" ")
        if not any(char.isdigit() for char in word)
        and word.lower() not in unwanted_words
    ]

    # Strip separator commas and discard tokens that end up empty.
    stripped_words = (word.strip(',') for word in filtered_words)
    return [word for word in stripped_words if word]

In [None]:
def data_processing(tank1):
    """Clean the 'Tank 1' feeding log and derive the lookup structures.

    NOTE(review): this function reads ``carbon_index`` from module scope
    rather than taking it as an argument, so that name must exist globally
    before the function is called.

    Parameters
    ----------
    tank1 : pandas.Series
        The raw 'Tank 1' column; the Series must be named 'Tank 1'.

    Returns
    -------
    tuple
        ``(food_dict, food_info, tank1, quantity_dict, processed_sentences)``:

        * ``food_dict`` - row label -> 'Food Name' from ``carbon_index``.
        * ``food_info`` - 'Food Name' -> list of that row's remaining values.
        * ``tank1`` - one-column DataFrame with the cleaned log text.
        * ``quantity_dict`` - Series holding, per entry, the list of quantity
          strings (a Series despite the name).
        * ``processed_sentences`` - Series holding, per entry, the list of
          food words after alias replacement.
    """
    food_dict = carbon_index['Food Name'].to_dict()
    food_info = carbon_index.set_index('Food Name').T.to_dict('list')

    # regex=False everywhere: these are literal replacements, and '(' / ')'
    # are regex metacharacters whose treatment otherwise depends on the
    # pandas version's default for `regex`.
    tank1 = pd.DataFrame(tank1.str.replace('1 spoon', '15g', regex=False))
    tank1['Tank 1'] = (
        tank1['Tank 1']
        .str.replace('\n', ', ', regex=False)
        .str.replace('(', ' ', regex=False)
        .str.replace(')', ' ', regex=False)
        # Blank out missing entries first; astype(str) alone would turn NaN
        # into the literal word "nan", which would then be treated as a food.
        .fillna('')
        .astype(str)
        .apply(remove_distilled_water)
    )

    quantity_dict = tank1['Tank 1'].apply(quantity_dictionary)
    df_split = tank1['Tank 1'].apply(split_text)

    # Map shorthand words onto replacement names. NOTE(review): presumably
    # these match 'Food Name' entries in the carbon index - confirm.
    # Building new lists per entry avoids label-based df_split[i] lookups,
    # which only work when the index is exactly 0..n-1.
    aliases = {"grinds": "Coffee Grounds", "grounds": "Grounded Coffee"}
    processed_sentences = df_split.apply(
        lambda words: [aliases.get(word, word) for word in words]
    )

    return food_dict, food_info, tank1, quantity_dict, processed_sentences


In [None]:
def get_phrase_vector(phrase, model):
    """Return the average embedding of the words in ``phrase``.

    The phrase is lower-cased and split on whitespace; only words present in
    ``model`` contribute. If no word is found, a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    tokens = phrase.lower().split()
    known_vectors = [model[token] for token in tokens if token in model]

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
def model_training(model):
    """Build the phrase -> vector lookup for the reference food names.

    No training happens here: every value of the module-level ``food_dict``
    is embedded with ``get_phrase_vector`` using the supplied ``model``.

    NOTE(review): ``food_dict`` is read from module scope, so it must be
    defined globally before this function is called.

    Returns
    -------
    dict
        Maps each food name to its phrase vector.
    """
    reference_vectors = {}
    for food_name in food_dict.values():
        reference_vectors[food_name] = get_phrase_vector(food_name, model)
    return reference_vectors

In [None]:
def find_most_similar_phrase(phrase):
    """Return the reference phrase closest to ``phrase``, or ``None``.

    ``phrase`` is embedded with the module-level ``model`` and compared by
    cosine similarity against every vector in the module-level
    ``word_list_vectors``. The top-scoring key is returned only when its
    score is strictly above 0.5; otherwise the result is ``None``.
    """
    query_vector = get_phrase_vector(phrase, model)

    scored = []
    for name, vector in word_list_vectors.items():
        score = cosine_similarity([query_vector], [vector])[0][0]
        scored.append((name, score))

    # Highest score first; the sort is stable, so ties keep dict order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    if not scored:
        return None

    best_name, best_score = scored[0]
    return best_name if best_score > 0.5 else None

In [None]:
def find_most_similar_phrase(phrase):
    """Pick the best-matching reference phrase for ``phrase``.

    NOTE(review): a function with this same name is already defined in an
    earlier cell; this later definition is the one bound after the cells run
    in order. One of the two can be deleted.

    Uses the module-level ``model`` and ``word_list_vectors``. Returns the
    key with the highest cosine similarity when that similarity exceeds 0.5,
    and ``None`` otherwise (including when there are no candidates).
    """
    target = get_phrase_vector(phrase, model)
    scores = {
        label: cosine_similarity([target], [candidate])[0][0]
        for label, candidate in word_list_vectors.items()
    }

    ranking = sorted(scores.items(), key=lambda entry: entry[1], reverse=True)
    if ranking:
        top_label, top_score = ranking[0]
        if top_score > 0.5:
            return top_label
    return None

In [None]:
def word_similarity(processed_sentences):
    """Match each entry's food words to known foods and total their carbon.

    NOTE(review): relies on module-level ``process_list`` (not defined in
    the visible part of this notebook), ``food_info`` and ``quantity_dict``.

    Parameters
    ----------
    processed_sentences : sequence of list of str
        Per log entry, the candidate food words.

    Returns
    -------
    pandas.DataFrame
        Columns 'Tank 1' (the input lists), 'Replaced Phrases' (the output of
        ``process_list`` with ``None`` entries removed) and
        'Carbon Value (g)' (sum over the entry of carbon value x quantity).
    """
    df = pd.DataFrame({'Tank 1': processed_sentences})

    replaced = df['Tank 1'].apply(process_list)

    # Build filtered lists instead of calling list.remove() while looping
    # over the same list: removal during iteration skips the next element,
    # so consecutive None entries survived and later failed in food_info[None].
    # A row that is None as a whole is left as None and scored as 0.
    df['Replaced Phrases'] = replaced.apply(
        lambda items: None if items is None
        else [item for item in items if item is not None]
    )

    row_list = []
    for index_number, items in enumerate(df['Replaced Phrases']):
        row_total = 0
        if items:
            # NOTE(review): quantities are paired with the remaining items
            # purely by position, so any dropped or multi-word item shifts
            # the pairing - confirm this is the intended alignment.
            for position, item in enumerate(items):
                # First value stored for the food; presumably its carbon
                # value per gram - TODO confirm against the index sheet.
                carbon_value = food_info[item][0]
                quantity = float(quantity_dict[index_number][position])
                row_total += carbon_value * quantity
        row_list.append(row_total)

    df['Carbon Value (g)'] = row_list

    return df

In [None]:
# Embedding model fetched through gensim's downloader; the helper functions
# in this notebook read it as the module-level name `model`.
WORD2VEC_MODEL_NAME = "word2vec-google-news-300"
model = api.load(WORD2VEC_MODEL_NAME)


In [None]:
from flask import Flask, request, render_template, jsonify

app = Flask(__name__)


# Registered without a `methods` argument, so this route answers GET requests.
@app.route('/')
def entire_process():
    """Run the whole pipeline and return the total carbon value as text.

    Loads the workbook, cleans the 'Tank 1' log, builds the reference
    vectors, matches the food words and sums the 'Carbon Value (g)' column.

    Returns
    -------
    str
        The summed carbon value, used as the HTTP response body.
    """
    # The helper functions look these names up at module scope, not as
    # arguments. Binding them as globals makes the pipeline work on a fresh
    # kernel instead of raising NameError (or silently using stale values
    # left behind by an earlier interactive run).
    global carbon_index, food_dict, food_info, quantity_dict, word_list_vectors

    carbon_index, df2, df3, tank1 = get_data()
    food_dict, food_info, tank1, quantity_dict, processed_sentences = data_processing(tank1)
    word_list_vectors = model_training(model)

    df = word_similarity(processed_sentences)
    return str(df['Carbon Value (g)'].sum())


# In a notebook __name__ is "__main__", so the server still starts here;
# the guard only stops it from starting if this code is imported elsewhere.
if __name__ == '__main__':
    app.run()