In [1]:

from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import pandas as pd 
import os
import gzip

os.chdir('C:/Users/Wendy/Documents/the_finalbot/diabetes_chatbot/data_rasa')
#pd.set_option('display.max_colwidth', None)

In [2]:
def read_csvs(directory):
    dataframes = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            dataframes.append(pd.read_csv(filepath))
    return pd.concat(dataframes, ignore_index=True)


In [3]:
final = read_csvs('C:/Users/Wendy/Documents/the_finalbot/diabetes_chatbot/data_rasa')
final

Unnamed: 0,question,answer
0,"Is honey ok to use if I have diabetes, and in ...",Table sugar and honey were added to the ADA’s ...
1,What are the nutritional recommendations for i...,The meal plan recommendations for people with ...
2,"If I have a beer, do I need to count the carbs...",
3,I read that cottage cheese is a good snack to ...,"Cottage cheese is a nutrient-dense food, but i..."
4,What type of foods prevent a blood sugar spike...,You may not be able to prevent a blood sugar s...
...,...,...
183,What is the best way to test for type 1 diabet...,Type 1 diabetes means that without taking insu...
184,How does diabetes affect your mood and thinkin...,"With or without diabetes, when blood sugars ar..."
185,Will losing a large amount of weight help me t...,Changing your lifestyle which will ultimately ...
186,Can a type 2 diabetic become a type 1 diabetic...,"Yes, you can become type 1 after years of bein..."


## Data Understanding

In [4]:
class dataUnderstanding(object):
    """ Data Understanding class"""
    
    def __init__(self, df):
        self.shape = df.shape
        self.info = df.info
        self.duplicates = df.duplicated().sum()
        self.missing = df.isna().sum()
        self.dtypes = df.dtypes

In [5]:
# Instantiate the class
understanding = dataUnderstanding(final)

In [6]:
#shape of data
print(f'The data has a shape of {understanding.shape[0]} rows and {understanding.shape[1]} columns')

The data has a shape of 188 rows and 2 columns


In [7]:
#summary of dataframe
understanding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  188 non-null    object
 1   answer    187 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB


In [8]:
#the data types of the data
understanding.dtypes

question    object
answer      object
dtype: object

In [9]:
#checking for missing values
understanding.missing

question    0
answer      1
dtype: int64

This dataset has one missing value in the answers column

In [10]:
#checking for duplicates
understanding.duplicates

1

This datset has no duplicates

## Data preparation

### Completeness

Here we treat the missing values

In [11]:
final.dropna(axis=0, how='any', inplace=True)
final

Unnamed: 0,question,answer
0,"Is honey ok to use if I have diabetes, and in ...",Table sugar and honey were added to the ADA’s ...
1,What are the nutritional recommendations for i...,The meal plan recommendations for people with ...
3,I read that cottage cheese is a good snack to ...,"Cottage cheese is a nutrient-dense food, but i..."
4,What type of foods prevent a blood sugar spike...,You may not be able to prevent a blood sugar s...
5,What is a good pre-bedtime snack to prevent lows?,Many people do not need a bedtime snack prior ...
...,...,...
183,What is the best way to test for type 1 diabet...,Type 1 diabetes means that without taking insu...
184,How does diabetes affect your mood and thinkin...,"With or without diabetes, when blood sugars ar..."
185,Will losing a large amount of weight help me t...,Changing your lifestyle which will ultimately ...
186,Can a type 2 diabetic become a type 1 diabetic...,"Yes, you can become type 1 after years of bein..."


In [12]:
#checking whether the missing value has been dropped
final.isna().sum()

question    0
answer      0
dtype: int64

In [13]:
#Join the questions and answers column into one column dubbed qa
final['qa'] = final['question'].apply(lambda x:str(x)) + '' + final['answer'].apply(lambda x:str(x))

In [14]:
final

Unnamed: 0,question,answer,qa
0,"Is honey ok to use if I have diabetes, and in ...",Table sugar and honey were added to the ADA’s ...,"Is honey ok to use if I have diabetes, and in ..."
1,What are the nutritional recommendations for i...,The meal plan recommendations for people with ...,What are the nutritional recommendations for i...
3,I read that cottage cheese is a good snack to ...,"Cottage cheese is a nutrient-dense food, but i...",I read that cottage cheese is a good snack to ...
4,What type of foods prevent a blood sugar spike...,You may not be able to prevent a blood sugar s...,What type of foods prevent a blood sugar spike...
5,What is a good pre-bedtime snack to prevent lows?,Many people do not need a bedtime snack prior ...,What is a good pre-bedtime snack to prevent lo...
...,...,...,...
183,What is the best way to test for type 1 diabet...,Type 1 diabetes means that without taking insu...,What is the best way to test for type 1 diabet...
184,How does diabetes affect your mood and thinkin...,"With or without diabetes, when blood sugars ar...",How does diabetes affect your mood and thinkin...
185,Will losing a large amount of weight help me t...,Changing your lifestyle which will ultimately ...,Will losing a large amount of weight help me t...
186,Can a type 2 diabetic become a type 1 diabetic...,"Yes, you can become type 1 after years of bein...",Can a type 2 diabetic become a type 1 diabetic...


In [15]:
# Extract the 'qa' column from the 'final' dataframe and convert it to a list
final_qalist = final['qa'].to_list()

# Display the first 5 elements of the final_qalist list
final_qalist[0:5]


['Is honey ok to use if I have diabetes, and in what proportion? Or should I try something like Stevia or Swerve?Table sugar and honey were added to the ADA’s list of food choices many years ago, when nutrition research determined that a carbohydrate is a carbohydrate. This was at the same time that rapid-acting insulin came to market, which allowed much more flexibility in the diet. For an easy comparison, one tablespoon of honey or table sugar equals 15 grams of carbohydrate, and affects blood sugar the same as a medium orange. Honey can be a carbohydrate choice as part of a balanced snack, such as using a teaspoon with plain Greek yogurt, which adds the equivalent of 5 grams of carbohydrate. If you want to save your calorie and carbohydrate budget for a meal, you can use a sugar substitute that is calorie free. The choice is yours!',
 'What are the nutritional recommendations for individuals (adults) with type 1 diabetes in terms of daily carbohydrate intake? How does this mesh (if 

In [16]:
# create a list of lists where each sublist is a list of words from a document
sentence_stream = [doc.split(" ") for doc in final_qalist]

# Display the first 5 elements of the sentence_stream list
sentence_stream[0:5]


[['Is',
  'honey',
  'ok',
  'to',
  'use',
  'if',
  'I',
  'have',
  'diabetes,',
  'and',
  'in',
  'what',
  'proportion?',
  'Or',
  'should',
  'I',
  'try',
  'something',
  'like',
  'Stevia',
  'or',
  'Swerve?Table',
  'sugar',
  'and',
  'honey',
  'were',
  'added',
  'to',
  'the',
  'ADA’s',
  'list',
  'of',
  'food',
  'choices',
  'many',
  'years',
  'ago,',
  'when',
  'nutrition',
  'research',
  'determined',
  'that',
  'a',
  'carbohydrate',
  'is',
  'a',
  'carbohydrate.',
  'This',
  'was',
  'at',
  'the',
  'same',
  'time',
  'that',
  'rapid-acting',
  'insulin',
  'came',
  'to',
  'market,',
  'which',
  'allowed',
  'much',
  'more',
  'flexibility',
  'in',
  'the',
  'diet.',
  'For',
  'an',
  'easy',
  'comparison,',
  'one',
  'tablespoon',
  'of',
  'honey',
  'or',
  'table',
  'sugar',
  'equals',
  '15',
  'grams',
  'of',
  'carbohydrate,',
  'and',
  'affects',
  'blood',
  'sugar',
  'the',
  'same',
  'as',
  'a',
  'medium',
  'orange.',
 

In [29]:
# instantiate Phrases model using the sentence stream, a minimum count of 1, 
# a threshold of 0.1 and a delimiter of ' '. 
bigram = Phrases(sentence_stream, min_count=1, threshold=0.1, delimiter=' ')

# Create a memory-efficient Phraser object from the Phrases model
bigram_phraser = Phraser(bigram)



In [30]:
tokens_list = []
for sent in sentence_stream:
    tokens_ = bigram_phraser[sent]
    tokens_list.append(tokens_)

In [31]:
print(tokens_list[0:5])

[['Is', 'honey', 'ok', 'to use', 'if I', 'have diabetes,', 'and in', 'what', 'proportion?', 'Or', 'should I', 'try', 'something', 'like', 'Stevia', 'or', 'Swerve?Table', 'sugar and', 'honey', 'were', 'added', 'to the', 'ADA’s', 'list', 'of food', 'choices', 'many years', 'ago,', 'when', 'nutrition', 'research', 'determined', 'that a', 'carbohydrate is', 'a', 'carbohydrate.', 'This', 'was', 'at the', 'same', 'time', 'that', 'rapid-acting', 'insulin', 'came', 'to', 'market,', 'which', 'allowed', 'much more', 'flexibility', 'in the', 'diet.', 'For', 'an', 'easy', 'comparison,', 'one', 'tablespoon', 'of', 'honey', 'or', 'table', 'sugar', 'equals', '15 grams', 'of', 'carbohydrate,', 'and', 'affects', 'blood sugar', 'the same', 'as a', 'medium', 'orange.', 'Honey', 'can be', 'a carbohydrate', 'choice', 'as', 'part of', 'a balanced', 'snack,', 'such as', 'using a', 'teaspoon', 'with', 'plain', 'Greek', 'yogurt,', 'which', 'adds', 'the', 'equivalent', 'of', '5', 'grams of', 'carbohydrate.', 'I

In [32]:
from gensim.models import Word2Vec  # import the Word2Vec class from the gensim library

# Initialize the model
model = Word2Vec(
    tokens_list,  # The list of lists of tokens that we want to train the model on
    vector_size=200,  # The size of the embedding vectors (200 in this case)
    sg=1,  # The training algorithm to use (1 for Skip-Gram, 0 for CBOW)
    window=5  # The context window size (number of words to the left and right of the target word)
)


In [35]:
model.wv.most_similar('food')



[('the', 0.9980940818786621),
 ('blood sugar', 0.9980012774467468),
 ('that', 0.9979858994483948),
 ('and', 0.9979685544967651),
 ('or', 0.9979420900344849),
 ('your', 0.9979361295700073),
 ('in', 0.9979110956192017),
 ('any', 0.9978705048561096),
 ('are', 0.9978391528129578),
 ('to', 0.9978213310241699)]

In [36]:
#finding most similar words
model.wv.save_word2vec_format('word2vec.txt')


In [38]:

# Open the file 'word2vec.txt' in binary read mode
fp = open('word2vec.txt', 'rb')

# Read the contents of the file into a variable named 'data'
data = fp.read()

# Convert the contents of the file into a bytearray
bindata = bytearray(data)

# Use the gzip module to open a file named 'word2vec.txt.gz' in binary write mode
# and write the contents of the 'bindata' variable to this new file
with gzip.open('word2vec.txt.gz', 'wb') as f:
    f.write(bindata)
