## Imports

In [10]:
from azure.cosmos.exceptions import CosmosResourceExistsError
import azure.cosmos.cosmos_client as cosmos_client
from azure.cosmos.partition_key import PartitionKey

from dotenv import dotenv_values
from BagOfWords import BOW
from nb import NaiveBayes
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import mean_squared_error
import numpy as np


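## Data Initialization & Naive Bayes
Here, we load the labelled entries from Cosmos DB, shuffle them, and split them 80/20 into training and test sets. Bag of Words turns each split into its 'X' (feature) and 'Y' (label) arrays. We then fit a Complement Naive Bayes classifier on the training split and report its error rate on the test split.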

In [32]:
# --- Cosmos DB connection ---------------------------------------------------
# Host, master key, database id and container id are all read from the local
# .env file, so no credentials live in the notebook itself.
config = dotenv_values('.env')
client = cosmos_client.CosmosClient(
    config['AZURE_SQL_HOST'],
    {'masterKey': config['AZURE_SQL_MASTER_KEY']},
    user_agent="CosmosDBPythonQuickstart",
    user_agent_overwrite=True,
)

db = client.create_database_if_not_exists(id=config['AZURE_SQL_DATABASE_ID'])
print("Database with id '{0}' initialized".format(config['AZURE_SQL_DATABASE_ID']))

container = db.create_container_if_not_exists(
    id=config['AZURE_SQL_CONTAINER_ID'],
    partition_key=PartitionKey(path='/ethical_tag'),
    offer_throughput=1000,
)
# NOTE(review): `nb` is not used anywhere else in this cell - confirm whether a
# later cell relies on it.
nb = NaiveBayes()
print("Container with id '{0}' initialized".format(config['AZURE_SQL_CONTAINER_ID']))

# --- Load and partition the entries -----------------------------------------
items = list(container.read_all_items())

# Walk the entries once, routing each to its label bucket by 'ethical_tag'.
ethicals_full = []
unethicals = []
for item in items:
    if item['ethical_tag']:
        ethicals_full.append(item)
    else:
        unethicals.append(item)

# Capped subset of the ethical entries. It is not part of `dataset` below;
# swap it in for `ethicals_full` (and adjust the slice) to train on more or
# less ethical data.
ethicals = ethicals_full[:2472]

# --- Build, shuffle and split the dataset -----------------------------------
dataset = ethicals_full + unethicals

# Fixed seed so the in-place shuffle yields the same order for the same input.
np.random.seed(69420)
np.random.shuffle(dataset)

# First 80% of the shuffled entries train the model; the remainder is the test set.
split_at = int(0.8 * len(dataset))
train, test = dataset[:split_at], dataset[split_at:]

# --- Bag-of-Words features --------------------------------------------------
bow_train = BOW(train)
trainX, trainY = bow_train.X, bow_train.y

# NOTE(review): the test split gets its own BOW instance - confirm that both
# instances map words to the same feature columns.
bow_test = BOW(test)
testX, testY = bow_test.X, bow_test.y

# --- Fit and evaluate -------------------------------------------------------
cnb = ComplementNB().fit(trainX, trainY)
pred = cnb.predict(testX)

# XOR flags every position where label and prediction disagree
# (presumably both are boolean arrays - TODO confirm).
diffs = testY ^ pred
# Despite the name this is a fraction of the test set, not a percentage.
error_pct = sum(diffs) / len(pred)
# Multiplying by 1 turns the labels numeric before computing the MSE.
mse = mean_squared_error(1 * testY, 1 * pred)

print("error percent: " + str(error_pct))
print("mse: " + str(mse))

# Debugging aid: uncomment to view labels and predictions side by side.
# print(np.stack([testY, pred]))

Database with id 'ethiclassifier' initialized
Container with id 'data' initialized
error percent: 0.3522842639593909
mse: 0.3522842639593909


In [None]:
# Ad-hoc check: run one hand-written sentence through the fitted classifier.
# The entry mirrors the keys used by the stored items ('ethical_tag', 'text').
data = np.array([
    {
        'ethical_tag': True,
        'text': "Help others when you can!",
    },
])

# Build features for the single entry, then predict from them.
x = BOW(data)
predic = cnb.predict(x.X)
print(predic)