## Imports

In [15]:
from azure.cosmos.exceptions import CosmosResourceExistsError
import azure.cosmos.cosmos_client as cosmos_client
from azure.cosmos.partition_key import PartitionKey

from dotenv import dotenv_values
from BagOfWords import BOW
from nb import NaiveBayes

from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV 
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier


 ## Data Initialization & Naive Bayes
Here, we build our dataset using Bag of Words to initialize our 'X' and 'Y' arrays. Then, we analyze it using our Naive Bayes net.

In [6]:
config = dotenv_values('.env')
client = cosmos_client.CosmosClient(
    config['AZURE_SQL_HOST'],
    {
        'masterKey': config['AZURE_SQL_MASTER_KEY']
    },
    user_agent="CosmosDBPythonQuickstart",
    user_agent_overwrite=True
)
db = client.create_database_if_not_exists(id=config['AZURE_SQL_DATABASE_ID'])
print('Database with id \'{0}\' initialized'.format(config['AZURE_SQL_DATABASE_ID']))
container = db.create_container_if_not_exists(
    id=config['AZURE_SQL_CONTAINER_ID'],
    partition_key=PartitionKey(path='/ethical_tag'),
    offer_throughput=1000
)
nb = NaiveBayes()
print('Container with id \'{0}\' initialized'.format(config['AZURE_SQL_CONTAINER_ID']))

items = container.read_all_items()

Database with id 'ethiclassifier' initialized
Container with id 'data' initialized


In [7]:
config = dotenv_values('.env')
client = cosmos_client.CosmosClient(
    config['AZURE_SQL_HOST'],
    {
        'masterKey': config['AZURE_SQL_MASTER_KEY']
    },
    user_agent="CosmosDBPythonQuickstart",
    user_agent_overwrite=True
)
db = client.create_database_if_not_exists(id=config['AZURE_SQL_DATABASE_ID'])
print('Database with id \'{0}\' initialized'.format(config['AZURE_SQL_DATABASE_ID']))
container = db.create_container_if_not_exists(
    id=config['AZURE_SQL_CONTAINER_ID'],
    partition_key=PartitionKey(path='/ethical_tag'),
    offer_throughput=1000
)
nb = NaiveBayes()
print('Container with id \'{0}\' initialized'.format(config['AZURE_SQL_CONTAINER_ID']))

items = container.read_all_items()
items = list(items)

Database with id 'ethiclassifier' initialized
Container with id 'data' initialized


In [10]:
ethicals_full = [item for item in items if item['ethical_tag']]
ethicals = ethicals_full[:2472]

unethicals = [item for item in items if not item['ethical_tag']]

# Build the dataset with the data entries we get from the 'items' list.
dataset = ethicals + unethicals
np.random.seed(69420)
np.random.shuffle(dataset)

In [70]:

X = list(map(lambda x: x["text"], dataset))
vectorizer = TfidfVectorizer(stop_words='english')

pipe = Pipeline([('vec', vectorizer), ('logreg', LogisticRegression(penalty='l2'))])

pipe2 = Pipeline([('vec', vectorizer), ('cnb', ComplementNB())])

y = list(map(lambda x: x["ethical_tag"], dataset))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

pipe.fit(X_train, y_train)
acc1 = pipe.score(X_test, y_test)
cv1 = np.mean(cross_val_score(pipe, X, y, cv=5))
pred_logreg = pipe.predict(X_test)

pipe2.fit(X_train, y_train)
acc2 = pipe2.score(X_test, y_test)
cv2 = np.mean(cross_val_score(pipe2, X, y, cv=5))
pred_cnb = pipe2.predict(X_test)

print(acc1, acc2)

someone taking parking spaces crowded street refuses reverse feet politely asked so False
0.8068756319514662 0.7917087967644085


In [18]:
grid_params = {
  'alpha': np.linspace(0.01, 0.05, 2),
  'hidden_layer_sizes': [(10,10,5)],
  'learning_rate_init': np.linspace(0.1, 0.5, 2),
  'activation': ['tanh', 'relu'],
}

mlp = MLPClassifier(max_iter=50)
model = GridSearchCV(mlp, grid_params)

pipeNN = Pipeline([('vec', vectorizer), 
                        ('nn', model)])

# cv2 = np.mean(cross_val_score(pipeNN, X, y, cv=5))
# print(cv2)

In [18]:
train = dataset[:int((0.8)*len(dataset))]
test = dataset[int((0.8)*len(dataset)):]

bow_train = BOW(train)
trainX = bow_train.X
trainY = bow_train.y

bow_test = BOW(test)
testX = bow_test.X
testY = bow_test.y
# print(np.stack([testY, pred]))

In [45]:
cnb = ComplementNB()
cnb.fit(trainX, trainY)

pred = cnb.predict(testX)
diffs = testY ^ pred
error_pct = sum(diffs) / len(pred)
print("error percent: " + str(error_pct))
mse = mean_squared_error(1 * testY, 1 * pred)
print("mse: " + str(mse))

scores = cross_val_score(cnb, np.concatenate((trainX, testX)), np.concatenate((trainY, testY)), cv=5)
avg_acc = np.mean(scores)
print(avg_acc)

error percent: 0.4418604651162791
mse: 0.4418604651162791
0.6638169663873458


In [46]:
# we used l2 penalty as we have colinear / codependent features 
logreg = LogisticRegression(penalty='l2')
logreg.fit(trainX, trainY)
print(pred.shape, testY.shape)
diffs = testY ^ pred
error_pct = sum(diffs) / len(pred)
print("error percent: " + str(error_pct))
mse = mean_squared_error(1 * testY, 1 * pred)
print("mse: " + str(mse))

scores = cross_val_score(logreg, np.concatenate((trainX, testX)), np.concatenate((trainY, testY)), cv=5)
avg_acc = np.mean(scores)
print(avg_acc)

(989,) (989,)
error percent: 0.4418604651162791
mse: 0.4418604651162791
0.7040756008400093


In [89]:
ulpt = "Flying on your employer's dime? Book an expensive flight, then call the airline later to change for a cheaper fare (ie earlier/later the same day). Ask for the travel credit to be sent to your personal email. Use it later for personal travel. Your employer will not know."
lpt = "I'm 43. By your late 20's/early 30's, make sure physical fitness becomes an absolute top priority. I started a dedicated fitness regimen when I was 28 to improve my odds with a girl. Didn't work on the girl. What did work was that the routine stuck. Now pushing my mid-forties, I can't believe where I am physically compared to many others my age. Also scary is how they regard physical deterioration as an inevitability. It isn't. Get started now. It will be one of the greatest gifts you'll ever give yourself."

# ulpt = "donate to charities"

data = [ulpt, lpt]
print(pipe.predict(data))

[False  True]


# Graphs & Plots

In [95]:
from plots import plot
from PIL import Image
import pandas as pd

# plotter = plot(y_test, pred_logreg)
# plotter.fpr_fnr()

items = vectorizer.vocabulary_.items()
sorted_arr = sorted(items, key = lambda x: x[1], reverse = True)
top100 = sorted_arr[:100]
print(top100)


[('مساعدة', 13348), ('خط', 13347), ('باللغة', 13346), ('العربية', 13345), ('الأزمات', 13344), ('ɥsᴉlƃuǝ', 13343), ('écoute', 13342), ('zz', 13341), ('zoom', 13340), ('zone', 13339), ('zoes', 13338), ('zodiacs', 13337), ('zirconia', 13336), ('ziptie', 13335), ('zippers', 13334), ('ziplock', 13333), ('ziploc', 13332), ('zip', 13331), ('zimmers', 13330), ('zillow', 13329), ('zest', 13328), ('zeroed', 13327), ('zero', 13326), ('zelle', 13325), ('zealand', 13324), ('zatarains', 13323), ('yummy', 13322), ('yummier', 13321), ('yule', 13320), ('yt', 13319), ('ysk', 13318), ('yrs', 13317), ('ypu', 13316), ('youwhich', 13315), ('youve', 13314), ('youtubevanced', 13313), ('youtubes', 13312), ('youtubers', 13311), ('youtuber', 13310), ('youtube', 13309), ('youthspecific', 13308), ('youth', 13307), ('youre', 13306), ('youngsters', 13305), ('younger', 13304), ('young', 13303), ('youll', 13302), ('youhttps', 13301), ('york', 13300), ('yoinked', 13299), ('yoghurt', 13298), ('yoga', 13297), ('yob', 132

In [103]:
reddit_mask = np.array(Image.open('../images/reddit_logo.png'))
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib as plt

# from https://towardsdatascience.com/how-to-create-beautiful-word-clouds-in-python-cfcf85141214

# A similar function, but using the mask
def generate_better_wordcloud(data, title, mask=None):
    cloud = WordCloud(scale=3,
                      max_words=150,
                      colormap='RdYlGn',
                      mask=mask,
                      background_color='white',
                      collocations=True).generate_from_frequencies(data)
    plt.figure(figsize=(10,8))
    plt.imshow(cloud)
    plt.axis('off')
    plt.title(title)
    plt.show()
    
corpus = pd.DataFrame(top100)
print(corpus)
columns_titles = ["1","0"]
corpus = corpus.reindex(columns=columns_titles)
# Use the function with the rome_corpus and our mask to create word cloud     
generate_better_wordcloud(corpus, 'Rome, Italy', mask=reddit_mask)

     1   0
0  NaN NaN
1  NaN NaN
2  NaN NaN
3  NaN NaN
4  NaN NaN
..  ..  ..
95 NaN NaN
96 NaN NaN
97 NaN NaN
98 NaN NaN
99 NaN NaN

[100 rows x 2 columns]


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().