# Install necessary dependencies

In [0]:
!pip install xlrd

# Read data

In [0]:
import pandas
import copy
import numpy as np

In [0]:
# Load the spreadsheet holding the comments and rankings into a DataFrame.
data_source = pandas.read_excel(io='data.xlsx')

In [0]:
# Load the keyword vocabulary: only the first line of the file is read, and it
# is treated as a comma-separated list of (optionally single-quoted) words.
with open("bags_of_words.txt","r") as f:
  keywords = f.readline()
# Clean up the words: drop the newline and the quotes, then split on commas.
keywords = keywords.replace("\n","").replace("'","").split(",")
# Strip spaces and lower-case every keyword so it can match the lower-cased
# tokens looked up by convert_sentence_to_feature.
keywords = [o.replace(" ","").lower() for o in keywords]
# Drop empty entries (e.g. from a trailing comma) - they would otherwise match
# the empty tokens produced when a comment containing double spaces is split
# on " " - and remove duplicates while keeping first-seen order.
keywords = list(dict.fromkeys(o for o in keywords if o))

In [0]:
# Zero-initialised counter template: one entry per keyword, in keyword order.
keywordsMap = dict.fromkeys(keywords, 0)

In [0]:
def convert_sentence_to_feature(sentence, vocabulary=None):
  """Turn a tokenised sentence into a bag-of-words count vector.

  Args:
    sentence: iterable of word strings (e.g. the result of ``str.split``).
      Matching is case-insensitive on the sentence side: every word is
      lower-cased before it is looked up.
    vocabulary: optional iterable of keywords (a list, or a dict whose keys
      are the keywords). When omitted, the notebook-level ``keywordsMap``
      is used.

  Returns:
    A list of ints with one count per keyword, in vocabulary order.
  """
  if vocabulary is None:
    vocabulary = keywordsMap
  # A fresh zeroed counter per call; the values are plain ints, so no deep
  # copy of the template is needed and the vocabulary itself is never mutated.
  bag_of_words = dict.fromkeys(vocabulary, 0)
  for word in sentence:
    token = word.lower()
    if token in bag_of_words:
      bag_of_words[token] += 1
  return list(bag_of_words.values())

In [0]:
# Pull the raw comment column out of the DataFrame as a plain list.
all_comments = data_source['Comment'].tolist()

In [0]:
# Shallow copy of the comment list (no per-comment cleaning is applied here).
all_comments = list(all_comments)

In [0]:
# Tokenise every comment on single spaces; consecutive spaces yield empty tokens.
all_comments = list(map(lambda text: text.split(" "), all_comments))

In [0]:
# One bag-of-words count vector per tokenised comment.
comment_features = list(map(convert_sentence_to_feature, all_comments))

In [0]:
# Binary target per comment: 1 when the ranking is above 2, otherwise 0.
comment_labels = [int(ranking > 2) for ranking in data_source["Ranking"]]

# Algorithms

## Naive Bayes

In [0]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [0]:
# Hold out 20% of the comments for evaluation. A fixed random_state makes the
# split, and therefore every score computed from it, reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    comment_features, comment_labels, test_size=0.2, random_state=42)

In [0]:
# Multinomial naive Bayes classifier, constructed with the library defaults.
mnb = MultinomialNB()

In [0]:
# Train the naive Bayes model on the training split of the comment features.
mnb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# Score the naive Bayes model on the held-out comments.
# NOTE: `accuracy` is overwritten by the later model sections, so the value is
# displayed here (a bare assignment shows nothing in the cell output).
accuracy = mnb.score(X_test,y_test)
accuracy

1.0

## KNN

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
# k-nearest-neighbours classifier that considers the 3 nearest neighbours.
neighbour = KNeighborsClassifier(n_neighbors=3)

In [0]:
# Train the KNN model on the same training split used for naive Bayes.
neighbour.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [0]:
# Score the KNN model on the held-out comments.
# NOTE: this replaces the value stored in `accuracy` by the previous model, so
# the score is displayed here (a bare assignment shows nothing in the output).
accuracy = neighbour.score(X_test,y_test)
accuracy

1.0

## SVM

In [0]:
from sklearn import svm

In [0]:
# Support vector classifier, constructed with the library defaults.
clf = svm.SVC()

In [0]:
# Train the SVM on the same training split used for the other models.
clf.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [0]:
# Score the SVM on the held-out comments.
# NOTE: this replaces the value stored in `accuracy` by the previous model, so
# the score is displayed here (a bare assignment shows nothing in the output).
accuracy = clf.score(X_test,y_test)
accuracy

1.0

# Patient Classifier

In [0]:
# Give every distinct user id an empty placeholder list, in first-seen order.
userMap = {}
for user in data_source['User Id']:
  userMap.setdefault(user, [])

In [0]:
# Replace each user's placeholder with the bag-of-words vectors of all of that
# user's comments, in row order.
for user in userMap:
  sentences = [
    convert_sentence_to_feature(text.split(" "))
    for text in data_source.loc[data_source['User Id'] == user, 'Comment']
  ]
  userMap[user] = sentences

In [0]:
# One raw label per user, in userMap order: the overall opinion recorded on
# the first of that user's rows.
user_labels = []
for user in userMap:
  overall_opinion = data_source.loc[data_source['User Id'] == user, 'Overall Ranking Opinon'].tolist()[0]
  user_labels.append(overall_opinion)

In [0]:
# Switch whatever algorithm you like
chosen_algorithm = mnb
# Per-comment predictions for every user, in userMap order.
user_features = [chosen_algorithm.predict(features) for features in userMap.values()]

In [0]:
# Display the per-comment predictions computed for each user.
user_features

In [0]:
# Average each user's per-comment predictions into a single score.
user_predictions = list(map(np.mean, user_features))

In [0]:
# A user is predicted 1 only when the mean of their comment predictions
# is strictly above 0.5.
user_predictions = [int(prediction > 0.5) for prediction in user_predictions]
user_predictions

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [0]:
# Binarise the overall opinion with the same "> 2" threshold used for comment_labels.
# NOTE: this overwrites the raw scores in place, so re-running this cell on its
# own maps every label to 0.
user_labels = [int(label > 2) for label in user_labels]
user_labels

[1, 0, 0, 1, 1, 1, 1, 1, 0, 0]

In [0]:
def get_result(user_predictions,user_labels):
  """Print the fraction of users whose prediction equals their label.

  Args:
    user_predictions: sequence of predicted labels, one per user.
    user_labels: sequence of true labels, aligned with ``user_predictions``.

  Raises:
    ValueError: if the two sequences differ in length, or if they are empty
      (the accuracy would be undefined).
  """
  total_size = len(user_labels)
  # A length mismatch used to either raise IndexError or silently score only a
  # prefix of the predictions; fail loudly instead.
  if len(user_predictions) != total_size:
    raise ValueError(
        "user_predictions and user_labels must have the same length "
        "({} != {})".format(len(user_predictions), total_size))
  if total_size == 0:
    raise ValueError("cannot compute accuracy without any labels")
  correct = sum(
      1 for predicted, actual in zip(user_predictions, user_labels)
      if predicted == actual)
  # float() keeps this a true division regardless of the Python version.
  print("The accuracy is {}".format(float(correct) / total_size))

In [0]:
# Report how often the aggregated per-user prediction matches the user's
# binarised overall opinion.
get_result(user_predictions,user_labels)

The accuracy is 0.4
