In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

# Load the drug classification dataset (drug_class.csv) into a DataFrame.
source_csv = 'drug_class.csv'
df = pd.read_csv(source_csv)


In [39]:
# Clean data
# Separate the dependent feature (the 'Drug' label) from the independent
# features. Labels are lower-cased so the lookup below is case-insensitive.
Y = df['Drug'].str.lower()
X = df.drop('Drug', axis=1)

# Integer codes for every categorical column.
drugTypes = {'drugy':1, 'drugx':2, 'druga':3, 'drugb':4, 'drugc':5}
sexCodes = {'F': 1, 'M': 2}
levelCodes = {'LOW': 0, 'NORMAL': 1, 'HIGH': 2}


def _encode(column, codes):
    """Return `column` with every category replaced by its integer code.

    Parameters:
        column: pandas Series of category labels.
        codes: dict mapping each expected label to an integer.

    Returns:
        An int64 Series with the same index as `column`.

    Raises:
        ValueError: if `column` holds a value (including a missing value)
            that has no entry in `codes`.
    """
    # Series.map gives a deterministic result dtype; Series.replace on an
    # object column relies on silent downcasting, which pandas deprecates.
    encoded = column.map(codes)
    # map() yields NaN for anything without a code; report it here rather
    # than letting it surface later as an opaque estimator error.
    unmapped = column[encoded.isna()].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column {column.name!r} contains values without a code: {unmapped}"
        )
    return encoded.astype('int64')


# Replace string values with numbers.
Y = _encode(Y, drugTypes)
# assign() builds a new frame and keeps the original column order.
X = X.assign(
    Sex=_encode(X['Sex'], sexCodes),
    BP=_encode(X['BP'], levelCodes),
    Cholesterol=_encode(X['Cholesterol'], levelCodes),
)

# Split X and Y data into testing (20%) and training (80%) sets.
# random_state is fixed so the split is reproducible.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=7)


In [42]:
# Fit a LogisticRegression classifier, predict on the test split, and score it.

# Configure the estimator: liblinear solver, capped at 100 iterations.
model = LogisticRegression(solver='liblinear', max_iter=100)
# Train on the training split (fit() returns the estimator itself).
model.fit(x_train, y_train)
# Predict labels for the held-out test features.
predict = model.predict(x_test)

# Micro-averaged F1 of the predictions against the true test labels.
f1 = f1_score(y_true=y_test, y_pred=predict, average='micro')
print(f1)

# Show the true labels next to the predicted ones for a manual comparison.
print(y_test.tolist())
print(predict)

0.75
[2, 1, 1, 1, 5, 5, 1, 1, 3, 2, 4, 1, 4, 1, 1, 5, 1, 1, 2, 3, 1, 2, 1, 1, 4, 3, 3, 2, 1, 1, 1, 2, 2, 5, 4, 4, 1, 1, 4, 2]
[1 1 1 1 2 2 1 2 3 2 4 1 3 1 1 2 1 1 1 3 1 2 1 1 4 3 3 2 1 1 1 2 2 2 4 2 1
 1 2 2]


In [41]:
# Fit a Gaussian Naive Bayes classifier, predict on the test split, and score it.

# Build the estimator with its default settings, then train it on the
# training split (fit() returns the estimator itself).
model = GaussianNB()
model.fit(x_train, y_train)
# Predict labels for the held-out test features.
predict = model.predict(x_test)

# Micro-averaged F1 of the predictions against the true test labels.
f1 = f1_score(y_true=y_test, y_pred=predict, average='micro')
print(f1)

# Show the true labels next to the predicted ones for a manual comparison.
print(y_test.tolist())
print(predict)

0.8000000000000002
[2, 1, 1, 1, 5, 5, 1, 1, 3, 2, 4, 1, 4, 1, 1, 5, 1, 1, 2, 3, 1, 2, 1, 1, 4, 3, 3, 2, 1, 1, 1, 2, 2, 5, 4, 4, 1, 1, 4, 2]
[2 1 1 1 5 5 3 5 3 2 4 1 3 1 1 5 1 3 2 3 1 2 4 3 4 3 3 2 1 1 1 2 2 5 4 4 4
 3 4 2]
