In [None]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Install a blanket "ignore" filter: no category is given, so it applies to
# every Warning subclass for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings the
# libraries below may emit -- consider narrowing it to specific categories.
warnings.simplefilter("ignore")

In [None]:
# Load the liver data set from the project-relative `data` directory.
# A forward slash is used on purpose: the previous 'data\liver.csv' literal
# depended on '\l' being an *invalid* escape sequence (a SyntaxWarning on
# recent Python versions) and only resolved to a real path on Windows.
# Forward slashes are accepted on Windows, Linux and macOS alike.
df = pd.read_csv('data/liver.csv')

# Quick look at the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Last rows of the frame (complements the head() preview above).
df.tail()

In [None]:
# (rows, columns) of the raw data before any cleaning.
df.shape

In [None]:
# Per-column dtypes.
# NOTE(review): the StandardScaler step further down needs every feature
# column to be numeric -- confirm no object/string columns are listed here.
df.dtypes

In [None]:
# Column names, non-null counts, dtypes and memory usage in one summary.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Remove exact duplicate rows; `df` is rebound to the de-duplicated frame,
# so every later cell works on the cleaned data.
df=df.drop_duplicates()

In [None]:
# Shape after de-duplication -- compare with the earlier shape to see how
# many rows were dropped.
df.shape

In [None]:
# Class balance of the target column 'Dataset'.
df['Dataset'].value_counts()

In [None]:
# Missing-value count per column.
# NOTE(review): nothing below imputes or drops missing values, so any
# non-zero count here would reach the estimators unchanged -- verify.
df.isnull().sum()

In [None]:
x = df.drop(columns='Dataset',axis=1)
y = df['Dataset']

In [None]:
print(x)

In [None]:
print(y)

In [None]:
scaler = StandardScaler()
scaler.fit(x)

In [None]:
stad_df = scaler.transform(x)

In [None]:
print(stad_df)

In [None]:
x = stad_df
y = df['Dataset']

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [None]:
print(x_train.shape,x_test.shape)

In [None]:
##using logistic regression
model = LogisticRegression()
model.fit(x_train,y_train)

In [None]:
prediction_lr = model.predict(x_test)
print(prediction_lr)

In [None]:
accuracy_lr = accuracy_score(prediction_lr,y_test)
print(accuracy_lr)

In [None]:
###using Support Vector Machine
import sklearn.svm as svm
classify = svm.SVC(kernel='linear')
classify.fit(x_train,y_train)

In [None]:
x_train_pred = classify.predict(x_train)
train_acc = accuracy_score(x_train_pred,y_train)
print(train_acc)

In [None]:
prediction_svm = classify.predict(x_test)
accuracy_svm = accuracy_score(prediction_svm,y_test)
print(accuracy_svm)

In [None]:
# One shared seed so every stochastic estimator below is reproducible.
SEED = 12345

# (label, unfitted estimator) pairs compared by cross-validation in the next
# cell. The labels are what appear in the printed scores and on the boxplot.
models = [
    ('LR', LogisticRegression(random_state=SEED)),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=SEED)),
    ('RF', RandomForestClassifier(random_state=SEED)),
    ('SVM', SVC(gamma='auto', random_state=SEED)),
    # This is scikit-learn's GradientBoostingClassifier, not XGBoost, so it is
    # labelled 'GBM' rather than the previous, misleading 'XGB'.
    ('GBM', GradientBoostingClassifier(random_state=SEED)),
    ("LightGBM", LGBMClassifier(random_state=SEED)),
]

# evaluate each model in turn
# Accumulators for the per-model fold scores and their labels.
results = []
names = []

In [None]:
# Number of cross-validation folds. Passing an integer as `cv` makes
# cross_val_score use stratified k-fold for classifiers; the KFold object
# that used to be built on every iteration was never passed to it and has
# been removed.
N_SPLITS = 10

# Re-created here so that re-running this cell does not append a second copy
# of every model to the lists (and to the boxplot).
results = []
names = []

# The loop variable is called `estimator`, not `model`, so it no longer
# overwrites the fitted logistic-regression `model` from the earlier cell.
for name, estimator in models:
    # Scaling is done inside the pipeline so that, within each fold, the
    # scaler is fitted on that fold's training part only. Standardising
    # data that is already standardised is harmless, so this is correct
    # whether or not `x` was pre-scaled upstream.
    pipeline = make_pipeline(StandardScaler(), estimator)
    cv_results = cross_val_score(pipeline, x, y, cv=N_SPLITS, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison
# One box per model, showing the spread of its per-fold accuracies.
fig, ax = plt.subplots(figsize=(15, 10))
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel('Accuracy')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=20)
classifier.fit(x_train, y_train)

In [None]:
prediction_rf = classifier.predict(x_test)
accuracy_rf = accuracy_score(prediction_rf,y_test)
print(accuracy_rf)

In [None]:
# Persist the fitted random forest to disk (`pickle` is imported in the top
# import cell).
# NOTE(review): the classifier was trained on standardized features, so the
# fitted `scaler` is needed at inference time as well -- consider saving it
# alongside the model.
filename = 'liver-prediction-model.pkl'

# The context manager guarantees the file is flushed and closed even if
# pickling fails; the previous bare open() never closed the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)