# 1. Loading the Necessary Libraries

In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression #Using simple logistic regression for saved model
import pickle

# 2. Getting Pima Indians Diabetes Data from GitHub

In [2]:
# Raw-file URL of the Pima Indians Diabetes CSV in the jbrownlee/Datasets repository
url = (
    "https://raw.githubusercontent.com/"
    "jbrownlee/Datasets/master/"
    "pima-indians-diabetes.data.csv"
)

# 3. Attaching appropriate column names

In [3]:
# Column labels to attach: eight feature columns followed by the target column 'class'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
# Passing names= makes pandas use these labels instead of taking a header row from the file
dataframe = pd.read_csv(url, names=names)

# 4. Selecting X and Y values

In [4]:
# Pull the underlying NumPy array out of the pandas DataFrame
array = dataframe.values

# Columns 0-7 are the features (X); column 8, the last one, is the target (Y)
X, Y = array[:, :8], array[:, 8]

# 5. Performing Train Test Split

In [5]:
# Fixed seed, passed as random_state below so the split can be reproduced
seed = 777
# Hold out 33% of the rows for testing; the remaining 67% are used for training
test_size = 0.33
# sklearn's train_test_split returns the pieces in the order
# X_train, X_test, Y_train, Y_test
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# 6. Exporting Test Data

In [6]:
# The held-out arrays are wrapped in DataFrames so they can be written out as CSV.
# index=False keeps the pandas row index out of the files; without it each CSV
# would gain one extra leading column.

# Features of the test split
X_test = pd.DataFrame(X_test)
X_test.to_csv('X_test.csv', index=False)

# Labels of the test split
Y_test = pd.DataFrame(Y_test)
Y_test.to_csv('Y_test.csv', index=False)

# 7. Training the Model

In [7]:
# Train a logistic regression with default settings on the 67% training split
model = LogisticRegression()
# Kept as a bare expression so the notebook still displays the fitted estimator
model.fit(X=X_train, y=Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# save the model to disk
filename = 'finalized_model.sav'
# Use a context manager so the file handle is flushed and closed as soon as the
# dump finishes; a bare open() inside the call leaves the handle open, and the
# pickle may not be fully written when the file is read back later.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# 8. Re-loading Test Data

In [10]:
# Read each exported CSV back with pandas and take .values straight away,
# because the model is given plain arrays rather than DataFrames
X_test_loaded = pd.read_csv('X_test.csv').values
Y_test_loaded = pd.read_csv('Y_test.csv').values

# 9. Loading Model

In [12]:
# load the model from disk
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# created yourself or otherwise trust.
# The context manager closes the file handle once the model has been read.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Score the reloaded model against the reloaded test data
result = loaded_model.score(X_test_loaded, Y_test_loaded)
# You can also get the predicted class for each test row
predictions = loaded_model.predict(X_test_loaded)

# Print the score. In the recorded run this is 0.7559, i.e. the model is about
# 75.6% accurate between the actual Y_test and the predicted Y
print(result)

0.7559055118110236
