In [82]:
import pandas
import numpy
from sklearn import *
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import pickle

Load dataset

In [56]:
# Location of the abalone CSV; the URL embeds a commit hash, so the file
# fetched here does not change when the hosting repository is updated.
DATA_FILE_URL = 'https://raw.githubusercontent.com/SasidharSekar/Classification-abalone/9b88eddc963afccf46d569eb5132310b2725120d/abalone-data.csv'

# The file is read with header=None, so the column labels are supplied here.
col_names = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole Weight',
    'Shucked Weight',
    'Viscera Weight',
    'Shell Weight',
    'Rings',
]

data = pandas.read_csv(
    DATA_FILE_URL,
    sep=',',
    quotechar='"',
    header=None,
    names=col_names,
)

View Data Distribution

In [None]:
# DataFrame.size is the total number of cells (rows x columns).
print("Data Size: %d" % data.size)

# First ten records, summary statistics, and the number of records for each
# distinct 'Rings' value -- printed one after another.
print(
    data.head(10),
    data.describe(),
    data.groupby('Rings').size(),
    sep='\n',
)

# Pairwise correlations over every column except the first one ('Sex').
excl_gender = data.iloc[:, 1:]
print(excl_gender.corr())

Visualize Data Distribution

In [None]:
# Histogram of each column of the frame.
data.hist()
pyplot.show()

# Box plots of every column except the last one ('Rings').
X = data[data.columns[:-1]]
X.boxplot()
pyplot.show()

# Grid of pairwise scatter plots across the frame.
scatter_matrix(data)
pyplot.show()

Model Evaluation Preparation

In [None]:
# Integer-encode the 'Sex' labels into a derived frame instead of overwriting
# the column on `data`. Mutating `data` in place changes what the earlier
# exploration cells display when they are re-run, and re-running this cell
# would refit the encoder on already-encoded integers, discarding the
# original label -> code mapping held in `label_encoder.classes_`.
label_encoder = LabelEncoder()
data_encoded = data.assign(Sex=label_encoder.fit_transform(data['Sex']))

# Split the values into features (all columns but the last) and the target
# (the last column, 'Rings').
array = data_encoded.values
X = array[:, :-1]
y = array[:, -1]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Candidate classifiers as (short name, unfitted estimator) pairs.
models = [
    ("LR", LogisticRegression()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("KNN", KNeighborsClassifier()),
    ("CART", DecisionTreeClassifier()),
    ("NB", GaussianNB()),
    ("SVM", SVC(gamma="auto")),
]

Model Evaluation

In [None]:
# Element counts of the training features and of the training labels.
print(f"{X_train.size} : {y_train.size}")

results, names = [], []

# One splitter serves every model: with shuffle=True and an integer
# random_state each split() call yields the same ten stratified folds.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

for name, model in models:
    # Accuracy of this model on each of the ten folds.
    cv_results = cross_val_score(
        model, X_train, y_train, cv=kfold, scoring='accuracy'
    )
    names.append(name)
    results.append(cv_results)
    # Mean accuracy, with the standard deviation across folds in parentheses.
    print('%s: %f(%f)' % (name, cv_results.mean(), cv_results.std()))

Compare Algorithms

In [None]:
# One box per entry in `results`, labelled with the matching entry of `names`.
pyplot.boxplot(results, tick_labels=names)
# Title spelling corrected ("Comparision" -> "Comparison").
pyplot.title('Algorithm Comparison')
# The plotted values are the 'accuracy' scores collected during
# cross-validation, so label the value axis accordingly.
pyplot.ylabel('Accuracy')
pyplot.show()

Make Predictions

In [None]:
# Fit Linear Discriminant Analysis on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
model = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Predict the held-out validation rows.
predictions = model.predict(X_val)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_val, predictions))

Make individual predictions

In [None]:
# Read a single sample as comma-separated values and classify it with the
# fitted `model`. Every value must be numeric.
str_x_test = input("Enter input parameters as Comma Separated values")

# Parse each field explicitly so that blank or non-numeric input is reported
# with the offending text rather than as an opaque array-conversion error.
x_test = [field.strip() for field in str_x_test.split(",")]
try:
    X_test = numpy.array([float(field) for field in x_test])
except ValueError:
    raise ValueError(
        "Expected comma-separated numeric values, got: %r" % str_x_test
    ) from None

# predict() expects a 2-D array: one row (this sample), one column per value.
X_test = X_test.reshape(1, -1)

# Check the number of values against what the model was fitted on, when the
# estimator exposes that information.
expected_features = getattr(model, "n_features_in_", None)
if expected_features is not None and X_test.shape[1] != expected_features:
    raise ValueError(
        "Expected %d values but received %d"
        % (expected_features, X_test.shape[1])
    )

prediction = model.predict(X_test)
print(prediction)

[13.]


Save Model to File

In [83]:
# Serialize the fitted model to disk. The `with` block closes the file as
# soon as the dump finishes, so the bytes are flushed before anything reads
# the file back and no file handle is left open.
filename = 'final_model_classification_abalone.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Load Model from File and predict

In [None]:
# Restore the model from `filename` and score it on the validation split.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files from a trusted source.
with open(filename, 'rb') as model_file:
    # The `with` block guarantees the file handle is closed after loading.
    model = pickle.load(model_file)

predictions = model.predict(X_val)
print(accuracy_score(y_val, predictions))