In [None]:
import numpy as npy
import pandas as pds
import matplotlib.pyplot as mplt
import seaborn as sbn
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler as stl
%matplotlib inline

# Read the training and test CSVs from the working directory, then stack them
# row-wise into a single frame for the exploratory cells that follow.
# NOTE(review): the file name suggests the training split was SMOTE-resampled — confirm.
train_data = pds.read_csv('trainDataSMOTE.csv')
test_data = pds.read_csv('testData.csv')
my_data = pds.concat(
    objs=[train_data, test_data],
    axis=0,
)
# Last expression of the cell: summary statistics are shown as the cell output.
my_data.describe()

In [None]:
# Print the (rows, columns) of the combined frame, then collect every row that
# exactly repeats an earlier row and preview the first few of them.
print(my_data.shape)
duplicate_mask = my_data.duplicated()
duplicate_data = my_data[duplicate_mask]
duplicate_data.head()

In [None]:
# Count missing values per column of the combined frame (isnull is an alias of isna).
my_data.isnull().sum()

In [None]:
from sklearn.feature_selection import SelectKBest as skb
from sklearn.feature_selection import chi2

# Score the first 15 columns against the last column with a chi-squared test
# and rank the features by that score (highest first).
# NOTE(review): my_data is train + test concatenated, so the ranking is
# computed with the test rows included — confirm this is intended.
attributes = my_data.iloc[:, :15]
target_class = my_data.iloc[:, -1]

# fit() returns the selector itself; only its per-feature scores_ are used here.
obj = skb(score_func=chi2)
values = obj.fit(attributes, target_class)

# One single-column frame for the feature names, one for the scores.
data_columns = pds.DataFrame(attributes.columns)
data_scores = pds.DataFrame(values.scores_)

# Place names and scores side by side, label the columns, then sort.
attributes_scores = pds.concat([data_columns, data_scores], axis=1)
attributes_scores.columns = ['Name', 'Score']
attributes_scores = attributes_scores.sort_values(
    by='Score',
    ascending=False,
)
attributes_scores
# Disabled alternative: keep only the 10 best-scoring columns directly.
#final_attributes = skb(chi2, k=10).fit_transform(attributes, target_class)
#print(final_attributes.shape,' ',final_attributes[0:10])

In [None]:
# Horizontal bar chart of the feature scores held in attributes_scores
# (one bar per 'Name', bar length = 'Score').
mplt.cla()
mplt.clf()
mplt.figure(figsize=(5, 10))
sbn.barplot(data=attributes_scores, x='Score', y='Name')
# Disabled cosmetic tweaks kept for reference:
#mplt.box(False)
#mplt.grid(True)
mplt.xlabel('Importance\n ', fontsize=15)
mplt.ylabel('Features \n', fontsize=15)
mplt.title('Feature importance', fontsize=18)
# Global default for axis-label size; set after the labels above were created.
mplt.rcParams['axes.labelsize'] = 10
mplt.show()

In [None]:
# Correlation of every column with the last column of the correlation matrix,
# printed as text and drawn as a one-column annotated heatmap saved to disk.
mplt.cla()
mplt.clf()
corrmat = my_data.corr()
# The last column is sorted by the name 'TenYearCHD' below, so it is expected
# to be the target column.
c = pds.DataFrame(corrmat.iloc[:, -1])
# Sort once and reuse the result for both the printout and the heatmap.
c_ranked = c.sort_values(by='TenYearCHD', ascending=False)
print(c_ranked)
mplt.figure(figsize=(2, 15))
sbn.heatmap(c_ranked, annot=True, fmt='.2g')
mplt.savefig('featureCorre.png')

In [None]:
# Restrict each frame to the same ten predictor columns plus the target
# ('TenYearCHD', kept as the last column).
selected_columns = [
    'sysBP', 'glucose', 'age', 'totChol', 'male',
    'diaBP', 'prevalentHyp', 'diabetes', 'BPMeds', 'cigsPerDay',
    'TenYearCHD',
]
train_dataF = train_data[selected_columns]
test_dataF = test_data[selected_columns]
my_data2 = my_data[selected_columns]
# Preview the first ten rows of the reduced test frame.
test_dataF.head(10)

In [None]:
# Preview the first ten rows of the reduced training frame.
train_dataF.iloc[:10]

In [None]:
# Split each reduced frame into model inputs (first ten columns) and the
# label (last column).
train_input, train_label = train_dataF.iloc[:, :10], train_dataF.iloc[:, -1]
test_input, test_label = test_dataF.iloc[:, :10], test_dataF.iloc[:, -1]

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression (liblinear solver) on the training split and
# predict labels for the test split. fit() returns the estimator itself.
logistic_model = LogisticRegression(solver='liblinear').fit(
    train_input,
    train_label,
)
logistic_predicts = logistic_model.predict(test_input)

In [None]:
# Mean accuracy of the logistic model on the test split and on the training split.
accuracy = logistic_model.score(test_input, test_label)
accuracyt = logistic_model.score(train_input, train_label)
print(
    'Logistic Regression \nTest accuracy : ',
    accuracy,
    '\nTrain accuracy : ',
    accuracyt,
)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the logistic-regression test predictions, printed as an
# array and drawn as an annotated heatmap with integer cell labels.
mplt.cla()
mplt.clf()
# Disabled: explicit figure size.
#mplt.figure(figsize = (8,4))
cm = confusion_matrix(y_true=test_label, y_pred=logistic_predicts)
print(cm)
sbn.heatmap(cm, annot=True, fmt='d')

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier with probability estimates enabled,
# fitted on the training split and used to predict the test split.
svm_options = {'kernel': 'linear', 'probability': True}
svm_model = SVC(**svm_options)
svm_model.fit(train_input, train_label)

svm_predicts = svm_model.predict(test_input)

In [None]:
# Mean accuracy of the SVM on the test split and on the training split.
accu_svm = svm_model.score(test_input, test_label)
accut_svm = svm_model.score(train_input, train_label)
print(
    'Support Vector Models \nTrain accuracy : ',
    accut_svm,
    '\nTest accuracy : ',
    accu_svm,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def _fit_manhattan_knn(n_neighbors):
    """Return a Manhattan-distance KNN classifier fitted on the training split."""
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='manhattan')
    return knn.fit(train_input, train_label)


# Three neighbourhood sizes are fitted first (1, 5, 10), then each model
# predicts the test split.
k1_model, k5_model, k10_model = [_fit_manhattan_knn(k) for k in (1, 5, 10)]

k1_predicts, k5_predicts, k10_predicts = [
    knn_model.predict(test_input)
    for knn_model in (k1_model, k5_model, k10_model)
]

In [None]:
# Mean accuracy of each KNN model on the test split.
accu_knn1, accu_knn5, accu_knn10 = [
    knn_model.score(test_input, test_label)
    for knn_model in (k1_model, k5_model, k10_model)
]
print(
    'Test accuracy \nKNN 1 : ', accu_knn1,
    '\nKNN 5 : ', accu_knn5,
    '\nKNN 10 : ', accu_knn10,
)

In [None]:
# Mean accuracy of each KNN model on the training split.
accut_knn1, accut_knn5, accut_knn10 = [
    knn_model.score(train_input, train_label)
    for knn_model in (k1_model, k5_model, k10_model)
]
print(
    'Train accuracy \nKNN 1 : ', accut_knn1,
    '\nKNN 5 : ', accut_knn5,
    '\nKNN 10 : ', accut_knn10,
)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: fit on the training split, predict the test split and
# report mean accuracy on both splits. fit() returns the estimator itself.
nb_model = GaussianNB().fit(train_input, train_label)
nb_predicts = nb_model.predict(test_input)

accu_nb = nb_model.score(test_input, test_label)
accut_nb = nb_model.score(train_input, train_label)
print(
    'Naive Bayes \nTrain accuracy : ',
    accut_nb,
    '\nTest accuracy : ',
    accu_nb,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion decision tree limited to depth 5: fit on the training
# split, predict the test split and report mean accuracy on both splits.
DTC_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5)
DTC_model.fit(train_input, train_label)
DTC_predicts = DTC_model.predict(test_input)
accu_dtc = DTC_model.score(test_input, test_label)     # test-split accuracy
accut_dtc = DTC_model.score(train_input, train_label)  # training-split accuracy
# Fix: the two values were printed under each other's label; pair each label
# with the matching score (accut_* = train, accu_* = test, as in the other cells).
print('Decision Tree\nTrain accuracy : ', accut_dtc, '\nTest accuracy : ', accu_dtc)

In [None]:
# Draw the decision tree that was fitted on the training split and save the
# figure to 'DT.png'.
mplt.cla()
mplt.clf()
from sklearn import tree
mplt.figure(figsize=(30,30))
# Fix: plot the already-fitted model. The previous code called
# DTC_model.fit(test_input, test_label) here, which re-trained the tree on the
# test split, so the figure (and any later use of DTC_model) no longer
# matched the model whose accuracy was reported.
tree.plot_tree(DTC_model)
mplt.savefig('DT.png')

In [None]:
# Render the fitted decision tree through Graphviz and display it inline as a PNG.
# Fix: sklearn.externals.six has been removed from scikit-learn, so the old
# import fails on current releases; the standard-library StringIO is the
# drop-in text buffer.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
# export_graphviz writes the DOT source into the in-memory buffer.
dot_data = StringIO()
export_graphviz(DTC_model, out_file=dot_data, filled=True, rounded=True, special_characters=True)
# Parse the DOT text and convert it to PNG bytes for display.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the 10-neighbour KNN, the decision tree and the
# Naive Bayes model; fit on the training split, then print test accuracy
# followed by training accuracy.
ensemble_members = [
    ('1', k10_model),
    ('2', DTC_model),
    ('3', nb_model),
]
vc_model = VotingClassifier(estimators=ensemble_members, voting='soft')
vc_model.fit(train_input, train_label)
vc_test_score = vc_model.score(test_input, test_label)
print(vc_test_score)
vc_train_score = vc_model.score(train_input, train_label)
print(vc_train_score)

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with four hidden layers (10, 15, 10, 15 units),
# logistic activation and the SGD solver; fit on the training split, predict
# the test split, then print test accuracy followed by training accuracy.
NN = MLPClassifier(
    hidden_layer_sizes=(10, 15, 10, 15),
    activation='logistic',
    solver='sgd',
)
NN.fit(train_input, train_label)
NN_predicts = NN.predict(test_input)
for _inputs, _labels in ((test_input, test_label), (train_input, train_label)):
    print(NN.score(_inputs, _labels))