In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def plot_confusion_matrix(y,y_predict):
    """Plot the confusion matrix of a binary classifier as an annotated heatmap.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_predict : array-like
        Predicted class labels, aligned with ``y``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.

    Notes
    -----
    Assumes the classes are encoded as 0 (did not land) and 1 (landed) --
    TODO confirm against the ``Class`` column of the dataset.
    """
    from sklearn.metrics import confusion_matrix

    # Pin the label order so the matrix is always 2x2; otherwise a sample
    # containing a single class yields a 1x1 matrix and the two tick labels
    # set below no longer match the number of ticks.
    cm = confusion_matrix(y, y_predict, labels=[0, 1])

    fig, ax = plt.subplots()
    # fmt='d' prints the cell counts as plain integers.
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')

    # Use the same wording on both axes.
    class_names = ['did not land', 'landed']
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)
    plt.show()

In [None]:
# Read both CSV files in one pass: the first becomes `data`, the second `X`.
data, X = map(pd.read_csv, ("dataset_part_2.csv", "dataset_part_3.csv"))

In [None]:
# Show the first rows of X, then of data, one after the other.
print(X.head(), data.head(), sep="\n")

In [None]:
#preparing data for fitting into model
Y = data['Class'].to_numpy() #creating numpy array using column 
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X) #standardise dataset X

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 2)

In [None]:
# Logistic regression: grid-search the regularisation strength C with
# 5-fold cross-validation on the training split.
parameters = dict(
    C=[0.01, 0.1, 1],
    penalty=['l2'],      # l1 = lasso, l2 = ridge
    solver=['lbfgs'],
)
lr = LogisticRegression()
logreg_cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=5)
logreg_cv.fit(X_train, Y_train)

In [None]:
# Report the selected hyper-parameters, the best cross-validation score and
# the score on the held-out test split.
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)
print("Accuracy on test data:", logreg_cv.score(X_test, Y_test))
yhat=logreg_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

# Derive the counts from the predictions instead of hard-coding them, so the
# message cannot go stale when the data, split or model changes.
# Assumes class 1 means "landed" and class 0 "not landed" -- TODO confirm.
true_positives = int(np.sum((Y_test == 1) & (yhat == 1)))
false_positives = int(np.sum((Y_test == 0) & (yhat == 1)))
print(f"True Positive - {true_positives} (True label is landed, Predicted label is also landed)\n"
      f"False Positive - {false_positives} (True label is not landed, Predicted label is landed)")

In [None]:
# Support vector machine: grid-search kernel, C and gamma with 10-fold
# cross-validation. Each kernel is listed once -- the grid previously held
# 'rbf' twice, which refitted 25 identical candidates on every fold.
parameters = {'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
              'C': np.logspace(-3, 3, 5),
              'gamma': np.logspace(-3, 3, 5)}
svm = SVC()
svm_cv = GridSearchCV(svm, param_grid=parameters, cv = 10)
svm_cv.fit(X_train, Y_train)

In [None]:
# Predict on the held-out split first, then report the search outcome.
yhat = svm_cv.predict(X_test)

print(
    "tuned hpyerparameters :(best parameters) ",
    svm_cv.best_params_,
)
print(
    "accuracy :",
    svm_cv.best_score_,
)
print(
    "accuracy of test data:",
    svm_cv.score(X_test, Y_test),
)

# Confusion matrix of the test-split predictions.
plot_confusion_matrix(Y_test, yhat)

In [None]:
# Decision tree: grid-search the split criterion, splitter strategy, depth
# and leaf / split sizes with 10-fold cross-validation.
parameters = {'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     'max_depth': [2*n for n in range(1,10)],
     # 'auto' is no longer accepted by DecisionTreeClassifier in recent
     # scikit-learn releases and makes every such candidate fail to fit.
     # For classifiers it was an alias of 'sqrt', so only 'sqrt' is kept.
     'max_features': ['sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10]}

# Fixed seed: the 'random' splitter and the max_features subsampling are
# stochastic, so without it the selected model changes from run to run.
tree = DecisionTreeClassifier(random_state = 2)
tree_cv = GridSearchCV(tree, param_grid = parameters, cv = 10)
tree_cv.fit(X_train, Y_train)

In [None]:
# Predict on the held-out split first, then report the search outcome.
yhat = tree_cv.predict(X_test)

print(
    "tuned hpyerparameters :(best parameters) ",
    tree_cv.best_params_,
)
print(
    "accuracy :",
    tree_cv.best_score_,
)
print(
    "accuracy of test data:",
    tree_cv.score(X_test, Y_test),
)

# Confusion matrix of the test-split predictions.
plot_confusion_matrix(Y_test, yhat)

In [None]:
# k-nearest neighbours: grid-search the neighbour count (1..10), the lookup
# algorithm and the Minkowski power p with 10-fold cross-validation.
parameters = dict(
    n_neighbors=list(range(1, 11)),
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

KNN = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=KNN, param_grid=parameters, cv=10)
knn_cv.fit(X_train, Y_train)

In [None]:
# Predict on the held-out split first, then report the search outcome.
yhat = knn_cv.predict(X_test)

print(
    "tuned hpyerparameters :(best parameters) ",
    knn_cv.best_params_,
)
print(
    "accuracy :",
    knn_cv.best_score_,
)
print(
    "accuracy of test data:",
    knn_cv.score(X_test, Y_test),
)

# Confusion matrix of the test-split predictions.
plot_confusion_matrix(Y_test, yhat)

In [None]:
#plotting bar chart on accuracy comparison of model
# Dedicated names are used so the feature matrix X and the labels Y are not
# overwritten; clobbering them would break re-running the earlier cells.
model_names = ["logistic regression", "support vector machine", "decision tree","k nearest neighbour"]

# Read the best cross-validated accuracy straight from each fitted search
# rather than pasting numbers by hand, so the chart always matches the run.
model_accuracies = [
    logreg_cv.best_score_,
    svm_cv.best_score_,
    tree_cv.best_score_,
    knn_cv.best_score_,
]

fig, ax = plt.subplots()
ax.bar(model_names, model_accuracies, color = "brown")
ax.set_title("Accuracy of models")
ax.set_ylabel("Accuracy")
ax.set_xlabel("Models")
ax.tick_params(axis = "x", labelrotation = 25)
plt.show()