In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import *
from scipy.sparse import load_npz

In [60]:
# Extend the module search path with the parent directory before importing the
# project helper (presumably `helper_functions` lives one level above this
# notebook -- TODO confirm).
import sys
sys.path.append("../")
# NOTE(review): `performance` is not referenced in any of the cells visible here.
from helper_functions import performance

# For Dataset 1

In [65]:
# Dataset 1: feature matrices are stored in SciPy's sparse .npz format,
# label vectors as plain NumPy .npy arrays.
xtrain, xtest = load_npz('Data/xtrain.npz'), load_npz('Data/xtest.npz')
ytrain, ytest = np.load('Data/ytrain.npy'), np.load('Data/ytest.npy')

In [4]:
# Baseline classifiers, left at their default hyper-parameters apart from
# max_iter on the logistic regression.
# The tree-based scikit-learn estimators draw random numbers while fitting, so
# their seed is pinned (42 is arbitrary) to make the baseline scores
# reproducible under Restart & Run All.
logreg = LogisticRegression(max_iter=500)
svm = SVC()
tree = DecisionTreeClassifier(random_state=42)
forest = RandomForestClassifier(random_state=42)
# NOTE(review): gnb is never fitted in the cells visible here; GaussianNB
# requires dense input, so it could not be trained on a sparse matrix as-is.
gnb = GaussianNB()
boost = XGBClassifier()

In [5]:
# Train every fitted baseline on the same training split, in a fixed order.
for estimator in (logreg, svm, tree, forest, boost):
    estimator.fit(xtrain, ytrain)

In [7]:
# Class predictions on the test features, one result per baseline model
# (unpacked in the same order the models are listed).
logreg_pred, svm_pred, tree_pred, forest_pred, boost_pred = (
    estimator.predict(xtest)
    for estimator in (logreg, svm, tree, forest, boost)
)

In [8]:
# Report test accuracy for each baseline; each row pairs the printed label
# with the predictions it describes.
accuracy_rows = (
    ("Accuracy of Logistic Regression = ", logreg_pred),
    ("Accuracy of SVC = ", svm_pred),
    ("Accuracy of Decision Tree = ", tree_pred),
    ("Accuracy of Random Forest = ", forest_pred),
    ("Accuracy of XGBoost = ", boost_pred),
)
for label, predictions in accuracy_rows:
    print(label, accuracy_score(ytest, predictions))

Accuracy of Logistic Regression =  0.5829222011385199
Accuracy of SVC =  0.5757115749525616
Accuracy of Decision Tree =  0.3377609108159393
Accuracy of Random Forest =  0.4774193548387097
Accuracy of XGBoost =  0.5263757115749526


In [22]:
# Logistic regression using the SAGA solver with C=1.2 and a larger iteration
# budget. SAGA shuffles the training data, so the seed is pinned (42 is
# arbitrary) to make the fitted coefficients reproducible from run to run.
lr = LogisticRegression(solver='saga', max_iter=1000, C=1.2, random_state=42)
lr.fit(xtrain, ytrain)
# Mean accuracy on xtest/ytest; left as the last expression so the cell shows it.
lr.score(xtest, ytest)

0.5863377609108159

In [35]:
# Support-vector classifier with C=1.5; fit() returns the estimator itself,
# so construction and training are chained.
vector = SVC(C=1.5).fit(xtrain, ytrain)
# Mean accuracy on xtest/ytest, displayed as the cell output.
vector.score(xtest, ytest)

0.5889943074003795

In [63]:
# Gradient-boosted trees with an explicit learning rate and tree depth.
xgb_params = {"learning_rate": 0.2, "max_depth": 5}
cl = XGBClassifier(**xgb_params)
cl.fit(xtrain, ytrain)
# Mean accuracy on xtest/ytest, displayed as the cell output.
cl.score(xtest, ytest)

0.5392789373814042

In [67]:
# Persist the three tuned base models with joblib.
# NOTE(review): the '.pk1' suffix (digit one) looks like a typo for '.pkl';
# it is kept as-is because other code may load these exact paths.
base_model_targets = (
    (lr, 'Models/Dataset 1/logreg.pk1'),
    (vector, 'Models/Dataset 1/svc.pk1'),
    (cl, 'Models/Dataset 1/XGB.pk1'),
)
written_files = [joblib.dump(estimator, target) for estimator, target in base_model_targets]
# Show the return value of the final dump, matching the original cell output.
written_files[-1]

['Models/Dataset 1/XGB.pk1']

In [68]:
# Base learners shared by every stacked ensemble below.
models = [('xgb', cl), ('svc', vector), ('logreg', lr)]

# Three stacks that differ only in the meta-learner placed on top of the base
# learners: logistic regression, SVC and XGBoost (all with default settings).
stack_model_1, stack_model_2, stack_model_3 = (
    StackingClassifier(estimators=models, final_estimator=meta_learner)
    for meta_learner in (LogisticRegression(), SVC(), XGBClassifier())
)

# NOTE(review): the recorded run printed an lbfgs iteration-limit warning here,
# presumably from the default-max_iter LogisticRegression meta-learner.
for stacked in (stack_model_1, stack_model_2, stack_model_3):
    stacked.fit(xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
# Mean test accuracy of each stacked ensemble, printed one per line.
for stacked in (stack_model_1, stack_model_2, stack_model_3):
    print(stacked.score(xtest, ytest))

0.5878557874762809
0.5662239089184061
0.5886148007590133


In [71]:
# Rebuild the first stack with a larger iteration budget for its
# logistic-regression meta-learner, then train it again.
meta_logreg = LogisticRegression(max_iter=1000)
stack_model_1 = StackingClassifier(estimators=models, final_estimator=meta_logreg)
stack_model_1.fit(xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [72]:
# Mean accuracy of stack_model_1 on xtest/ytest.
stack1_accuracy = stack_model_1.score(xtest, ytest)
print(stack1_accuracy)

0.5901328273244781


In [74]:
# Rebuild the second stack with an SVC meta-learner using C=1.5, train it,
# and display its mean accuracy on xtest/ytest.
meta_svc = SVC(C=1.5)
stack_model_2 = StackingClassifier(estimators=models, final_estimator=meta_svc)
stack_model_2.fit(xtrain, ytrain)
stack_model_2.score(xtest, ytest)

0.5719165085388994

In [75]:
# Persist the three stacked ensembles with joblib.
# NOTE(review): the '.pk1' suffix (digit one) looks like a typo for '.pkl';
# it is kept as-is because other code may load these exact paths.
stack_targets = (
    (stack_model_1, 'Models/Dataset 1/stack1.pk1'),
    (stack_model_2, 'Models/Dataset 1/stack2.pk1'),
    (stack_model_3, 'Models/Dataset 1/stack3.pk1'),
)
written_stacks = [joblib.dump(estimator, target) for estimator, target in stack_targets]
# Show the return value of the final dump, matching the original cell output.
written_stacks[-1]

['Models/Dataset 1/stack3.pk1']