In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the data-loading
# cell below reads its input files from underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier

import scipy.sparse as sp

import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

Import data

In [None]:
# Directory on the mounted Drive that holds the pre-built train/test files.
# Change it here once instead of editing every path.
DATA_DIR = '/content/drive/MyDrive/tulu'

# Feature matrices are stored as SciPy sparse .npz archives.
X_train = sp.load_npz(f'{DATA_DIR}/X_train_syn.npz')
X_test = sp.load_npz(f'{DATA_DIR}/X_test_syn.npz')

# Labels are stored as NumPy .npy arrays and wrapped in pandas Series.
y_train = pd.Series(np.load(f'{DATA_DIR}/y_train_syn.npy'))
y_test = pd.Series(np.load(f'{DATA_DIR}/y_test_syn.npy'))

# Fail fast if the files on disk do not belong together.
assert X_train.shape[0] == len(y_train), "X_train / y_train row counts differ"
assert X_test.shape[0] == len(y_test), "X_test / y_test row counts differ"
assert X_train.shape[1] == X_test.shape[1], "train / test feature counts differ"

In [None]:
# Show the repr of the loaded training feature matrix (shape, dtype, storage format).
X_train

<44236x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 250164 stored elements in Compressed Sparse Row format>

In [None]:
# Show the training labels Series (pandas truncates long output automatically).
y_train

0        3
1        3
2        3
3        3
4        3
        ..
44231    2
44232    2
44233    2
44234    2
44235    2
Length: 44236, dtype: int64

Models

In [None]:
def modelling(tf_x_train, y_train, tf_x_test, y_test, **kwargs):
    """Fit the selected baseline classifiers and print a test-set report for each.

    A model is run when its flag is passed as a truthy keyword argument.
    Models always run in the fixed order listed below, regardless of the
    order in which the keywords are given.

    Flags
    -----
    mb  : MultinomialNB
    rf  : RandomForestClassifier
    lr  : LogisticRegression (L1, liblinear)
    svm : LinearSVC
    dt  : DecisionTreeClassifier
    knn : KNeighborsClassifier
    mlp : MLPClassifier

    Parameters
    ----------
    tf_x_train : feature matrix passed to each estimator's ``fit``.
    y_train    : labels passed to each estimator's ``fit``.
    tf_x_test  : feature matrix passed to each estimator's ``predict``.
    y_test     : true labels compared against the predictions.
    **kwargs   : model flags (see above); unknown or falsy flags are ignored.

    Returns
    -------
    int
        Always ``1``.
    """

    def _fit_and_report(title, clf):
        # Train on the training split, predict the held-out split, then print
        # the weighted-average F1 followed by the full report dictionary.
        clf.fit(tf_x_train, y_train)
        y_test_pred = clf.predict(tf_x_test)
        # Each model gets its own freshly computed report. The previous LR
        # branch stored its result under a different name and then printed the
        # stale report of whichever model had run before it.
        report = classification_report(y_test, y_test_pred, output_dict=True)
        print(title)
        print("------------------")
        print('F1 Score in Weighted Average: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    # (flag, heading, factory). Factories are lambdas so that an estimator is
    # only constructed when its flag is actually enabled.
    candidates = [
        ('mb', "Multinomial Naive Bayes:",
         lambda: MultinomialNB(alpha=0.1)),
        # Params checked
        ('rf', "Random Forest:",
         lambda: RandomForestClassifier(criterion='entropy', max_depth=8, max_features='log2',
                                        n_estimators=500, random_state=42, min_samples_split=7)),
        ('lr', "LR:",
         lambda: LogisticRegression(C=10.0, max_iter=10000, penalty='l1', solver='liblinear')),
        ('svm', "SVM:",
         lambda: LinearSVC(random_state=0, max_iter=5000, C=0.1, penalty='l2')),
        # Params checked
        ('dt', "Decision Tree:",
         lambda: DecisionTreeClassifier(ccp_alpha=0.0001, criterion='gini', max_depth=100,
                                        min_samples_split=10, random_state=1024)),
        # Params checked
        ('knn', "KNN:",
         lambda: KNeighborsClassifier(n_neighbors=1)),
        ('mlp', "Multi-Layer Perceptron:",
         lambda: MLPClassifier(random_state=1, max_iter=300)),
    ]

    for flag, title, build in candidates:
        if kwargs.get(flag):
            _fit_and_report(title, build())

    return 1

In [None]:
# Run the single-model baselines (everything except the MLP) on the loaded split.
modelling(
    X_train,
    y_train,
    X_test,
    y_test,
    mb=True,
    rf=True,
    lr=True,
    svm=True,
    dt=True,
    knn=True,
)

Multinomial Naive Bayes:
------------------
F1 Score in Weighted Average:  0.6362525315500142
{'0': {'precision': 0.25, 'recall': 0.2736842105263158, 'f1-score': 0.2613065326633166, 'support': 95}, '1': {'precision': 0.35294117647058826, 'recall': 0.4444444444444444, 'f1-score': 0.39344262295081966, 'support': 54}, '2': {'precision': 0.5705882352941176, 'recall': 0.5950920245398773, 'f1-score': 0.5825825825825826, 'support': 163}, '3': {'precision': 0.8764044943820225, 'recall': 0.7878787878787878, 'f1-score': 0.8297872340425531, 'support': 297}, 'accuracy': 0.625615763546798, 'macro avg': {'precision': 0.5124834765366821, 'recall': 0.5252748668473564, 'f1-score': 0.516779743059818, 'support': 609}, 'weighted avg': {'precision': 0.6504217417304, 'recall': 0.625615763546798, 'f1-score': 0.6362525315500142, 'support': 609}}


Random Forest:
------------------
F1 Score in Weighted Average:  0.6255448202480735
{'0': {'precision': 0.29545454545454547, 'recall': 0.2736842105263158, 'f1-score

1

In [None]:
def modelling_ensemble(tf_x_train, y_train, tf_x_test, y_test, **kwargs):
    """Fit the selected ensemble classifiers and print a test-set report for each.

    A model is run when its flag is passed as a truthy keyword argument.
    Models always run in the fixed order listed below.

    Flags
    -----
    ada : AdaBoostClassifier
    ovr : OneVsRestClassifier wrapping a liblinear LogisticRegression
    xgb : xgboost.XGBClassifier
    gb  : GradientBoostingClassifier

    Parameters
    ----------
    tf_x_train : feature matrix passed to each estimator's ``fit``.
    y_train    : labels passed to each estimator's ``fit``.
    tf_x_test  : feature matrix passed to each estimator's ``predict``.
    y_test     : true labels compared against the predictions.
    **kwargs   : model flags (see above); unknown or falsy flags are ignored.

    Returns
    -------
    int
        Always ``1``.
    """

    def _fit_and_report(title, clf):
        # Always use the data handed to this function. The AdaBoost and
        # OneVsRest branches used to read the notebook-level X_train / X_test
        # globals and silently ignored the arguments.
        clf.fit(tf_x_train, y_train)
        y_test_pred = clf.predict(tf_x_test)
        report = classification_report(y_test, y_test_pred, output_dict=True)
        print(title)
        print("------------------")
        print('F1 Score in Weighted Average: ', report['weighted avg']['f1-score'])
        print(report)
        print()
        print()

    def _build_ovr():
        # One binary liblinear logistic regression per class.
        lr_classifier = LogisticRegression(C=10, solver='liblinear', max_iter=10000)
        return OneVsRestClassifier(lr_classifier)

    def _build_xgb():
        # No explicit objective: the regression objective 'reg:squarederror'
        # passed previously is not a classification objective, so the
        # classifier is left to choose its own.
        # Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
        # NOTE(review): the note above says max_depth=7 but the code has always
        # used 10 - confirm which value is intended.
        return xgb.XGBClassifier(colsample_bytree=0.9, learning_rate=0.1,
                                 max_depth=10, n_estimators=200)

    # (flag, heading, factory). Factories defer construction until a flag is set.
    candidates = [
        ('ada', "Adaboost:",
         lambda: AdaBoostClassifier(learning_rate=0.5, n_estimators=300)),
        ('ovr', "OneVsRest:", _build_ovr),
        ('xgb', "XGBoost: ", _build_xgb),
        ('gb', "Gradient Boost: ",
         lambda: GradientBoostingClassifier(n_estimators=50, learning_rate=0.5,
                                            max_depth=10, random_state=0)),
    ]

    for flag, title, build in candidates:
        if kwargs.get(flag):
            _fit_and_report(title, build())

    return 1

In [None]:
# Run the AdaBoost, one-vs-rest and gradient-boosting ensembles (XGBoost is left off).
modelling_ensemble(
    X_train,
    y_train,
    X_test,
    y_test,
    ada=True,
    ovr=True,
    gb=True,
)

Adaboost:
------------------
F1 Score in Weighted Average:  0.5804650796983588
{'0': {'precision': 0.25, 'recall': 0.3368421052631579, 'f1-score': 0.2869955156950673, 'support': 95}, '1': {'precision': 0.40625, 'recall': 0.24074074074074073, 'f1-score': 0.3023255813953488, 'support': 54}, '2': {'precision': 0.45064377682403434, 'recall': 0.6441717791411042, 'f1-score': 0.5303030303030303, 'support': 163}, '3': {'precision': 0.8935185185185185, 'recall': 0.6498316498316499, 'f1-score': 0.7524366471734893, 'support': 297}, 'accuracy': 0.5632183908045977, 'macro avg': {'precision': 0.5001030738356382, 'recall': 0.4678965687441632, 'f1-score': 0.46801519364173394, 'support': 609}, 'weighted avg': {'precision': 0.6313915199052834, 'recall': 0.5632183908045977, 'f1-score': 0.5804650796983588, 'support': 609}}


OneVsRest:
------------------
F1 Score in Weighted Average:  0.6303225649165293
{'0': {'precision': 0.29896907216494845, 'recall': 0.30526315789473685, 'f1-score': 0.30208333333333337

1

In [None]:
# Voting Classifier
# Majority ("hard") vote across three individually configured base models.

svm_clf = LinearSVC(C=0.1, penalty='l2', max_iter=5000, random_state=0)
rf_clf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=8,
    max_features='log2',
    min_samples_split=7,
    random_state=42,
)
lr_clf = LogisticRegression(penalty='l1', solver='liblinear', C=10.0, max_iter=10000)

voting_clf = VotingClassifier(
    estimators=[('lr', lr_clf), ('rf', rf_clf), ('svm', svm_clf)],
    voting='hard',
)
voting_clf.fit(X_train, y_train)

# Score the held-out split.
y_pred = voting_clf.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)

print("Voting Classifier: ", "------------------", sep="\n")
print('F1 Score in Weighted Average: ', report['weighted avg']['f1-score'])
print(report)
print("\n")

Voting Classifier: 
------------------
F1 Score in Weighted Average:  0.6487200803524548
{'0': {'precision': 0.3434343434343434, 'recall': 0.35789473684210527, 'f1-score': 0.35051546391752575, 'support': 95}, '1': {'precision': 0.35384615384615387, 'recall': 0.42592592592592593, 'f1-score': 0.38655462184873957, 'support': 54}, '2': {'precision': 0.5674157303370787, 'recall': 0.6196319018404908, 'f1-score': 0.5923753665689149, 'support': 163}, '3': {'precision': 0.8689138576779026, 'recall': 0.7811447811447811, 'f1-score': 0.822695035460993, 'support': 297}, 'accuracy': 0.6403940886699507, 'macro avg': {'precision': 0.5334025213238697, 'recall': 0.5461493364383257, 'f1-score': 0.5380351219490433, 'support': 609}, 'weighted avg': {'precision': 0.6605749338411098, 'recall': 0.6403940886699507, 'f1-score': 0.6487200803524548, 'support': 609}}




In [None]:
# Stacking Classifier
# Three base models (linear SVM, random forest, multinomial NB) feed an
# L1-regularised logistic regression that acts as the final estimator.

clf1 = LinearSVC(random_state=0,  max_iter=5000, C=0.1, penalty = 'l2')
clf2 = RandomForestClassifier(criterion='entropy', max_depth=8, max_features='log2',
                       n_estimators=500, random_state=42, min_samples_split = 7)
clf3 = MultinomialNB(alpha=0.1)

meta_clf = LogisticRegression(C=10.0, max_iter=10000, penalty='l1', solver='liblinear')

# clf1 is a LinearSVC, so it is registered as 'svm'; it was previously
# mislabelled 'knn', which made named_estimators_ misleading.
stacking_clf = StackingClassifier(
    estimators=[('svm', clf1), ('rf', clf2), ('mb', clf3)],
    final_estimator=meta_clf,
)
stacking_clf.fit(X_train, y_train)

# Score the held-out split.
y_pred = stacking_clf.predict(X_test)
report = classification_report(y_test, y_pred,output_dict=True)

print("Stacking Classifier: ")
print("------------------")
print('F1 Score in Weighted Average: ', report['weighted avg']['f1-score'])
print(report)
print()
print()

Stacking Classifier: 
------------------
F1 Score in Weighted Average:  0.6495373950973585
{'0': {'precision': 0.29473684210526313, 'recall': 0.29473684210526313, 'f1-score': 0.29473684210526313, 'support': 95}, '1': {'precision': 0.3728813559322034, 'recall': 0.4074074074074074, 'f1-score': 0.3893805309734514, 'support': 54}, '2': {'precision': 0.5988023952095808, 'recall': 0.6134969325153374, 'f1-score': 0.6060606060606061, 'support': 163}, '3': {'precision': 0.8472222222222222, 'recall': 0.8215488215488216, 'f1-score': 0.8341880341880342, 'support': 297}, 'accuracy': 0.6469622331691297, 'macro avg': {'precision': 0.5284107038673174, 'recall': 0.5342975008942075, 'f1-score': 0.5310915033318387, 'support': 609}, 'weighted avg': {'precision': 0.6524883146789829, 'recall': 0.6469622331691297, 'f1-score': 0.6495373950973585, 'support': 609}}




In [None]:
# Bagging Classifier
# Bagging ensemble built from ten Multinomial Naive Bayes base estimators.

base_classifier = MultinomialNB(alpha=0.1)
bagging_classifier = BaggingClassifier(
    base_classifier,
    random_state=42,
    n_estimators=10,
)

# Fit on the training split, then score the held-out split.
bagging_classifier.fit(X_train, y_train)
y_test_pred = bagging_classifier.predict(X_test)
report = classification_report(y_test, y_test_pred, output_dict=True)

print("Bagging Classifier: ", "------------------", sep="\n")
print('F1 Score in Weighted Average: ', report['weighted avg']['f1-score'])
print(report)


Bagging Classifier: 
------------------
F1 Score in Weighted Average:  0.648981522454134
{'0': {'precision': 0.28, 'recall': 0.29473684210526313, 'f1-score': 0.28717948717948716, 'support': 95}, '1': {'precision': 0.35384615384615387, 'recall': 0.42592592592592593, 'f1-score': 0.38655462184873957, 'support': 54}, '2': {'precision': 0.5909090909090909, 'recall': 0.6380368098159509, 'f1-score': 0.6135693215339233, 'support': 163}, '3': {'precision': 0.8768656716417911, 'recall': 0.7912457912457912, 'f1-score': 0.8318584070796461, 'support': 297}, 'accuracy': 0.6403940886699507, 'macro avg': {'precision': 0.525405229099259, 'recall': 0.5374863422732328, 'f1-score': 0.529790459410449, 'support': 609}, 'weighted avg': {'precision': 0.6608456134704205, 'recall': 0.6403940886699507, 'f1-score': 0.648981522454134, 'support': 609}}


In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from scipy.sparse import csr_matrix

# Bidirectional LSTM baseline: every feature column is fed to the network as
# one time step carrying a single value.

# Densify into NEW names and add a trailing axis of size 1, giving
# (samples, features, 1). X_train / X_test / y_test are deliberately left
# untouched so this cell can be re-run and the scikit-learn cells above keep
# working afterwards.
X_train_seq = np.expand_dims(X_train.toarray(), axis=-1)
X_test_seq = np.expand_dims(X_test.toarray(), axis=-1)

# Define the LSTM model
num_classes = len(np.unique(y_train))
model = Sequential()
# The input shape belongs on the Bidirectional wrapper, which is the first
# layer of the model, not on the wrapped LSTM.
model.add(Bidirectional(LSTM(128), input_shape=X_train_seq.shape[1:]))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Encode labels: map the raw labels to 0..num_classes-1, then one-hot encode
# them for the categorical cross-entropy loss.
le = LabelEncoder()
le.fit(np.unique(y_train))
y_train_encoded = to_categorical(le.transform(y_train), num_classes=num_classes)

# Train the model (the last 20% of the training rows is held out for validation)
model.fit(X_train_seq, y_train_encoded, validation_split=0.2, epochs=10, batch_size=128)

# Evaluate the model
y_test_idx = le.transform(y_test)
y_test_encoded = to_categorical(y_test_idx, num_classes=num_classes)
loss, accuracy = model.evaluate(X_test_seq, y_test_encoded)
print("Test loss:", loss)
print("Test accuracy:", accuracy)

# Generate predictions: the most probable class index per test row
y_pred = np.argmax(model.predict(X_test_seq), axis=1)

# Print classification report
# Rows are labelled with the original class names. `labels` keeps every class
# in the table even if it never occurs, and `zero_division=0` reports 0.0
# without a warning for classes the model never predicts.
target_names = [str(c) for c in le.classes_]
print(classification_report(
    y_test_idx,
    y_pred,
    labels=np.arange(num_classes),
    target_names=target_names,
    zero_division=0,
))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 1.3637601137161255
Test accuracy: 0.31602373719215393
              precision    recall  f1-score   support

           0       0.17      0.52      0.26       114
           1       0.00      0.00      0.00        81
           2       0.00      0.00      0.00       164
           3       0.47      0.49      0.48       315

    accuracy                           0.32       674
   macro avg       0.16      0.25      0.18       674
weighted avg       0.25      0.32      0.27       674



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
