### Simple logistic regression

In [1]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import jsonlines
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from numpy import mean
from numpy import std


In [2]:
## GET DATA
CLS_TOKEN_INDEX = 0

df = pd.read_json("../MUStARD/data/sarcasm_data.json")

df = df.transpose()

embeddings = []

with jsonlines.open("../MUStARD/data/bert-output.jsonl") as utterances:
   
    for utterance in utterances:

        features = utterance["features"][CLS_TOKEN_INDEX]

        bert_embedding_target = np.mean([np.array(features["layers"][layer]["values"])
                                            for layer in range(4)], axis=0)
        
        embeddings.append(np.copy(bert_embedding_target))



## SPLIT DATA
#test_size = 207
#train_size = len(df) - test_size

output_labels = df["sarcasm"].astype(int)

# Split data into train, validation and test sets
# First, split to train and test sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, output_labels, test_size=0.3, random_state=42)
# Then, split further to train and validation sets
# but only if gridsearchch is not used.
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42) # comment this line if grid search in use

# Normalize input features
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
#X_val_norm = scaler.transform(X_val)
X_test_norm = scaler.transform(X_test)

In [3]:
## FIND THE BEST PARAMETERS FOR LOGISTIC REGRESSION CLASSIFIER

# Define your model
model = LogisticRegression()

# Define the parameter grid to search over
param_grid = {'C': [0.1, 1.0, 3.0, 4.0, 5.0, 6.0, 10.0, 100.0 ],
               'penalty': ['l1', 'l2'],
                 'solver': ["liblinear", "sag", "saga", "newton-cg", "lbfgs"]}

# Perform a grid search to find the best parameters for your model
# NOTE: when using GridSearchCV, there is no need for a separate validation set.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_norm, y_train)

# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.6506317772076277


 0.65063178 0.65063178 0.65063178 0.65063178 0.64380771        nan
 0.6357422         nan        nan 0.6157457  0.61572632 0.62101277
 0.6157457  0.61713127 0.61879051        nan 0.62364476        nan
        nan 0.60690539 0.61169364 0.62046951 0.60690539 0.60673419
 0.60741662        nan 0.61764461        nan        nan 0.60526899
 0.61501287 0.61921532 0.60526899 0.61261987 0.60108117        nan
 0.62224231        nan        nan 0.60676303 0.60889652 0.61921532
 0.60676303 0.60673419 0.58780633        nan 0.62241661        nan
        nan 0.603753   0.61356359 0.61921532 0.603753   0.60676303
 0.58028055        nan 0.61918644        nan        nan 0.61026211
 0.60799642 0.61456415 0.61026211 0.60825709 0.57880595        nan
 0.61456415        nan        nan 0.61513497 0.60926835 0.61456415
 0.61513497 0.61431554]


In [11]:
## EVALUATE ON TEST SET 

# Define your model with the best parameters
# C=1.0 works as well
# {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
model = LogisticRegression(C=0.1, penalty='l2', solver='liblinear')

# Train your model on the entire training set
model.fit(X_train_norm, y_train)

# Make predictions on new data
y_pred = model.predict(X_test_norm)

reportDict = classification_report(y_test, y_pred, output_dict=True)
f1_score = reportDict['weighted avg']['f1-score']
print("TEST F1-score:", f1_score)

print(classification_report(y_test, y_pred))#, output_dict=True, digits=3))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

TEST F1-score: 0.6595012498194501
              precision    recall  f1-score   support

           0       0.64      0.57      0.61        94
           1       0.67      0.73      0.70       113

    accuracy                           0.66       207
   macro avg       0.66      0.65      0.66       207
weighted avg       0.66      0.66      0.66       207

Confusion Matrix: 
 [[54 40]
 [30 83]]


## Ensemble methods

### Bagging

In [14]:
from sklearn.ensemble import BaggingClassifier

# {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
# (C=3.0, penalty='l1', solver='saga')
"""
[LogisticRegression(C=0.1, penalty='l2', solver='liblinear'),
                       LogisticRegression(C=3.0, penalty='l1', solver='saga')]
"""

param_grid = {
    'base_estimator': [LogisticRegression(C=0.1, penalty='l2', solver='liblinear')],
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.8, 1.0],
    'max_features': [0.5, 0.8, 1.0],
    'random_state' : [42],
}

bagging_clf = BaggingClassifier()
grid_search_bagging = GridSearchCV(bagging_clf, param_grid, cv=5, scoring='f1')
grid_search_bagging.fit(X_train_norm, y_train)

# Get the best parameters and the best Bagging model
best_params_bagging = grid_search_bagging.best_params_
best_bagging = grid_search_bagging.best_estimator_

print(f'The best estimator is {best_params_bagging}')

""""
f1_scores = cross_val_score(model, X_val_norm, y_val, cv=5, scoring='f1')
print('CV F1 score: %.3f (%.3f)' % (mean(f1_scores), std(f1_scores)))
reportDict = classification_report(y_val, y_pred, output_dict=True)
f1_score = reportDict['weighted avg']['f1-score']
print("F1-score:", f1_score)
print(classification_report(y_val, y_pred))#, output_dict=True, digits=3))
print("Confusion Matrix: \n", confusion_matrix(y_val, y_pred))
"""

# test and evaluation
test_y_pred = grid_search_bagging.predict(X_test_norm)

reportDict = classification_report(y_test, test_y_pred, output_dict=True)
f1_score = reportDict['weighted avg']['f1-score']
print("TEST F1-score:", f1_score)
print(classification_report(y_test, test_y_pred))#, output_dict=True, digits=3))
print("Confusion Matrix: \n", confusion_matrix(y_test, test_y_pred))

### Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
# (C=3.0, penalty='l1', solver='saga')

param_grid = {
    'base_estimator': [LogisticRegression(C=0.1, penalty='l2', solver='liblinear'),
                       LogisticRegression(C=3.0, penalty='l1', solver='saga')],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1],
    'random_state' : [42],
}

adaboost_clf = AdaBoostClassifier()
grid_search_adaboost = GridSearchCV(adaboost_clf, param_grid, cv=5, scoring='f1')
grid_search_adaboost.fit(X_train_norm, y_train)

# Get the best parameters and the best AdaBoost model
best_params_adaboost = grid_search_adaboost.best_params_
#best_adaboost = grid_search_adaboost.best_estimator_

print(f'The best estimator is {best_params_adaboost}')

"""
# validation and results
y_pred = adaboost.predict(X_val_norm)

f1_scores = cross_val_score(model, X_val_norm, y_val, cv=5, scoring='f1')
print('F1 score: %.3f (%.3f)' % (mean(f1_scores), std(f1_scores)))

reportDict = classification_report(y_val, y_pred, output_dict=True)
f1_score = reportDict['weighted avg']['f1-score']
print("F1-score:", f1_score)
print(classification_report(y_val, y_pred))#, output_dict=True, digits=3))
print("Confusion Matrix: \n", confusion_matrix(y_val, y_pred))
"""


# test and evaluation
test_y_pred = grid_search_adaboost.predict(X_test_norm)

reportDict = classification_report(y_test, test_y_pred, output_dict=True)
f1_score = reportDict['weighted avg']['f1-score']
print("TEST F1-score:", f1_score)
print(classification_report(y_test, test_y_pred))#, output_dict=True, digits=3))
print("Confusion Matrix: \n", confusion_matrix(y_test, test_y_pred))

The best estimator is {'base_estimator': LogisticRegression(C=0.1, solver='liblinear'), 'learning_rate': 1, 'n_estimators': 150, 'random_state': 42}
TEST F1-score: 0.6205871069416729
              precision    recall  f1-score   support

           0       0.60      0.53      0.56        94
           1       0.64      0.70      0.67       113

    accuracy                           0.62       207
   macro avg       0.62      0.62      0.62       207
weighted avg       0.62      0.62      0.62       207



### Stacking

In [10]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

base_estimators = [
    ('svc', SVC(degree=10, kernel="poly")),
    ('dt', DecisionTreeClassifier()),
    ('lr', LogisticRegression(C=0.1, penalty="l2", solver="liblinear"))
]

stacking_clf = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())

param_grid = {
    'dt__max_depth': [1, 2, 3],
    'final_estimator__C': [0.1, 1, 10]
}

grid_search_stacking = GridSearchCV(stacking_clf, param_grid, cv=5, scoring='f1')
grid_search_stacking.fit(X_train_norm, y_train)


"""
meta_model = LogisticRegression()
stacking = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stacking.fit(X_train_norm, y_train)
y_pred = stacking.predict(X_val_norm)

f1_scores = cross_val_score(model, X_val_norm, y_val, cv=5, scoring='f1')
print('CV F1 score: %.3f (%.3f)' % (mean(f1_scores), std(f1_scores)))
reportDict = classification_report(y_val, y_pred, output_dict=True)
f1_score = reportDict['weighted avg']['f1-score']
print("F1-score:", f1_score)
print(classification_report(y_val, y_pred))#, output_dict=True, digits=3))
print("Confusion Matrix: \n", confusion_matrix(y_val, y_pred))
"""

# Get the best parameters and the best Stacking model
best_params_stacking = grid_search_stacking.best_params_
#best_stacking = grid_search_stacking.best_estimator_

print(f'The best estimator is {best_params_stacking}')

# Make predictions using the best model
test_y_pred = grid_search_stacking.predict(X_test_norm)

reportDict = classification_report(y_test, test_y_pred, output_dict=True)
f1_score = reportDict['weighted avg']['f1-score']
print("TEST F1-score:", f1_score)
print(classification_report(y_test, test_y_pred))#, output_dict=True, digits=3))
print("Confusion Matrix: \n", confusion_matrix(y_test, test_y_pred))

The best estimator is {'dt__max_depth': 3, 'final_estimator__C': 10}
TEST F1-score: 0.6497727141000058
              precision    recall  f1-score   support

           0       0.63      0.56      0.60        94
           1       0.67      0.73      0.69       113

    accuracy                           0.65       207
   macro avg       0.65      0.64      0.65       207
weighted avg       0.65      0.65      0.65       207

