In [1]:
# pip install -U sentence-transformers

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import joblib
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Read the training split and the development split from CSV, in that order
data, validation_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'dev.csv')
)

# Instantiate the SBERT sentence encoder
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
def generate_embeddings(dataframe, model):
    """Encode premise/hypothesis pairs into a single feature matrix.

    The ``premise`` and ``hypothesis`` columns are each passed to
    ``model.encode`` separately, and the two results are joined column-wise,
    so row ``i`` holds the premise embedding followed by the hypothesis
    embedding of row ``i``.

    Missing cells (``None``/``NaN``) are replaced with an empty string and
    every value is coerced to ``str`` before encoding, so a handful of blank
    cells in the input does not abort the whole run.

    Args:
        dataframe: pandas DataFrame with ``premise`` and ``hypothesis`` columns.
        model: object exposing ``encode(list_of_str, show_progress_bar=...)``
            that returns a 2-D array with one row per input text.

    Returns:
        The two encodings concatenated along axis 1.

    Raises:
        KeyError: if either column is absent from ``dataframe``.
    """
    def _texts(column):
        # Empty CSV cells are read by pandas as float NaN, which is not a
        # valid text input for the encoder; normalise them to ''.
        return dataframe[column].fillna('').astype(str).tolist()

    # Encode the two sides of each pair independently
    premise_embeddings = model.encode(_texts('premise'), show_progress_bar=True)
    hypothesis_embeddings = model.encode(_texts('hypothesis'), show_progress_bar=True)

    # Side-by-side concatenation: [premise | hypothesis] per row
    return np.concatenate((premise_embeddings, hypothesis_embeddings), axis=1)

In [4]:
# Encode both splits into feature matrices: training first, then validation
train_embeddings, validation_embeddings = (
    generate_embeddings(split, model) for split in (data, validation_data)
)

# Extract the `label` column values of each split as the targets
train_labels, validation_labels = (
    split['label'].values for split in (data, validation_data)
)

Batches:   0%|          | 0/842 [00:00<?, ?it/s]

Batches:   0%|          | 0/842 [00:00<?, ?it/s]

Batches:   0%|          | 0/211 [00:00<?, ?it/s]

Batches:   0%|          | 0/211 [00:00<?, ?it/s]

In [5]:
# Define the base learners (the first-level models of the stack).
# Every stochastic estimator is seeded with the same value so that a re-run
# reproduces the same fitted ensemble.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, verbose=1)),
    # probability=True makes SVC fit an internal cross-validated probability
    # calibration whose data shuffling is random; random_state pins it so the
    # SVM's probability outputs are reproducible like the other learners.
    ('svm', make_pipeline(StandardScaler(), SVC(kernel='rbf', probability=True, random_state=42, verbose=True))),
    # Features are standardised before the scale-sensitive SVM and logistic regression.
    ('logreg', make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, solver='saga', random_state=42, verbose=1)))
]

In [6]:
# Build the stacked ensemble: the base learners feed a gradient-boosting meta-learner.
# Alternative meta-learner kept for reference (disabled):
#   LogisticRegression(penalty='l2', max_iter=500, solver='saga', random_state=42, verbose=1)
final_estimator = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
    verbose=1,
)
stacking_classifier = StackingClassifier(
    estimators=base_learners,
    final_estimator=final_estimator,
    n_jobs=-1,
)

In [None]:
# Fit the stacked ensemble on the training embeddings and their labels
stacking_classifier.fit(X=train_embeddings, y=train_labels)

In [None]:
# Serialize the stacking classifier to disk with joblib
model_filename = 'ensembleNLI10.joblib'
joblib.dump(value=stacking_classifier, filename=model_filename)

In [13]:
# Predict labels for the validation embeddings with the stacking classifier
validation_predictions = stacking_classifier.predict(validation_embeddings)

# Score the predictions against the validation labels: overall accuracy
# first, then the per-class precision/recall/F1 report
accuracy, report = (
    metric(validation_labels, validation_predictions)
    for metric in (accuracy_score, classification_report)
)
print(f'Validation Accuracy: {accuracy}', report, sep='\n')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


Validation Accuracy: 0.6936321804957696
              precision    recall  f1-score   support

           0       0.71      0.63      0.66      3259
           1       0.68      0.76      0.72      3478

    accuracy                           0.69      6737
   macro avg       0.70      0.69      0.69      6737
weighted avg       0.69      0.69      0.69      6737



In [14]:
# Write the validation predictions to a one-column CSV, without the index
predictions_df = pd.DataFrame({'prediction': validation_predictions})
predictions_df.to_csv('validation_predictions.csv', index=False)