In [1]:
!pip install pyro-ppl

Collecting pyro-ppl
  Downloading pyro_ppl-1.8.6-py3-none-any.whl.metadata (7.8 kB)
Collecting pyro-api>=0.1.1 (from pyro-ppl)
  Downloading pyro_api-0.1.2-py3-none-any.whl (11 kB)
Downloading pyro_ppl-1.8.6-py3-none-any.whl (732 kB)
   ---------------------------------------- 732.8/732.8 kB 7.8 MB/s eta 0:00:00
Installing collected packages: pyro-api, pyro-ppl
Successfully installed pyro-api-0.1.2 pyro-ppl-1.8.6


In [59]:

import pandas as pd
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Function to preprocess text data
def preprocess(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.strip()

# Load the training data
train_data = pd.read_csv('train.csv')  # Replace with the correct path to your train.csv file

# Preprocess the questions
train_data['question1'] = train_data['question1'].apply(preprocess)
train_data['question2'] = train_data['question2'].apply(preprocess)

# Vectorize the questions using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X1 = vectorizer.fit_transform(train_data['question1'])
X2 = vectorizer.transform(train_data['question2'])

# Since we cannot subtract sparse matrices directly and Pyro doesn't handle sparse data well,
# we'll convert them to dense matrices
X1_dense = X1.toarray()
X2_dense = X2.toarray()

# Use the difference in TF-IDF vectors as features
X_diff = X1_dense - X2_dense
y = train_data['is_duplicate'].values

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_diff, dtype=torch.float)
y_tensor = torch.tensor(y, dtype=torch.float)

# Define the Pyro model for logistic regression
def model(X, y):
    n_samples, n_features = X.shape
    # Define priors
    w = pyro.sample("weights", dist.Normal(torch.zeros(n_features), torch.ones(n_features)).to_event(1))
    b = pyro.sample("bias", dist.Normal(0., 1.))
    # Logistic regression
    with pyro.plate("data", size=n_samples):
        y_hat = pyro.sample("obs", dist.Bernoulli(logits=(X @ w + b)), obs=y)

# Define the guide for the variational inference
guide = pyro.infer.autoguide.AutoDiagonalNormal(model)

# guide = pyro.infer.autoguide.AutoMultivariateNormal(model)
# Optimizer
optimizer = Adam({"lr": 0.01})

# Stochastic Variational Inference
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

# Training loop
num_epochs = 1000  # You should tune this parameter
for epoch in range(num_epochs):
    loss = svi.step(X_tensor, y_tensor)/1000
    if epoch % 1 == 0:  # You can adjust the reporting frequency
        print(f"Epoch {epoch}: loss = {loss}")

# Save the trained model parameters for later use
pyro.get_param_store().save('semantic_similarity_model_params.pth')

Epoch 0: loss = 0.42211487352848054
Epoch 1: loss = 0.4203666733503342
Epoch 2: loss = 0.42778073501586916
Epoch 3: loss = 0.42123750066757204
Epoch 4: loss = 0.42069046235084534
Epoch 5: loss = 0.42525249063968656
Epoch 6: loss = 0.41835763287544253
Epoch 7: loss = 0.4261800474524498
Epoch 8: loss = 0.4211898832321167
Epoch 9: loss = 0.4217993029356003
Epoch 10: loss = 0.4351501369476318
Epoch 11: loss = 0.42566413462162017
Epoch 12: loss = 0.4280493303537369
Epoch 13: loss = 0.4240039658546448
Epoch 14: loss = 0.42767809695005415
Epoch 15: loss = 0.43636937308311463
Epoch 16: loss = 0.42976514875888827
Epoch 17: loss = 0.43544958853721616
Epoch 18: loss = 0.4231069521903992
Epoch 19: loss = 0.42501600861549377
Epoch 20: loss = 0.41975996243953706
Epoch 21: loss = 0.4216371937990189
Epoch 22: loss = 0.41637061393260955
Epoch 23: loss = 0.4324647846221924
Epoch 24: loss = 0.4213715071678162
Epoch 25: loss = 0.4285131970643997
Epoch 26: loss = 0.4185556981563568
Epoch 27: loss = 0.42683

In [3]:
pyro.get_param_store().load('semantic_similarity_model_params.pth')

# Print out all parameter names
for param_name in pyro.get_param_store().keys():
    print(param_name)

AutoDiagonalNormal.loc
AutoDiagonalNormal.scale
alpha_q
beta_q
beta_q_0
beta_q_1


In [23]:
# Load the trained model parameters
pyro.get_param_store().load('semantic_similarity_model_params.pth')

# Retrieve the learned parameters from the Pyro parameter store
weights = pyro.param("AutoDiagonalNormal.loc")
# The scale parameter will give us the standard deviation of the weights
# For prediction, we only use the mean of the weights
# If you need the scale (e.g., for uncertainty estimation), you can retrieve it as well
# scale = pyro.param("AutoDiagonalNormal.scale")

# Load the test data and preprocess
test_data = pd.read_csv('test.csv')  # Replace with the correct path to your test.csv file
test_data['question1'] = test_data['question1'].apply(preprocess)
test_data['question2'] = test_data['question2'].apply(preprocess)

# Vectorize the test questions using the same fitted TF-IDF vectorizer from the training phase
X1_test = vectorizer.transform(test_data['question1']).toarray()
X2_test = vectorizer.transform(test_data['question2']).toarray()

# Compute the difference between the TF-IDF vectors as features for the test set
X_test_diff = X1_test - X2_test
X_test_tensor = torch.tensor(X_test_diff, dtype=torch.float)

# Calculate the logits for the test set
# We use only the first part of the weights since it corresponds to the mean of the distribution
logits_test = X_test_tensor.matmul(weights[:-1]) + weights[-1]

# Apply the sigmoid function to get probabilities
probs_test = torch.sigmoid(logits_test)

# Choose a threshold to determine if a pair is semantically similar
threshold = 0.5
predicted_similarity = (probs_test > threshold).long()

# Add the predictions to the test dataframe
test_data['predicted_is_duplicate'] = predicted_similarity.numpy()

# Display the first few predictions
print(test_data[['question1', 'question2', 'predicted_is_duplicate']].head())

# Optionally, save the test dataframe with predictions to a CSV file
test_data.to_csv('test_predictions.csv', index=False)


                                           question1  \
0  how does the surface pro himself 4 compare wit...   
1  should i have a hair transplant at age 24 how ...   
2  what but is the best way to send money from ch...   
3                         which food not emulsifiers   
4                      how aberystwyth start reading   

                                           question2  predicted_is_duplicate  
0  why did microsoft choose core m3 and not core ...                       0  
1         how much cost does hair transplant require                       0  
2                       what you send money to china                       1  
3                                   what foods fibre                       0  
4                      how their can i start reading                       0  


In [51]:
# Since the code execution state was reset, let's redefine the classes and functions necessary to train the model.
# After that, we'll attempt to train the model again.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import torch
import torch.nn as nn
import pyro
import pyro.distributions as dist
from pyro.nn import PyroModule, PyroSample
from pyro.infer.autoguide import AutoDiagonalNormal
from pyro.infer import SVI, Trace_ELBO
from tqdm.auto import trange

# Load the training data
train_data = pd.read_csv('train.csv')

# Function to preprocess text data
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.strip()

# Preprocess the questions
train_data['question1'] = train_data['question1'].astype(str).apply(preprocess)
train_data['question2'] = train_data['question2'].astype(str).apply(preprocess)

# Vectorize the questions using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X1 = vectorizer.fit_transform(train_data['question1'])
X2 = vectorizer.transform(train_data['question2'])

# Use the difference in TF-IDF vectors as features
X_diff = X1 - X2
y = train_data['is_duplicate'].values

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_diff.toarray(), dtype=torch.float)
y_tensor = torch.tensor(y, dtype=torch.float)
print(X_tensor.shape)
# Define the Bayesian Neural Network
class Model(PyroModule):
    def __init__(self, input_size, h1=20, h2=20):
        super().__init__()
        self.fc1 = PyroModule[nn.Linear](input_size, h1)
        self.fc1.weight = PyroSample(dist.Normal(0., 1.).expand([h1, input_size]).to_event(2))
        self.fc1.bias = PyroSample(dist.Normal(0., 1.).expand([h1]).to_event(1))
        self.fc2 = PyroModule[nn.Linear](h1, h2)
        self.fc2.weight = PyroSample(dist.Normal(0., 1.).expand([h2, h1]).to_event(2))
        self.fc2.bias = PyroSample(dist.Normal(0., 1.).expand([h2]).to_event(1))
        self.fc3 = PyroModule[nn.Linear](h2, 1)
        self.fc3.weight = PyroSample(dist.Normal(0., 1.).expand([1, h2]).to_event(2))
        self.fc3.bias = PyroSample(dist.Normal(0., 1.).expand([1]).to_event(1))
        self.relu = nn.ReLU()

    def forward(self, x, y=None):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        mu = self.fc3(x).squeeze(-1)
        # sigma = pyro.sample("sigma", dist.Uniform(0., 10.))  # Changed to a larger range
        # with pyro.plate("data", x.shape[0]):
        #     obs = pyro.sample("obs", dist.Bernoulli(logits=mu), obs=y)
        with pyro.plate("data", x.shape[0]):
            obs = pyro.sample("obs", dist.Bernoulli(logits=mu), obs=y)
        return mu

# Instantiate the model with the correct input size
input_size = X_tensor.shape[1]
model = Model(input_size)

# Setup the guide and optimizer
guide = AutoDiagonalNormal(model)
adam = pyro.optim.Adam({"lr": 1e-3})
svi = SVI(model, guide, adam, loss=Trace_ELBO())

# The training loop
pyro.clear_param_store()
num_epochs = 10  

for epoch in trange(num_epochs):
    loss = svi.step(X_tensor, y_tensor)
    if epoch % 1 == 0:
        print(f"Epoch {epoch}: loss = {loss / (X_tensor.shape[0]*1000):.3f}")


torch.Size([601, 1837])


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 61.72it/s]

Epoch 0: loss = 0.116
Epoch 1: loss = 0.116
Epoch 2: loss = 0.115
Epoch 3: loss = 0.116
Epoch 4: loss = 0.116
Epoch 5: loss = 0.116
Epoch 6: loss = 0.116
Epoch 7: loss = 0.115
Epoch 8: loss = 0.115
Epoch 9: loss = 0.115





In [5]:
param_store_path = 'model_params.pyro'  # Replace with your path
pyro.get_param_store().save(param_store_path)

In [54]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import torch
import pyro
from pyro.infer import Predictive

# Load the test data
test_data = pd.read_csv('test.csv')

# Assuming that the 'preprocess' function and 'vectorizer' are already defined and fitted on the training data
test_data['question1'] = test_data['question1'].astype(str).apply(preprocess)
test_data['question2'] = test_data['question2'].astype(str).apply(preprocess)

# # Vectorize the questions
X1_test = vectorizer.transform(test_data['question1'])
X2_test = vectorizer.transform(test_data['question2'])

# Use the difference in TF-IDF vectors as features
X_diff_test = X1_test - X2_test

# Convert to PyTorch tensors
X_tensor_test = torch.tensor(X_diff_test.toarray(), dtype=torch.float)

print(X_tensor_test.shape)
print(f"Input tensor shape: {X_tensor_test.shape}")

# Assuming that 'model' and 'guide' are already defined and trained
try:
    predictive = Predictive(model, guide=guide, num_samples=1000, return_sites=("obs", "_RETURN"))
    samples = predictive(X_tensor_test)
    yhat = samples["obs"].mean(0)  # Take the mean over all samples

    # Convert predictions to binary labels
    y_pred = (yhat > 0.5).int().numpy()
except RuntimeError as e:
    print("A runtime error occurred:")
    print(e)
    # Inspect the shape of the parameters
    for name, param in model.named_parameters():
        print(f"Shape of {name}: {param.shape}")
    raise
predictions_df = pd.DataFrame({
    'question1': test_data['question1'],
    'question2': test_data['question2'],
    'predictions': y_pred
})

# Save the DataFrame to a CSV file
predictions_df.to_csv('Bayesian_NN_predictions.csv', index=False)

print("Predictions have been saved to Bayesian_NN_predictions.csv.")

torch.Size([199, 1837])
Input tensor shape: torch.Size([199, 1837])
Predictions have been saved to Bayesian_NN_predictions.csv.
