In [None]:
!nvidia-smi

Sun Nov 24 13:47:51 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 556.13                 Driver Version: 556.13         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   37C    P8              6W /   30W |       0MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import joblib

In [3]:
nltk.download('punkt')
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print("Loading dataset......") #displaying dataset
data = pd.read_csv('questions.csv')
data = data.dropna(subset=["question1", "question2"])
print(data.head())
print(f"Total samples in dataset: {len(data)}")

Loading dataset......
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  
Total samples in dataset: 404348


In [5]:
#preprocessing the text
def preprocess_text(text):
    if pd.isnull(text):
        return ""
    elif isinstance(text, float):
        if text.is_integer():
          return str(int(text))
        else:
          return ""
    else:
      tokens = nltk.word_tokenize(text.lower())
      stop_words = set(stopwords.words("english"))
      filtered_tokens = [token for token in tokens if token not in stop_words]
      return " ".join(filtered_tokens)
print("Preprocessing Text......")
data["question1"] = data["question1"].apply(preprocess_text)
data["question2"] = data["question2"].apply(preprocess_text)

Preprocessing Text......


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from tensorflow.keras.callbacks import EarlyStopping

#vectorizing text
vectorizer = TfidfVectorizer(max_features = 10000)
question1_vectors = vectorizer.fit_transform(data["question1"])
question2_vectors = vectorizer.transform(data["question2"])

# combining question 1 and question 2 vectors
x = hstack((question1_vectors, question2_vectors))

y = data["is_duplicate"].values


# train-test splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)


In [7]:
print(f"Vectorized Sentence1: {question1_vectors.shape}")
print(f"Vectorized Sentence2: {question2_vectors.shape}")

Vectorized Sentence1: (404348, 10000)
Vectorized Sentence2: (404348, 10000)


In [8]:
print(x.shape)
print(y.shape)

(404348, 20000)
(404348,)


In [22]:
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam

# Building a model(feed-forward neural network)// later try using transformer varient
print("Building Model......")
model = Sequential([
    Dense(512, activation = "relu", input_shape = (x_train.shape[1],)),
    Dropout(0.5),
    BatchNormalization(),
    Dense(256, activation = "relu"),
    Dropout(0.5),
    BatchNormalization(),
    Dense(128, activation = "relu"),
    Dropout(0.5),
    BatchNormalization(),
    Dense(64, activation = "relu"),
    Dense(32, activation = "relu"),
    Dense(1, activation = "sigmoid")
])
model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# training a model
print("Training Model......")
model.fit(x_train, y_train, batch_size=256, epochs=50, validation_data=(x_test, y_test), callbacks=[early_stopping])

Building Model......
Training Model......
Epoch 1/50
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m487s[0m 379ms/step - accuracy: 0.7107 - loss: 0.5678 - val_accuracy: 0.7902 - val_loss: 0.4585
Epoch 2/50
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m472s[0m 373ms/step - accuracy: 0.8178 - loss: 0.3969 - val_accuracy: 0.8051 - val_loss: 0.4279
Epoch 3/50
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m423s[0m 335ms/step - accuracy: 0.8652 - loss: 0.3020 - val_accuracy: 0.8130 - val_loss: 0.4479
Epoch 4/50
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 328ms/step - accuracy: 0.8956 - loss: 0.2449 - val_accuracy: 0.8174 - val_loss: 0.4791
Epoch 5/50
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m417s[0m 330ms/step - accuracy: 0.9125 - loss: 0.2112 - val_accuracy: 0.8181 - val_loss: 0.4808
Epoch 6/50
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m416s[0m 329ms/step - accuracy: 0.9245 - loss:

<keras.src.callbacks.history.History at 0x1d7084fcda0>

In [25]:
print({accuracy})

{0.8050698652157784}


In [37]:
# Evaluating the model
y_pred = (model.predict(x_test) > 0.5).astype("int").flatten()
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}.2f")

# Saving trained model
model.save("paraphrase_model.h5")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")# saving the trained TF-IDF vectorizer

print("Model saved successfully as paraphrase_model.h5")

[1m2528/2528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 11ms/step




Accuracy: 0.8050698652157784.2f
Model saved successfully as paraphrase_model.h5


In [34]:
for idx, (actual, predicted) in enumerate(zip(y_test[:10], y_pred[:10])):  # Display first 10 samples
    question1 = data.iloc[x_test.indices[idx]]["question1"]
    question2 = data.iloc[x_test.indices[idx]]["question2"]

    print(f"Input Question1: {question1}")
    print(f"Input Question2: {question2}")
    print(f"Actual: {actual}, Predicted: {predicted}")

Input Question1: government regulate internet based services ?
Input Question2: internet governance us-centric ?
Actual: 0, Predicted: 0
Input Question1: long take get offer ?
Input Question2: long take get offer netflix ?
Actual: 1, Predicted: 1
Input Question1: happens fall love professor ?
Input Question2: professor , happens fall love student ?
Actual: 0, Predicted: 0
Input Question1: come family friends ?
Input Question2: come friends ?
Actual: 1, Predicted: 1
Input Question1: tree data structure ?
Input Question2: b * tree data structure ?
Actual: 0, Predicted: 0
Input Question1: illegal report crime ?
Input Question2: legal report crime ?
Actual: 0, Predicted: 0
Input Question1: happen india pakistan gets war win ?
Input Question2: war occurs india pakistan effect rest country ?
Actual: 1, Predicted: 0
Input Question1: good solar panel installation provider templeton , california ca ?
Input Question2: good solar panel installation provider terra bella , california ca ?
Actual: 1