In [68]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import re
import tqdm
import pickle

In [None]:
# Load dataset
data = pd.read_csv("train.csv")  # Replace with the dataset path

# Keep only the two question texts and the label, then drop every row that has
# a missing value. Chaining returns a fresh frame instead of mutating a
# column-subset in place, so re-running this cell is safe and warning-free.
data = data[['question1', 'question2', 'is_duplicate']].dropna()

# Check the dataset
print(data.head())

                                           question1  \
0  What is the step by step guide to invest in sh...   
1  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2  How can I increase the speed of my internet co...   
3  Why am I mentally very lonely? How can I solve...   
4  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit on both question columns together so they share one vocabulary and one
# set of IDF weights
questions = data['question1'].tolist() + data['question2'].tolist()
vectorizer.fit(questions)

# Compute TF-IDF vectors for question1 and question2
q1_vectors = vectorizer.transform(data['question1'])
q2_vectors = vectorizer.transform(data['question2'])


def _rowwise_cosine(left, right):
    """Cosine similarity between matching rows of two same-shaped sparse matrices.

    Row i of the result is cos(left[i], right[i]). Everything is computed with
    element-wise sparse products, so no Python-level loop over rows is needed.

    Args:
        left, right: sparse matrices with identical shape.

    Returns:
        1-D float array with one similarity per row. A row where either vector
        is all zeros scores 0.0 instead of dividing by zero.
    """
    def _row_sums(matrix):
        # sum(axis=1) may hand back an (n, 1) matrix; flatten to a plain 1-D array
        return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

    dots = _row_sums(left.multiply(right))
    norms = np.sqrt(_row_sums(left.multiply(left)) * _row_sums(right.multiply(right)))
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)


# Cosine similarity between question pairs (one float per pair, kept as a list)
cosine_sim = _rowwise_cosine(q1_vectors, q2_vectors).tolist()

# Jaccard similarity
def jaccard_similarity(q1, q2):
    """Jaccard index of the whitespace-separated word sets of two strings.

    Words are compared exactly as written (case-sensitive, punctuation kept).
    Returns shared / total distinct words, or 0 when neither string has any word.
    """
    words_a, words_b = set(q1.split()), set(q2.split())
    all_words = words_a | words_b
    if not all_words:
        return 0
    return len(words_a & words_b) / len(all_words)

# One Jaccard score per (question1, question2) row, in row order
jaccard_sim = list(map(jaccard_similarity, data['question1'], data['question2']))

# Word overlap is computed in the feature-extraction loop further down


In [None]:
# Punctuation character class wrapped in a capturing group: when used with
# re.split, the matched punctuation marks are kept in the result as their own
# items instead of being discarded.
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
# Weight tuples passed as `weights=` to sentence_bleu in the feature loop.
# Unigram weight only:
UNI_BLEU_WEIGHTS = (1, 0, 0, 0)
# Bigram weight only:
BI_BLEU_WEIGHTS = (0, 1, 0, 0)
# Unigram and bigram weighted equally:
BLEU2_WEIGHTS = (0.5, 0.5, 0, 0)

def tokenizer(sentence):
    """Split a sentence on whitespace, then split each fragment around punctuation.

    Punctuation marks matched by _WORD_SPLIT come back as separate tokens;
    empty strings produced by the split are dropped.
    """
    pieces = (
        piece
        for fragment in sentence.strip().split()
        for piece in _WORD_SPLIT.split(fragment)
    )
    return [piece for piece in pieces if piece]



def char_ngram_tokenizer(sentence, n):
    """Return every length-n substring of `sentence`, in left-to-right order.

    Consecutive n-grams overlap (a sliding window with step 1). A sentence
    shorter than n yields an empty list.
    """
    ngrams = []
    last_start = len(sentence) - n
    for start in range(last_start + 1):
        ngrams.append(sentence[start:start + n])
    return ngrams

In [97]:
# Per-pair feature accumulators; each list receives one entry per row of `data`
overlap = []
q1_word_count = []
q2_word_count = []
word_count_diff = []
uni_BLEU = []
bi_BLEU = []
BLEU2 = []
char_bigram_overlap = []
char_trigram_overlap = []
char_4gram_overlap = []


def _shared_char_ngrams(a, b, n):
    """Count the distinct character n-grams of length n that occur in both a and b."""
    return len(set(char_ngram_tokenizer(a, n)).intersection(char_ngram_tokenizer(b, n)))


# Built once outside the loop: the smoothing setup takes no per-pair input
s_function = SmoothingFunction()

# zip() has no length, so total= is passed explicitly to give tqdm a real
# progress bar with an ETA instead of a bare iteration counter
for q1, q2 in tqdm.tqdm(zip(data['question1'], data['question2']), total=len(data)):
    t1 = tokenizer(q1)
    t2 = tokenizer(q2)

    # Token counts (punctuation tokens included) and their absolute difference
    q1_word_count.append(len(t1))
    q2_word_count.append(len(t2))
    word_count_diff.append(abs(len(t1) - len(t2)))

    # Shared character n-grams on the raw (case-sensitive) strings
    char_bigram_overlap.append(_shared_char_ngrams(q1, q2, 2))
    char_trigram_overlap.append(_shared_char_ngrams(q1, q2, 3))
    char_4gram_overlap.append(_shared_char_ngrams(q1, q2, 4))

    # Shared whitespace-separated words, compared case-insensitively
    overlap.append(len(set(q1.lower().split()).intersection(q2.lower().split())))

    # BLEU of q1 scored against q2 as the single reference, under three
    # different n-gram weightings
    for scores, weights in ((uni_BLEU, UNI_BLEU_WEIGHTS),
                            (bi_BLEU, BI_BLEU_WEIGHTS),
                            (BLEU2, BLEU2_WEIGHTS)):
        scores.append(sentence_bleu([t2],
                                    t1,
                                    weights=weights,
                                    smoothing_function=s_function.method2))

404287it [04:00, 1679.09it/s]


In [102]:

# Assemble the per-pair feature lists into one frame; the pair order below is
# the column order of the resulting table
features = pd.DataFrame(dict([
    ('cosine_similarity', cosine_sim),
    ('jaccard_similarity', jaccard_sim),
    ('word_overlap', overlap),
    ('q1_word_count', q1_word_count),
    ('q2_word_count', q2_word_count),
    ('word_count_diff', word_count_diff),
    ('char_bigram_overlap', char_bigram_overlap),
    ('char_trigram_overlap', char_trigram_overlap),
    ('char_4gram_overlap', char_4gram_overlap),
    ('uni_BLEU', uni_BLEU),
    ('bi_BLEU', bi_BLEU),
    ('BLEU2', BLEU2),
]))

# Target vector as a plain array, in the same row order as the feature lists
labels = data['is_duplicate'].values

In [103]:
# Hold out 10% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=42,
)

print(f"Train set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Train set size: (363858, 12)
Test set size: (40429, 12)


In [104]:
# Initialize Random Forest Classifier.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore the reported metrics, are reproducible across runs.
rf_model = RandomForestClassifier(n_estimators=500, criterion='entropy',
                                   max_depth=10, min_samples_leaf=1,
                                   max_features=0.4, n_jobs=3,
                                   random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = rf_model.predict(X_test)

In [105]:
# Evaluate performance: every metric takes (true labels, predicted labels),
# so compute all four in one pass over the scoring functions
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.728165425808207
Precision: 0.607428327826921
Recall: 0.7588203967514312
F1-Score: 0.6747365928732094


In [18]:
# Write both TF-IDF matrices into a single archive (np.savez appends ".npz").
# NOTE(review): q1_vectors/q2_vectors come from vectorizer.transform(); if they
# are scipy sparse matrices, np.savez presumably stores them as pickled object
# arrays (which would explain allow_pickle=True in the load cell) —
# scipy.sparse.save_npz may be a better fit; confirm before relying on this file.
np.savez(
    "q_vectors",
    q1_vectors=q1_vectors,
    q2_vectors=q2_vectors,
)

In [None]:
# Re-open the q_vectors.npz archive. allow_pickle=True lets NumPy unpickle
# object arrays, which can run arbitrary code — only use it on files you
# produced yourself.
loaded = np.load('q_vectors.npz', allow_pickle=True)

# Show both stored entries, looked up by the names they were saved under
print(loaded['q1_vectors'], loaded['q2_vectors'], sep='\n')


In [100]:
# Serialize the rf_model classifier to disk. The file is opened in binary
# write mode and the `with` block closes it once the dump finishes or raises.
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
# Reached only when the dump above completed without raising
print("Model saved to random_forest_model.pkl")

KeyboardInterrupt: 

In [101]:
# Persist the feature table as comma-separated text with a header row;
# the row index is left out of the file
features.to_csv("features12.csv", index=False)
