In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('Quora_QP_Train.csv')


In [4]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


## Doing preprocessing on the dataset of quora

In [5]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q

In [6]:
preprocess("I've already! wasn't <b>done</b>?")


'i have already  was not done'

In [7]:
df['question1'] = df['question1'].apply(preprocess)
df['question2'] = df['question2'].apply(preprocess)

## Basic Feature engineering

In [8]:
## feature to get the length of question
df['q1_len'] = df['question1'].str.len() 
df['q2_len'] = df['question2'].str.len()

In [9]:
## feature getting the number of words from each question
df['q1_num_words'] = df['question1'].apply(lambda row: len(row.split(" ")))
df['q2_num_words'] = df['question2'].apply(lambda row: len(row.split(" ")))

In [10]:
## function to get the number of common words that are unique in both q1 and q2
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return len(w1 & w2)

In [11]:
## applying above function
df['word_common'] = df.apply(common_words, axis=1)


In [12]:
## function to get the total no. of unique words combine from q1 and q2
def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return (len(w1) + len(w2))


In [13]:
## applying above function 
df['word_total'] = df.apply(total_words, axis=1)


In [14]:
df['word_share'] = round(df['word_common']/df['word_total'],2)


## Performing advance feature engineering( token, length-based,fuzzy)

In [15]:
from nltk.corpus import stopwords

def fetch_token_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    
    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))
    
    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features

In [16]:
## Applying the above function
token_features = df.apply(fetch_token_features, axis=1)

df["cwc_min"]       = list(map(lambda x: x[0], token_features))
df["cwc_max"]       = list(map(lambda x: x[1], token_features))
df["csc_min"]       = list(map(lambda x: x[2], token_features))
df["csc_max"]       = list(map(lambda x: x[3], token_features))
df["ctc_min"]       = list(map(lambda x: x[4], token_features))
df["ctc_max"]       = list(map(lambda x: x[5], token_features))
df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
df["first_word_eq"] = list(map(lambda x: x[7], token_features))

In [17]:
from rapidfuzz import fuzz

def fetch_fuzzy_features(row):
    q1 = str(row['question1'])
    q2 = str(row['question2'])

    return [
        fuzz.QRatio(q1, q2),
        fuzz.partial_ratio(q1, q2),
        fuzz.token_sort_ratio(q1, q2),
        fuzz.token_set_ratio(q1, q2)
    ]


In [18]:
from tqdm import tqdm
tqdm.pandas()

fuzzy_scores = df.progress_apply(fetch_fuzzy_features, axis=1, result_type='expand')
fuzzy_scores.columns = ['fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio', 'token_set_ratio']

df = pd.concat([df, fuzzy_scores], axis=1)

100%|██████████| 404290/404290 [00:46<00:00, 8635.32it/s] 


In [19]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,65,56,14,12,...,0.999983,0.999983,0.916659,0.785709,0.0,1.0,92.561983,100.0,92.561983,100.0
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,50,87,12,17,...,0.749981,0.599988,0.699993,0.466664,0.0,1.0,65.693431,77.083333,63.157895,85.714286
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,72,58,14,10,...,0.399992,0.249997,0.399996,0.285712,0.0,1.0,53.846154,56.565657,66.153846,66.153846
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0,49,58,12,16,...,0.0,0.0,0.0,0.0,0.0,0.0,35.514019,38.77551,37.254902,36.170213
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,75,38,15,7,...,0.99995,0.666644,0.57142,0.30769,0.0,1.0,46.017699,64.615385,46.846847,66.666667


## Performing basic EDA on the data to get the insights about new feature and their mutual relations

In [20]:
## making seperate df for questions
ques_df = df[['question1','question2']]
ques_df

Unnamed: 0,question1,question2
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...
1,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...
2,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...
3,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...
4,which one dissolve in water quikly sugar salt...,which fish would survive in salt water
...,...,...
404285,how many keywords are there in the racket prog...,how many keywords are there in perl programmin...
404286,do you believe there is life after death,is it true that there is life after death
404287,what is one coin,what is this coin
404288,what is the approx annual cost of living while...,i am having little hairfall problem but i want...


In [21]:
## droping irrelevant columns from the df
final_df = df.drop(columns=['id','qid1','qid2','question1','question2'])
print(final_df.shape)
final_df.head()

(404290, 20)


Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,0,65,56,14,12,11,23,0.48,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,92.561983,100.0,92.561983,100.0
1,0,50,87,12,17,8,26,0.31,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,65.693431,77.083333,63.157895,85.714286
2,0,72,58,14,10,4,24,0.17,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,53.846154,56.565657,66.153846,66.153846
3,0,49,58,12,16,1,22,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.514019,38.77551,37.254902,36.170213
4,0,75,38,15,7,4,21,0.19,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,46.017699,64.615385,46.846847,66.666667


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

# Reset + clean
ques_df = ques_df.dropna(subset=['question1', 'question2']).reset_index(drop=True)

# Combine question1 and question2
questions = list(ques_df['question1']) + list(ques_df['question2'])

# Fit vectorizer
cv = CountVectorizer(max_features=3000)
X = cv.fit_transform(questions)  # sparse (800,000, 3000)

# Split back
half = X.shape[0] // 2
q1_sparse = X[:half]
q2_sparse = X[half:]

# Combine sparse horizontally
bow_sparse = hstack([q1_sparse, q2_sparse])  # (N, 6000)

# Convert sparse to DataFrame safely (in chunks if needed)
bow_df = pd.DataFrame.sparse.from_spmatrix(bow_sparse)

In [23]:
bow_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
## creating final_df from existing df an dbowdf on which we can apply our model
final_df = pd.concat([final_df.reset_index(drop=True), bow_df], axis=1)

In [62]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Columns: 6020 entries, is_duplicate to 5999
dtypes: Sparse[int64, 0](6000), float64(13), int64(7)
memory usage: 143.5 MB


In [26]:
## splitting into train_test
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, hstack
# 1. Separate target
y = final_df.iloc[:, 0].values  # target (first column)
# 2. Separate other features (if any, excluding the target column and BoW)
# If you had features like fuzzy scores, extract them here
dense_features = final_df.iloc[:, 1:19].values  # assuming 22 non-BoW features
# 3. Combine dense and sparse features
X = hstack([csr_matrix(dense_features), bow_sparse])  # shape: (n, 6023)
# 4. Split without converting to dense
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [33]:
## training model and making prediction
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [31]:
from sklearn.metrics import accuracy_score

In [34]:
accuracy_score(y_test, y_pred)

0.7271513022830147

In [29]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.




In [35]:
## trying xgboostclassifier 
from xgboost import XGBClassifier

xgb = XGBClassifier(tree_method='hist', use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
accuracy_score(y_test, y_pred)

0.8054737935640258

## The accuracy(80.547%) is good so far, Now doing hyperparameter tunning in order to get much better prediction

In [36]:
## randomized tunning
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, log_loss
import numpy as np

# Base model
xgb = XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='logloss',
    verbosity=0
)

# Parameter distributions to sample from
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5]
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,                 # Number of combinations to try
    scoring='neg_log_loss',   # Use 'accuracy' if you prefer
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit
random_search.fit(X_train, y_train)

# Best model
best_xgb = random_search.best_estimator_

# Evaluation
y_pred = best_xgb.predict(X_test)
y_proba = best_xgb.predict_proba(X_test)

print("Best Parameters (Randomized):", random_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Log Loss:", log_loss(y_test, y_proba))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters (Randomized): {'subsample': 0.6, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.2, 'gamma': 1, 'colsample_bytree': 1.0}
Accuracy: 0.8150832323332261
Log Loss: 0.37921089166331484


In [37]:
## Re-Trinimg the model based on parameter afetr hyper parameter tunning
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, log_loss

# Use best params from RandomizedSearchCV
xgb_final = XGBClassifier(
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='logloss',
    subsample=0.6,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.2,
    gamma=1,
    colsample_bytree=1.0
)

# Fit on full training data
xgb_final.fit(X_train, y_train)

# Predictions
y_pred = xgb_final.predict(X_test)
y_proba = xgb_final.predict_proba(X_test)

# Evaluation
print("Final Accuracy:", accuracy_score(y_test, y_pred))
print("Final Log Loss:", log_loss(y_test, y_proba))


Final Accuracy: 0.8150832323332261
Final Log Loss: 0.37921089166331484


## so far we are done with making model now checking its prediction on new set of question but before that we have to make the input of (1,6019) dimension

In [38]:
def test_common_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return len(w1 & w2)

In [39]:
def test_total_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return (len(w1) + len(w2))

In [82]:
def test_fetch_token_features(q1,q2):
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    
    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))
    
    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features

In [107]:
def test_fetch_fuzzy_features(q1, q2):
    return [
        fuzz.QRatio(q1, q2),
        fuzz.partial_ratio(q1, q2),
        fuzz.token_sort_ratio(q1, q2)
    ]

In [120]:
from scipy.sparse import hstack, csr_matrix
from fuzzywuzzy import fuzz

def query_point_creator(q1, q2):
    input_query = []

    # Preprocess
    q1 = preprocess(q1)
    q2 = preprocess(q2)

    # Basic handcrafted features (7)
    input_query.append(len(q1))                        # q1_len
    input_query.append(len(q2))                        # q2_len
    input_query.append(len(q1.split()))                # q1_num_words
    input_query.append(len(q2.split()))                # q2_num_words
    wc = test_common_words(q1, q2)
    wt = test_total_words(q1, q2)
    input_query.append(wc)                             # word_common
    input_query.append(wt)                             # word_total
    input_query.append(round(wc / wt, 2) if wt else 0) # word_share

    # Token-based features (8)
    token_features = test_fetch_token_features(q1, q2)  # returns 8
    assert len(token_features) == 8
    input_query.extend(token_features)

    # Fuzzy features (3)
    fuzzy_features = [
        fuzz.QRatio(q1, q2),
        fuzz.partial_ratio(q1, q2),
        fuzz.token_sort_ratio(q1, q2)
    ]
    input_query.extend(fuzzy_features)

    # ✅ Final handcrafted features count must be 18
    assert len(input_query) == 18, f"Got {len(input_query)} features instead of 18"

    # Bag of Words
    q1_bow = cv.transform([q1])
    q2_bow = cv.transform([q2])

    return hstack([
        csr_matrix(np.array(input_query).reshape(1, -1)),  # (1, 18)
        q1_bow,                                            # (1, 3000)
        q2_bow                                             # (1, 3000)
    ])



In [105]:
print(final_df.columns[1:19])

Index(['q1_len', 'q2_len', 'q1_num_words', 'q2_num_words', 'word_common',
       'word_total', 'word_share', 'cwc_min', 'cwc_max', 'csc_min', 'csc_max',
       'ctc_min', 'ctc_max', 'last_word_eq', 'first_word_eq', 'fuzz_ratio',
       'fuzz_partial_ratio', 'token_sort_ratio'],
      dtype='object')


In [121]:
query_vector = query_point_creator("How do I learn Python?", "What is the best way to learn Python?")
print(query_vector.shape)  # Should be (1, 6018)



(1, 6018)


In [129]:
q1 = 'What is the capital of India?'
q2 = 'What is the current capital of Pakistan?'
q3 = 'Where is the capital of india?'

In [132]:
xgb_final.predict(query_point_creator(q1,q3))

array([1])

In [133]:
import pickle

pickle.dump(xgb_final,open('model.pkl','wb'))
pickle.dump(cv,open('cv.pkl','wb'))