In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from joblib import load
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch each NLTK resource listed below via nltk.download
for nltk_resource in ('averaged_perceptron_tagger', 'wordnet', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Restore the persisted artefacts from disk.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
tfidf_vectorizer = load('tfidf_vectorizer.joblib')
svd_model = load('svd_model.joblib')
gb_model = load('gbmodel.joblib')

# English stopwords, held in a set for membership checks in clean_text
stop_words = set(stopwords.words('english'))

def expand_contractions(text):
    """Replace the supported English contractions in ``text`` with their long form.

    Matching is done on whole words, ignores case, and accepts both the
    straight (') and the typographic (U+2019) apostrophe. A leading capital
    letter is carried over to the expansion ("Don't" -> "Do not").

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` with every supported contraction expanded.
    """
    # Dictionary of English contractions (keys: lower-case, straight apostrophe)
    contractions_dict = {"don't": "do not", "doesn't": "does not", "didn't": "did not",
                         }
    # Character class matching either apostrophe style
    apostrophe_class = "['\u2019]"

    def key_to_pattern(key):
        # Escape the literal pieces and let the apostrophe match both styles
        return apostrophe_class.join(re.escape(piece) for piece in key.split("'"))

    # Regular expression for finding contractions as whole words
    contractions_re = re.compile(
        r'\b(?:%s)\b' % '|'.join(key_to_pattern(key) for key in contractions_dict),
        re.IGNORECASE,
    )

    def replace(match):
        found = match.group(0)
        # Normalise the match back to the dictionary's key form
        expanded = contractions_dict[found.lower().replace('\u2019', "'")]
        # Keep a leading capital so sentence-initial words stay capitalised
        if found[0].isupper():
            expanded = expanded[0].upper() + expanded[1:]
        return expanded

    return contractions_re.sub(replace, text)

# Function to handle LaTeX expressions
def clean_math_text(text):
    """Verbalise LaTeX found between ``[math]`` and ``[/math]`` tags.

    Each tagged section is replaced by its content after the regex
    substitutions below have been applied in dictionary order. Afterwards
    curly braces are stripped and whitespace is collapsed across the whole
    string, not just inside the tagged sections.

    Operator replacements are padded with spaces so that operands stay
    separate tokens ("a+b" -> "a plus b" rather than "aplusb"); the final
    whitespace collapse removes any surplus spaces.

    NOTE(review): if the persisted vectorizer was fitted on text produced by
    the earlier, unpadded operator replacements, re-validate or refit it.

    Args:
        text: Raw text, possibly containing ``[math]...[/math]`` sections.

    Returns:
        The text with math sections verbalised, braces removed and
        whitespace normalised.
    """
    replacements = {
        # Basic operations and structures
        r'\\frac\{(.*?)\}\{(.*?)\}': r'\1 over \2',
        r'\\sqrt\{(.*?)\}': r'square root of \1',
        r'\\sum_(\{.*?\})\^(\{.*?\})': r'sum from \1 to \2',
        r'\\int_(\{.*?\})\^(\{.*?\})': r'integral from \1 to \2',
        r'\\log_(\{.*?\})\{(.*?)\}': r'log base \1 of \2',
        r'\\lim_(\{.*?\})': r'limit as \1',
        r'(\d+)\^(\{?\d+\}?)': r'\1 to the power of \2',
        r'\\infty': 'infinity',
        r'\\pm': ' plus or minus ',
        # Greek letters
        r'\\alpha': 'alpha', r'\\beta': 'beta', r'\\gamma': 'gamma',
        r'\\delta': 'delta', r'\\epsilon': 'epsilon', r'\\zeta': 'zeta',
        r'\\eta': 'eta', r'\\theta': 'theta', r'\\iota': 'iota',
        r'\\kappa': 'kappa', r'\\lambda': 'lambda', r'\\mu': 'mu',
        r'\\nu': 'nu', r'\\xi': 'xi', r'\\omicron': 'omicron',
        r'\\pi': 'pi', r'\\rho': 'rho', r'\\sigma': 'sigma',
        r'\\tau': 'tau', r'\\upsilon': 'upsilon', r'\\phi': 'phi',
        r'\\chi': 'chi', r'\\psi': 'psi', r'\\omega': 'omega',
        # Trigonometric functions
        r'\\sin': 'sine', r'\\cos': 'cosine', r'\\tan': 'tangent',
        r'\\csc': 'cosecant', r'\\sec': 'secant', r'\\cot': 'cotangent',
        # Differential and partial differential
        r'\\partial': 'partial', r'\\nabla': 'nabla',
        r'\\mathrm\{d\}': 'd',  # For derivatives
        # Other mathematical symbols (space-padded: they sit between operands)
        r'\\times': ' times ', r'\\div': ' divided by ', r'\\cdot': ' dot ',
        # Additional symbols and operations (space-padded for the same reason)
        r'\+': ' plus ', r'\-': ' minus ', r'\*': ' times ',
        # Handling general exponentiation
        r'\\exp\{(.*?)\}': r'e to the power of \1',  # For exponential functions
        r'(\w+)\^(\w+)': r'\1 to the power of \2',  # General exponentiation
        # Handling \mathop
        r'\\mathop\{\\rm ([^}]+)\}': r'operator \1',
    }

    # Function to apply replacements to a matched object
    def apply_replacements(match):
        # group(1) is the text between the tags; group(0) would include the tags
        math_text = match.group(1)

        # Apply every replacement, in dictionary order, to the math text
        for pattern, replacement in replacements.items():
            math_text = re.sub(pattern, replacement, math_text)

        # Return the transformed math text (the tags themselves are dropped)
        return math_text

    # Using re.sub with a function that applies the replacements for each [math] section
    # Pattern captures the content between [math] and [/math] tags
    pattern = r'\[math\](.*?)\[/math\]'
    clean_text = re.sub(pattern, apply_replacements, text)

    # Removing unnecessary braces and cleanup, applied globally to the whole text
    clean_text = re.sub(r'\{|\}', '', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

# Function to clean text
def clean_text(text):
    """Normalise a raw question into a space-separated string of tokens.

    Steps, in order: verbalise LaTeX via ``clean_math_text``, lowercase,
    strip HTML tags, strip URLs, expand contractions via
    ``expand_contractions``, turn every non-word character into a space,
    collapse whitespace, and drop tokens found in ``stop_words``.

    Args:
        text: The raw question text.

    Returns:
        The cleaned text with the remaining tokens joined by single spaces.
    """
    # LaTeX handling runs first, then everything is lowercased
    cleaned = clean_math_text(text).lower()
    # Drop HTML tags
    cleaned = re.sub(r'<.*?>', '', cleaned)
    # Drop URLs
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    # Expand contractions while the apostrophes are still present
    cleaned = expand_contractions(cleaned)
    # Replace special characters with spaces
    cleaned = re.sub(r'\W', ' ', cleaned)
    # Collapse runs of whitespace, then tokenise
    tokens = re.sub(r'\s+', ' ', cleaned).strip().split()
    # Keep only the tokens that are not stopwords
    return ' '.join(token for token in tokens if token not in stop_words)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\omsan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\omsan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omsan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omsan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
def expand_contractions(text):
    """Expand the supported English contractions found in ``text``.

    Contractions are matched as whole words, case-insensitively, with either
    the straight (') or the typographic (U+2019) apostrophe. When the matched
    word starts with a capital letter, so does its expansion.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every supported contraction replaced by its long form.
    """
    # Contractions map (keys: lower-case, straight apostrophe)
    contractions_dict = {
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        }
    # Either apostrophe style is accepted where a key has an apostrophe
    apostrophe_class = "['\u2019]"
    alternatives = [
        apostrophe_class.join(re.escape(piece) for piece in key.split("'"))
        for key in contractions_dict
    ]
    # Regular expression for finding contractions as whole words
    contractions_re = re.compile(r'\b(?:%s)\b' % '|'.join(alternatives), re.IGNORECASE)

    def replace(match):
        found = match.group(0)
        # Map the match back onto the dictionary's key form before the lookup
        expanded = contractions_dict[found.lower().replace('\u2019', "'")]
        # Preserve a leading capital letter
        return expanded[0].upper() + expanded[1:] if found[0].isupper() else expanded

    return contractions_re.sub(replace, text)

def generate_features(clean_q1, clean_q2):
    """Build the six hand-crafted features for a pair of cleaned questions.

    Args:
        clean_q1: First question, already cleaned.
        clean_q2: Second question, already cleaned.

    Returns:
        A NumPy array holding, in order: character length of each question,
        number of distinct words the two share, word count of each question,
        and the absolute difference between the two word counts.
    """
    # Tokenise each question once on whitespace
    tokens_q1 = clean_q1.split()
    tokens_q2 = clean_q2.split()
    count_q1, count_q2 = len(tokens_q1), len(tokens_q2)

    # Distinct words present in both questions
    shared_words = set(tokens_q1) & set(tokens_q2)

    return np.array([
        len(clean_q1),            # lengthq1
        len(clean_q2),            # lengthq2
        len(shared_words),        # common_words
        count_q1,                 # q1_wordlen
        count_q2,                 # q2_wordlen
        abs(count_q1 - count_q2),  # word_difference
    ])

def vectorize_and_reduce(question1, question2):
    """Return the element-wise squared difference of the two reduced vectors.

    Each question is passed through the module-level ``tfidf_vectorizer`` and
    then through ``svd_model``; the two results are subtracted, squared and
    flattened.

    Args:
        question1: First question text.
        question2: Second question text.

    Returns:
        A flattened NumPy array of squared differences.
    """
    def embed(question):
        # Vectorize a single question, then reduce its dimensions
        return svd_model.transform(tfidf_vectorizer.transform([question]))

    delta = embed(question1) - embed(question2)
    return np.square(delta).flatten()

def predict_duplicate_proba(question1, question2):
    """Return ``gb_model.predict_proba`` for a pair of raw questions.

    Both questions are cleaned with ``clean_text``; the hand-crafted features
    from ``generate_features`` and the squared differences from
    ``vectorize_and_reduce`` are placed side by side in a one-row DataFrame,
    which is handed to the model.

    Args:
        question1: First raw question.
        question2: Second raw question.

    Returns:
        Whatever ``gb_model.predict_proba`` returns for the single row; the
        example usage in this file reads it as ``[0][0]`` (non-duplicate) and
        ``[0][1]`` (duplicate).
    """
    # Clean both inputs
    cleaned_first = clean_text(question1)
    cleaned_second = clean_text(question2)

    # Hand-crafted features and SVD-based squared differences
    handcrafted = generate_features(cleaned_first, cleaned_second)
    svd_sq_diffs = vectorize_and_reduce(cleaned_first, cleaned_second)

    # Column names: the six basic features, then '0', '1', '2', ... for the SVD part
    column_names = [
        'lengthq1', 'lengthq2', 'common_words', 'q1_wordlen', 'q2_wordlen', 'word_difference',
    ]
    column_names += [str(position) for position in range(svd_sq_diffs.shape[0])]

    # Single-row frame so the model receives named features
    feature_row = np.hstack((handcrafted, svd_sq_diffs))
    model_input = pd.DataFrame([feature_row], columns=column_names)

    # Class probabilities from the GradientBoosting model
    return gb_model.predict_proba(model_input)

# Example usage: score one pair of near-identical questions
question1 = "What is the best thing that someone did for you on your birthday?"
question2 = "What is the best thing someone did for you on your birthday?"
probas = predict_duplicate_proba(question1, question2)

# Only one pair was scored, so read the probabilities from the first row
pair_probas = probas[0]
print("Probability of being non-duplicate: ", pair_probas[0])
print("Probability of being duplicate: ", pair_probas[1])


Probability of being non-duplicate:  0.3640749710596458
Probability of being duplicate:  0.6359250289403542


In [41]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Assuming predict_duplicate_proba is already defined as per previous discussions

# Draw a 100-row sample; the fixed random state keeps it reproducible
df = pd.read_csv('questions.csv').sample(100, random_state=42)

# Ground-truth labels for the sampled pairs
y_true = df['is_duplicate'].tolist()

# Duplicate-class probability for every sampled pair
probas = [
    predict_duplicate_proba(first, second)[0][1]
    for first, second in zip(df['question1'], df['question2'])
]

# Hard predictions, assuming 0.5 as the initial threshold
y_pred = [1 if proba > 0.5 else 0 for proba in probas]

# Function to calculate F-beta score
def fbeta_score(precision, recall, beta=0.5):
    """Compute the F-beta score from a precision and a recall value.

    Args:
        precision: Precision of the predictions.
        recall: Recall of the predictions.
        beta: Weighting parameter; values below 1 favour precision, values
            above 1 favour recall. Defaults to 0.5.

    Returns:
        ``(1 + beta**2) * precision * recall / (beta**2 * precision + recall)``,
        or ``0.0`` when that denominator is zero (for example when precision
        and recall are both 0 because no positives were predicted).
    """
    beta_squared = beta ** 2
    denominator = (beta_squared * precision) + recall
    # Guard the 0/0 case instead of raising or returning NaN
    if denominator == 0:
        return 0.0
    return (1 + beta_squared) * (precision * recall) / denominator

# Calculate and print the precision, recall, accuracy, f1 score, and F-beta score
# for the predictions made at the initial 0.5 threshold
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
fbeta = fbeta_score(precision, recall, beta=0.5)  # More weight on precision

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"F-beta Score: {fbeta}")

# Run a loop of predict proba threshold of 0.2 to 0.7 with steps of 0.05 in between and check the metrics for all.
# linspace fixes the number of points (11) and includes the 0.7 endpoint; rounding
# removes floating-point residue so a threshold is exactly 0.4, not 0.39999999999999997.
thresholds = np.round(np.linspace(0.2, 0.7, 11), 2)
for threshold in thresholds:
    y_pred = [1 if proba > threshold else 0 for proba in probas]
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    fbeta = fbeta_score(precision, recall, beta=0.5)  # Calculate F-beta score for each threshold

    print(f"\nThreshold: {threshold}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {accuracy}")
    print(f"F-beta Score: {fbeta}")  # Print F-beta score

Precision: 0.5652173913043478
Recall: 0.40625
Accuracy: 0.71
F1 Score: 0.4727272727272727
F-beta Score: 0.5241935483870969

Threshold: 0.2
Precision: 0.48333333333333334
Recall: 0.90625
Accuracy: 0.66
F-beta Score: 0.5330882352941178

Threshold: 0.25
Precision: 0.5490196078431373
Recall: 0.875
Accuracy: 0.73
F-beta Score: 0.5932203389830509

Threshold: 0.3
Precision: 0.5952380952380952
Recall: 0.78125
Accuracy: 0.76
F-beta Score: 0.625

Threshold: 0.35
Precision: 0.5675675675675675
Recall: 0.65625
Accuracy: 0.73
F-beta Score: 0.5833333333333334

Threshold: 0.39999999999999997
Precision: 0.5555555555555556
Recall: 0.625
Accuracy: 0.72
F-beta Score: 0.5681818181818182

Threshold: 0.44999999999999996
Precision: 0.5862068965517241
Recall: 0.53125
Accuracy: 0.73
F-beta Score: 0.5743243243243242

Threshold: 0.49999999999999994
Precision: 0.5652173913043478
Recall: 0.40625
Accuracy: 0.71
F-beta Score: 0.5241935483870969

Threshold: 0.5499999999999999
Precision: 0.5789473684210527
Recall: 0.34

| Threshold | Precision | Recall | Accuracy | F-beta Score |
|-----------|-----------|--------|----------|--------------|
| **0.3**   | 0.60      | 0.78   | 0.76     | **0.62**     |
| 0.25      | 0.55      | 0.88   | 0.73     | 0.59         |
| 0.35      | 0.57      | 0.66   | 0.73     | 0.58         |
| 0.45      | 0.59      | 0.53   | 0.73     | 0.57         |
| 0.4       | 0.56      | 0.62   | 0.72     | 0.57         |
| 0.2       | 0.48      | 0.91   | 0.66     | 0.53         |
| 0.5       | 0.57      | 0.41   | 0.71     | 0.52         |
| 0.55      | 0.58      | 0.34   | 0.71     | 0.51         |
| 0.6       | 0.60      | 0.28   | 0.71     | 0.49         |
| 0.65      | 0.70      | 0.22   | 0.72     | 0.49         |
| 0.7       | 0.80      | 0.13   | 0.71     | 0.38         |


When setting $\beta$ to 0.5 in the F-beta score formula, it emphasizes precision over recall. Specifically:

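- $F_\beta$ is a weighted harmonic mean of precision and recall in which precision has weight 1 and recall has weight $\beta^2$.
- With $\beta < 1$ the weight on recall shrinks, so the score leans towards precision; with $\beta > 1$ it leans towards recall.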

The F-beta score formula is:

$$
F_{\beta} = (1 + \beta^2) \times \frac{{\text{precision} \times \text{recall}}}{{(\beta^2 \times \text{precision}) + \text{recall}}}
$$

With $\beta$ set to 0.5, the formula becomes:

$$
F_{0.5} = (1 + 0.5^2) \times \frac{{\text{precision} \times \text{recall}}}{{(0.5^2 \times \text{precision}) + \text{recall}}}
$$

$$
F_{0.5} = 1.25 \times \frac{{\text{precision} \times \text{recall}}}{{0.25 \times \text{precision} + \text{recall}}}
$$

This implies that, in the weighted harmonic mean behind the $F_{0.5}$ score, precision carries $\frac{1}{0.5^2} = \frac{1}{0.25} = 4$ times the weight of recall. In the conventional reading of $F_\beta$, this corresponds to recall being treated as $\beta = 0.5$ times as important as precision. This weighting scheme is particularly useful when false positives are more costly or undesirable than false negatives.
