In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
import re

# English stopwords that are dropped during preprocessing.
# NOTE: the contracted forms (e.g. "don't") can never match, because the
# preprocessing strips apostrophes before tokenizing; the split fragments
# ('don', 't', ...) are listed as well and do the actual filtering.
stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it',
    "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
    'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
    "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
}

# Matches every character that is not a lowercase ASCII letter; compiled once
# because the preprocessing function is applied to every row of the data.
_NON_ALPHA_RE = re.compile(r'[^a-z]')


def preprocess_text_manual_stopwords(text):
    """Normalize a piece of text and strip stopwords from it.

    The text is lowercased, every character other than ``a``-``z`` is
    replaced by a space, the result is split on whitespace, and tokens
    found in ``stopwords`` are discarded.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or
            the float NaN that pandas produces for empty CSV cells) is
            treated as empty text instead of raising ``AttributeError``.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing remains or the input was not a string.
    """
    if not isinstance(text, str):
        # Missing values must not abort a whole DataFrame.apply() run.
        return ''
    # Lowercase first so the a-z character class also keeps former capitals.
    tokens = _NON_ALPHA_RE.sub(' ', text.lower()).split()
    return ' '.join(token for token in tokens if token not in stopwords)



# Load the training and test data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Clean both questions of every pair and join them into one document, so each
# question pair becomes a single bag-of-words sample for the classifier.
# The same steps are applied to the training and the test frame.
for frame in (train_data, test_data):
    frame['question1_clean'] = frame['question1'].apply(preprocess_text_manual_stopwords)
    frame['question2_clean'] = frame['question2'].apply(preprocess_text_manual_stopwords)
    frame['questions_combined'] = frame['question1_clean'] + " " + frame['question2_clean']

# Vectorize the text using CountVectorizer
vectorizer = CountVectorizer()

# Initialize the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Create a pipeline that vectorizes the text and then applies the classifier,
# and train it once on the combined questions of the training data.
pipeline = make_pipeline(vectorizer, nb_classifier)
pipeline.fit(train_data['questions_combined'], train_data['is_duplicate'])

# Predict the labels for the training data to check the training accuracy.
# This is measured on the data the model was fitted on, so it is not an
# estimate of how well the model generalizes.
y_train_pred = pipeline.predict(train_data['questions_combined'])
train_accuracy = accuracy_score(train_data['is_duplicate'], y_train_pred)
print(f"Accuracy of the Naive Bayes classifier on the training set: {train_accuracy:.4f}")

# Predict the labels for the combined questions of the test data
y_pred = pipeline.predict(test_data['questions_combined'])

# If the test data has true labels, calculate the accuracy
if 'is_duplicate' in test_data.columns:
    accuracy = accuracy_score(test_data['is_duplicate'], y_pred)
    print(f"Accuracy of the Naive Bayes classifier on the test set: {accuracy:.4f}")
else:
    # Without true labels, store the predictions next to the test rows.
    # The path is kept in one variable so the message always names the file
    # that was actually written.
    predictions_path = 'test_with_predictions.csv'
    test_data['predicted_is_duplicate'] = y_pred
    test_data.to_csv(predictions_path, index=False)
    print(f"Predictions made on the test set and saved to '{predictions_path}'.")



Accuracy of the Naive Bayes classifier on the training set: 0.9717
Predictions made on the test set and saved to 'Naive_Bayes_predictions.csv'.
