In [None]:
import spacy
import pandas as pd
from nltk.stem import WordNetLemmatizer
import datetime
from colorama import Fore
import time
import numpy as np
from sklearn.model_selection import train_test_split
import json
import nltk
import category_encoders as ce
from sklearn.preprocessing import RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Wall-clock reference point; the last cell subtracts it from its own
# time.time() reading to report the total execution time.
start_time = time.time()

# Read the clock once. Calling datetime.datetime.now() separately for the
# hour, minute and second can mix fields from two different instants when the
# calls straddle a second/minute/hour boundary (e.g. "10:0:0" at 10:59:59).
started_at = datetime.datetime.now()
print(
    Fore.WHITE
    + "Start Time: = %s:%s:%s\n"
    % (started_at.hour, started_at.minute, started_at.second)
)

# spaCy pipeline and WordNet lemmatizer. They are not referenced by the cells
# visible in this notebook section; kept in case other cells rely on them.
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()

In [None]:
# Load the feature table from disk.
df = pd.read_csv("ClassificationAlgos/Features_dev1.csv", encoding="utf8")

# "answer" is the target; every remaining column is used as a feature.
y = df["answer"]
X = df.drop(columns=["answer"])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

# Cell output: (rows, columns) of the training and test feature frames.
(X_train.shape, X_test.shape)

In [None]:
# Sort the training columns into two lists by dtype: object ("O") columns are
# reported as categorical, everything else as numerical.
categorical, numerical = [], []
for col in X_train.columns:
    bucket = categorical if X_train[col].dtypes == "O" else numerical
    bucket.append(col)

print(Fore.GREEN + "Categorical Variables:", categorical)
print(Fore.GREEN + "Numerical Variables:", numerical)

In [None]:
# Impute missing feature values with the most frequent (mode) value.
# The modes are computed on X_train only and then applied to both splits, so
# the test set does not influence the fill values.
#
# Note: object and numeric columns alike are filled with the mode.
#
# The fill uses DataFrame.fillna(<dict>) with re-assignment rather than
# `frame[col].fillna(..., inplace=True)`. The in-place form acts on a temporary
# column object (chained assignment): it emits warnings on pandas 2.x and
# leaves the frame unchanged when Copy-on-Write is enabled.
#
# "question" and "question_length" were commented out of the imputation before
# and remain excluded; "answer" is the target and is not a column of X.
impute_columns = [
    "span",
    "wh_word",
    "syntactic_divergence",
    "root_matching",
    "span_TFIDF",
    "matching_word_frequency",
    "bigram_overlap",
    "trigram_overlap",
    "span_word_frequency",
    "bigram_TFIDF",
    "trigram_TFIDF",
    "minkowski_distance",
    "manhattan_distance",
    "euclidean_distance",
    "hamming_distance",
    "jaccard_distance",
    "edit_distance",
    "span_length",
]

# mode() lists ties in sorted order, so [0] is a deterministic pick.
impute_values = {col: X_train[col].mode()[0] for col in impute_columns}

# A dict passed to fillna only touches the listed columns.
X_train = X_train.fillna(value=impute_values)
X_test = X_test.fillna(value=impute_values)


In [None]:
# print(
#     Fore.YELLOW + "Check missing values in categorical variables in X_train:\n",
#     X_train[categorical].isnull().sum(),
# )
# print(
#     Fore.YELLOW + "Check missing values in categorical variables in X_test:\n",
#     X_test[categorical].isnull().sum(),
# )
# print(Fore.YELLOW + "Check missing values in X_train:\n", X_train.isnull().sum())
# print(Fore.YELLOW + "Check missing values in X_test\n", X_test.isnull().sum())


In [None]:
# One-hot encode the "question" and "span" columns.
# The encoder is fitted on the training frame only and then re-used, unchanged,
# to transform the test frame.
one_hot_columns = ["question", "span"]
encoder = ce.OneHotEncoder(cols=one_hot_columns)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [None]:
# Feature scaling with RobustScaler, fitted on the training frame only and
# then applied to the test frame.
#
# The scaled values are wrapped back into DataFrames, so the column labels and
# row index are captured first:
#   * columns=cols (not [cols]) -- wrapping the Index in a list makes pandas
#     build MultiIndex columns, which breaks plain `frame["name"]` lookups.
#   * index=... -- without it the frames get a fresh 0..n-1 index and no
#     longer align with y_train / y_test, which keep the shuffled split index.
cols = X_train.columns
train_index = X_train.index
test_index = X_test.index

scaler = RobustScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols, index=train_index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols, index=test_index)

In [None]:
# Build a Gaussian Naive Bayes classifier and fit it on the training split.
gnb = GaussianNB()
gnb = gnb.fit(X_train, y_train)  # fit() returns the estimator itself

# Cell output: the fitted estimator.
gnb

In [None]:
# Predict a label for every row of the held-out test features using the
# classifier fitted above; y_pred is compared against y_test in later cells.
y_pred = gnb.predict(X_test)
# print("Results are:", y_pred)

In [None]:
# Accuracy of the test-set predictions, reported as a percentage.
test_accuracy_pct = accuracy_score(y_test, y_pred) * 100
accuracy_line = "\nModel accuracy score: {0:0.4f}%".format(test_accuracy_pct)
print(Fore.LIGHTBLUE_EX + accuracy_line)

In [None]:
# Accuracy on the training split itself, for comparison with the test-set
# figure (a large gap between the two would point to over-fitting).
y_pred_train = gnb.predict(X_train)
train_accuracy_pct = accuracy_score(y_train, y_pred_train) * 100
print("Training-set accuracy score: {0:0.4f}%".format(train_accuracy_pct))

In [None]:
# Report the estimator's own score() on each split, as a percentage.
train_score_pct = gnb.score(X_train, y_train) * 100
print("\nTraining set score: {:.4f}%".format(train_score_pct))

test_score_pct = gnb.score(X_test, y_test) * 100
print("Test set score: {:.4f}%".format(test_score_pct))


In [None]:
# Wall-clock seconds elapsed since start_time was recorded.
end_time = time.time()
elapsed_seconds = end_time - start_time
print(Fore.WHITE + "\nExecution Time = ", elapsed_seconds, "seconds")
