In [1]:
import pandas as pd 
import spacy 
import json 
from pandas.io.json import json_normalize
from tqdm.auto import tqdm 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import string
import swifter
import spacy 

In [3]:
# Generate question data
extracted_rows = []

# question text file into csv
with open("train_5500.label.txt", "r", encoding="ISO-8859-1") as f:
    i = 0
    for l in f.readlines():
        split_line = l.split()
        extracted_rows.append({"text": " ".join(split_line[1:]), "label":1})

# statement & question text file into csv
nlp = spacy.load("en_core_web_sm")
data = json.load(open("train-v2.0.json", "r"))
data.keys()

# remove punctuation and part of speech 
def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch.lower() for ch in text if ch not in exclude)

def clean_str(text: str) -> str:
    return " ".join([tok.lemma_ for tok in nlp(text)])

for article in tqdm(data['data']):
    for paragraph in article['paragraphs']:
        for sent in nlp(paragraph['context']).sents:
            extracted_rows.append({"text": sent.text, "label": 0})
        for i, question in enumerate(paragraph['qas']):
            if i % 3 == 0:
                extracted_rows.append({"text": remove_punc(question['question']), "label": 1})
            else:
                extracted_rows.append({"text": question['question'], "label": 1})

# statement text file into csv            
data2 = json.load(open("dev-v2.0.json", "r"))

for article in tqdm(data2['data']):
    for paragraph in article['paragraphs']:
        for sent in nlp(paragraph['context']).sents:
            extracted_rows.append({"text": sent.text, "label": 0})

# part of speech cleaning
for i in tqdm(range(0, len(extracted_rows))):
    extracted_rows[i]['text'] = clean_str(extracted_rows[i]['text'])
    
table = pd.DataFrame(extracted_rows)
table.to_csv("qa.csv")

HBox(children=(FloatProgress(value=0.0, max=442.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=239915.0), HTML(value='')))




In [4]:
table.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,104144
1,135771


# Get test and training data

In [5]:
# make a model 
train, test = train_test_split(table, stratify = table['label'], random_state = 75)

tfidf = TfidfVectorizer()
tfidf.fit(table['text'])

X_train = tfidf.transform(train['text'])
X_test = tfidf.transform(test['text'])

print(train.groupby(['label']).count())
test.groupby(['label']).count()

         text
label        
0       78108
1      101828


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,26036
1,33943


# Naive Bayes 

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# multinomial fitting 
clf = MultinomialNB(alpha = 0.6)
clf.fit(X_train, train['label'])
clf.score(X_train, train['label'])

mpreds = clf.predict(X_test)
print(classification_report(test['label'], mpreds))

              precision    recall  f1-score   support

           0       0.92      0.76      0.83     26036
           1       0.84      0.95      0.89     33943

    accuracy                           0.87     59979
   macro avg       0.88      0.85      0.86     59979
weighted avg       0.87      0.87      0.86     59979



# Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression

# logistic regression fitting
lclf = LogisticRegression()
lclf.fit(X_train, train['label'])
lclf.score(X_train, train['label'])

lpreds = lclf.predict(X_test)
print(classification_report(test['label'], lpreds))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     26036
           1       0.98      0.98      0.98     33943

    accuracy                           0.98     59979
   macro avg       0.98      0.98      0.98     59979
weighted avg       0.98      0.98      0.98     59979



# Random Forest Classifier

In [9]:
# Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier

rclf = RandomForestClassifier()
rclf.fit(X_train, train['label'])
rclf.score(X_train, train['label'])

rpreds = rclf.predict(X_test)
print(classification_report(test['label'], rpreds))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     26036
           1       0.97      0.97      0.97     33943

    accuracy                           0.96     59979
   macro avg       0.96      0.96      0.96     59979
weighted avg       0.96      0.96      0.96     59979



In [11]:
import csv
import numpy as np
from sklearn.metrics import classification_report

# apply model to posh data
message = pd.read_csv("all_user_questions.csv")
message['message'] = message['message'].apply(lambda x: str(x)) 

m_encoded = tfidf.transform(message['message'].tolist())

# get predictions and scores
preds = lclf.predict(m_encoded)
scores = lclf.predict_proba(m_encoded)
 
message["is_question"] = preds
message['is_question_score'] = np.max(scores, axis = 1)

# get random sample and export to csv
m2 = message.sample(n=300, random_state = 2000)


In [12]:
# get classification report
m2new = m2.drop_duplicates(subset='message', keep='first')
m2new.to_csv("sample.csv")

In [15]:
# get classification report
ms = pd.read_csv("sample2.csv")
print(classification_report(ms['is_question'], ms['gold_score']))

              precision    recall  f1-score   support

           0       0.95      0.84      0.89       140
           1       0.78      0.93      0.84        82

    accuracy                           0.87       222
   macro avg       0.86      0.88      0.87       222
weighted avg       0.89      0.87      0.88       222



In [12]:
import csv
import numpy as np
from sklearn.metrics import classification_report

# apply model to posh data
message = pd.read_csv("all_user_questions.csv")
message['message'] = message['message'].apply(lambda x: str(x)) 

m_encoded = tfidf.transform(message['message'].tolist())

# get predictions and scores
preds = rclf.predict(m_encoded)
scores = rclf.predict_proba(m_encoded)
 
message["is_question"] = preds
message['is_question_score'] = np.max(scores, axis = 1)

# get random sample and export to csv
m2 = message.sample(n=300, random_state = 2000)
m2.to_csv("rsample.csv")



In [13]:
ms = pd.read_csv("rsample2.csv")
print(classification_report(ms['is_question'], ms['gold_score']))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84       162
           1       0.81      0.83      0.82       138

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300



In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np


# Create first pipeline for base without reducing features.
pipe = Pipeline([('classifier' , LogisticRegression())])

# Create param grid.
param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']}]

y_train = train["label"]

# Create grid search object
gclf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=10, n_jobs=-1)

# Fit on data
best_clf = gclf.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3

In [20]:
best_clf = gclf.best_estimator_
preds = best_clf.predict(X_test)

y_test = test['label']
preds = best_clf.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98     26036
           1       0.98      0.98      0.98     33943

    accuracy                           0.98     59979
   macro avg       0.98      0.98      0.98     59979
weighted avg       0.98      0.98      0.98     59979



In [21]:
import csv
import numpy as np
from sklearn.metrics import classification_report

# apply model to posh data
message = pd.read_csv("all_user_questions.csv")
message['message'] = message['message'].apply(lambda x: str(x)) 

m2_encoded = tfidf.transform(message['message'].tolist())

# get predictions and scores
preds = best_clf.predict(m2_encoded)
scores = best_clf.predict_proba(m2_encoded)
 
message["is_question"] = preds
message['is_question_score'] = np.max(scores, axis = 1)

# get random sample and export to csv
m3 = message.sample(n=300, random_state = 2000)
m3new = m3.drop_duplicates(subset='message', keep='first')
m3new.to_csv("sample3.csv")

In [22]:
ms = pd.read_csv("sample3.csv")
print(classification_report(ms['is_question'], ms['gold_score']))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89       140
           1       0.78      0.91      0.84        82

    accuracy                           0.87       222
   macro avg       0.86      0.88      0.87       222
weighted avg       0.88      0.87      0.88       222



# Productionalize Model

In [None]:
from typing import List
import pickle

class QuestionClassifier:
    def __init__(self, model_path: str = None,
               train_mode = True):
        self.model = None
        self.text_featurizer = None
        self.label_encoder = None

        if model_path:
          self.load(model_path)

        self.train_mode = train_mode

    def load(self, model_path: str) -> bool:
        print("loading :", model_path)
        loaded_model = pickle.load(open(model_path, "rb"))
        self.model = loaded_model["model"]
        self.text_featurizer = loaded_model["text_featurizer"]
        self.train_mode = False
        return True

    def save(self, model_path: str, model_name: str) -> str:
        model_properties = {"model": self.model,
                            "text_featurizer": self.text_featurizer}
        filename = model_path+model_name+".pkl"
        pickle.dump(model_properties, open(filename,"wb"))
        return filename

    def remove_punc(self, text):
        exclude = set(string.punctuation)
        return ''.join(ch.lower() for ch in text if ch not in exclude)

    def clean_str(self, text: str) -> str:
        return " ".join([tok.lemma_ for tok in nlp(text)])

    def train(self, X: List[str], y: List[int] ) -> float:
        """" Take in X,y. Fit model to data and return
             the model's train accuracy """
        # Preprocess texts by removing punc and lemmatizing

        X = [clean_str(remove_punc(i)) for i in X]

        # 1. Clean X and convert features
        self.text_featurizer = TfidfVectorizer()
        self.text_featurizer.fit(X)

        X_train = self.text_featurizer.transform(X)

        # 2. Remove Punctuation & Part of Speech cleaning
        self.remove_punc = remove_punc(X)
        self.clean = clean_str(y)

        y_train = self.clean.transform(y)

        # 3. Load and fit model to X,y 
        self.model  = LogisticRegression()
        self.model.fit(X_train, y_train)

        # 4. Calcuate train accuracy
        train_acc = self.model.score(X_train, y_train)
        return train_acc
  
    def predict(self, inputs: List[str]) -> List[str]:
        """ Take in a list of string inputs and output
            a list of the model's predictions """

        # 1. Convert inputs into features
        input_feats = self.text_featurizer.transform(inputs)

        # 2. Run model on features and get predictions
        preds = self.model.predict(input_feats)

        # 3. Convert predictions into clean labels to return
        pred_labels = self.label_encoder.inverse_transform(preds)

        return pred_labels
