In [8]:
import pandas as pd 
import spacy 
import json 
from pandas.io.json import json_normalize
from tqdm import tqdm 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [11]:
# Generate question data
rows = []
neg_examples = []


# question text file into csv
with open("train_5500.label.txt", "r", encoding="ISO-8859-1") as f:
    i = 0
    for l in f.readlines():
        split_line = l.split()
        rows.append({"text": " ".join(split_line[1:]), "label":1})
pos_example = pd.DataFrame(rows)
pd.DataFrame(rows).to_csv("question_dataset.csv", index = False)

# statement & question text file into csv
nlp = spacy.load("en_core_web_sm")
data = json.load(open("train-v2.0.json", "r"))
data.keys()
extracted_rows = []

for article in tqdm(data['data']):
    for paragraph in article['paragraphs']:
        for sent in nlp(paragraph['context']).sents:
            extracted_rows.append({"text": sent.text, "label": 0})
        for question in paragraph['qas']:
            extracted_rows.append({"text": question['question'], "label": 1})

# statement text file into csv            
data2 = json.load(open("dev-v2.0.json", "r"))

for article in tqdm(data2['data']):
    for paragraph in article['paragraphs']:
        for sent in nlp(paragraph['context']).sents:
            extracted_rows.append({"text": sent.text, "label": 0})

table = pd.DataFrame(extracted_rows)
table.to_csv("qa.csv")

100%|██████████| 442/442 [07:51<00:00,  1.07s/it]
100%|██████████| 35/35 [00:25<00:00,  1.39it/s]


NameError: name 'lower' is not defined

In [12]:
table = pd.DataFrame(extracted_rows)
table.to_csv("qa.csv")
table.groupby(['label']).count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,104144
1,130319


In [13]:
# make a model 
train, test = train_test_split(table, stratify = table['label'], random_state = 75)

tfidf = TfidfVectorizer()
tfidf.fit(table['text'])

X_train = tfidf.transform(train['text'])
X_test = tfidf.transform(test['text'])

print(train.groupby(['label']).count())
test.groupby(['label']).count()

        text
label       
0      78108
1      97739


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,26036
1,32580


In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# multinomial fitting 
clf = MultinomialNB()
clf.fit(X_train, train['label'])
clf.score(X_train, train['label'])

mpreds = clf.predict(X_test)
print(classification_report(test['label'], mpreds))

              precision    recall  f1-score   support

           0       0.92      0.79      0.85     26036
           1       0.85      0.95      0.89     32580

    accuracy                           0.88     58616
   macro avg       0.88      0.87      0.87     58616
weighted avg       0.88      0.88      0.87     58616



In [15]:
from sklearn.linear_model import LogisticRegression

# logistic regression fitting
lclf = LogisticRegression()
lclf.fit(X_train, train['label'])
lclf.score(X_train, train['label'])

lpreds = lclf.predict(X_test)
print(classification_report(test['label'], lpreds))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98     26036
           1       0.98      0.98      0.98     32580

    accuracy                           0.98     58616
   macro avg       0.98      0.98      0.98     58616
weighted avg       0.98      0.98      0.98     58616



In [16]:
# Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier

rclf = RandomForestClassifier()
rclf.fit(X_train, train['label'])
rclf.score(X_train, train['label'])

rpreds = rclf.predict(X_test)
print(classification_report(test['label'], rpreds))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96     26036
           1       0.96      0.97      0.96     32580

    accuracy                           0.96     58616
   macro avg       0.96      0.96      0.96     58616
weighted avg       0.96      0.96      0.96     58616



In [18]:
# remove punctuation from every third entry and lower all text for all entries
import string
def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch.lower() for ch in text if ch not in exclude)

for i in tqdm(range(0, len(table), 3)):
    table['text'].iloc[i] = remove_punc(table['text'].iloc[i])

100%|██████████| 78155/78155 [23:50<00:00, 54.65it/s]


In [None]:
import swifter
import spacy 

nlp = spacy.load('en_core_web_sm')

def clean_str(text: str) -> str:
    return " ".join([tok.lemma_ for tok in nlp(text)])

for i in tqdm(range(0, len(table))):
    table['text'].iloc[i] = clean_str(table['text'].iloc[i])


 13%|█▎        | 29995/234463 [15:39<3:58:02, 14.32it/s]

In [137]:
import csv
import numpy as np
from sklearn.metrics import classification_report

# apply model to posh data
message = pd.read_csv("all_user_questions.csv")
message['message'] = message['message'].apply(lambda x: str(x)) 

m_encoded = tfidf.transform(message['message'].tolist())

# get predictions and scores
preds = lclf.predict(m_encoded)
scores = lclf.predict_proba(m_encoded)
 
message["is_question"] = preds
message['is_question_score'] = np.max(scores, axis = 1)

# get random sample and export to csv
m2 = message.sample(n=300, random_state = 2000)
m2.to_csv("sample.csv")

# get classification report
ms = pd.read_csv("sample2.csv")
print(classification_report(ms['is_question'], ms['gold_score']))

              precision    recall  f1-score   support

           0       0.96      0.83      0.89       184
           1       0.77      0.94      0.85       116

    accuracy                           0.87       300
   macro avg       0.86      0.88      0.87       300
weighted avg       0.89      0.87      0.87       300

