In [1]:
import pandas as pd
import numpy as np
import nltk
pd.set_option("display.max_colwidth",80)

df = pd.read_csv("Questions.csv", sep=";")
def pandas_df_to_markdown_table(df):
    from IPython.display import Markdown, display
    fmt = ['---' for i in range(len(df.columns))]
    df_fmt = pd.DataFrame([fmt], columns=df.columns)
    df_formatted = pd.concat([df_fmt, df])
    return Markdown(df_formatted.to_csv(sep="|", index=False))
pandas_df_to_markdown_table(df.head())

Question|Type
---|---
Is Hirschsprung disease a mendelian or a multifactorial disorder?|summary
List signaling molecules (ligands) that interact with the receptor EGFR?|list
Is the protein Papilin secreted?|yesno
Are long non coding RNAs spliced?|yesno
Is RANKL secreted from the cells?|yesno


In [2]:
df.shape

(2251, 2)

In [3]:
labels = df.Type
print(labels.head())
questions = df.Question
print(questions.head())

0    summary
1       list
2      yesno
3      yesno
4      yesno
Name: Type, dtype: object
0           Is Hirschsprung disease a mendelian or a multifactorial disorder?
1    List signaling molecules (ligands) that interact with the receptor EGFR?
2                                            Is the protein Papilin secreted?
3                                           Are long non coding RNAs spliced?
4                                           Is RANKL secreted from the cells?
Name: Question, dtype: object


In [4]:
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import PorterStemmer
import re

from string import punctuation
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
porter = PorterStemmer()
stopwords_en = set(stopwords.words('english'))


def lemmatize(p):
    if p[1][0] in {'N', 'V'}:
        return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
    return p[0]


def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords_en]
    tokens = [token for token in tokens if token not in punctuation]
    tokens = [re.sub(r'[^a-zA-Z]', "", token) for token in tokens]
    tokens = [token for token in tokens if len(token) > 1]
    tokens = [token.lower() for token in tokens]
    tokens = [lemmatize(pair) for pair in pos_tag(tokens)]
    tokens = [porter.stem(token) for token in tokens]
    return tokens

In [5]:
preprocess_text(
    "List signaling molecules ligands that interact with the receptor EGFR?"
)

['list', 'signal', 'molecul', 'ligand', 'interact', 'receptor', 'egfr']

In [6]:
from io import StringIO
from sklearn.feature_extraction.text import CountVectorizer

with StringIO('\n'.join([i for i in questions.values])) as text:
    count_vect = CountVectorizer(analyzer=preprocess_text)
    count_vect.fit_transform(text)
len(count_vect.vocabulary_)

3601

In [7]:
from operator import itemgetter
words_sorted_by_index, _ = zip(*sorted(count_vect.vocabulary_.items(), key=itemgetter(1)))
print('Vocabulary:', words_sorted_by_index[:5], "...")

Vocabulary: ('aa', 'aagena', 'abacavir', 'abatacept', 'abc') ...


In [8]:
x = count_vect.transform([i for i in questions.values]).toarray()
y, classes = pd.factorize(labels)
y.shape, x.shape

((2251,), (2251, 3601))

In [13]:
clfs_to_test = ['DTC', 'RFC', 'KNN', 'LR', 'XGB']
clfs_to_test = ["LR"]

In [39]:
from utilities import *
import pickle
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# with open('gs_list.pickle', 'wb') as handle:
#     pickle.dump(gs_list, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('gs_list.pickle', 'rb') as handle:
    gs_list = pickle.load(handle)
    
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.3, random_state=0)
# gs_list = model_selection(clfs_to_test, x_train, x_valid, y_train, y_valid)

clf = gs_list[0][1]

In [38]:
def predict_question(question):
    x = count_vect.transform([question]).toarray()
    return classes[clf.predict(x)[0]]
        
        
for i in range(15):
    question = input()
    print("class:", predict_question(question))

Do you like to study?
class: yesno
How do you feel rright now?
class: summary
What is your name?
class: summary
List two of your favourie films.
class: list
What is the biggest country in Europe?
class: summary
Where are you?
class: factoid
How old are you?
class: summary
d
class: factoid
d
class: factoid
d
class: factoid

class: factoid
d
class: factoid
d
class: factoid
d
class: factoid
d
class: factoid
