In [230]:
import pandas as pd
import numpy as np
pd.set_option("display.max_colwidth",80)

df = pd.read_csv("Questions.csv", sep=";")
def pandas_df_to_markdown_table(df):
    from IPython.display import Markdown, display
    fmt = ['---' for i in range(len(df.columns))]
    df_fmt = pd.DataFrame([fmt], columns=df.columns)
    df_formatted = pd.concat([df_fmt, df])
    return Markdown(df_formatted.to_csv(sep="|", index=False))
pandas_df_to_markdown_table(df.head())

Question|Type
---|---
Is Hirschsprung disease a mendelian or a multifactorial disorder?|summary
List signaling molecules (ligands) that interact with the receptor EGFR?|list
Is the protein Papilin secreted?|yesno
Are long non coding RNAs spliced?|yesno
Is RANKL secreted from the cells?|yesno


In [235]:
df.shape

(2251, 2)

In [121]:
labels = df.Type
print(labels.head())
questions = df.Question
print(questions.head())

0    summary
1       list
2      yesno
3      yesno
4      yesno
Name: Type, dtype: object
0    Is Hirschsprung disease a mendelian or a multi...
1    List signaling molecules (ligands) that intera...
2                     Is the protein Papilin secreted?
3                    Are long non coding RNAs spliced?
4                    Is RANKL secreted from the cells?
Name: Question, dtype: object


In [250]:
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import PorterStemmer
import re

from string import punctuation
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
porter = PorterStemmer()
stopwords_en = set(stopwords.words('english'))


def lemmatize(p):
    if p[1][0] in {'N', 'V'}:
        return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
    return p[0]


def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords_en]
    tokens = [token for token in tokens if token not in punctuation]
    tokens = [re.sub(r'[^a-zA-Z]', "", token) for token in tokens]
    tokens = [token for token in tokens if len(token) > 1]
    tokens = [token.lower() for token in tokens]
    tokens = [lemmatize(pair) for pair in pos_tag(tokens)]
    tokens = [porter.stem(token) for token in tokens]
    return tokens

In [251]:
preprocess_text(
    "List signaling molecules ligands that interact with the receptor EGFR?"
)

['list', 'signal', 'molecul', 'ligand', 'interact', 'receptor', 'egfr']

In [252]:
from io import StringIO
from sklearn.feature_extraction.text import CountVectorizer

with StringIO('\n'.join([i for i in questions.values])) as text:
    count_vect = CountVectorizer(analyzer=preprocess_text)
    count_vect.fit_transform(text)
len(count_vect.vocabulary_)

3601

In [253]:
from operator import itemgetter
words_sorted_by_index, _ = zip(
    *sorted(count_vect.vocabulary_.items(), key=itemgetter(1)))
print('Vocabulary:', words_sorted_by_index[:5], "...")
print('Encoded Matrix:\n',
      count_vect.transform([i for i in questions.values]).toarray().shape)

Vocabulary: ('aa', 'aagena', 'abacavir', 'abatacept', 'abc') ...
Encoded Matrix:
 (2251, 3601)


In [254]:
from sklearn import linear_model

classifier = linear_model.LogisticRegression(multi_class='multinomial',
                                             solver='lbfgs')
classifier.fit(np.array(X[:800]).reshape(-1, 1), np.array(Y[:800]))

pred_train = [classifier.predict(np.array(X[:800]).reshape(-1, 1))]
pred_test = [classifier.predict(np.array(X).reshape(-1, 1))]

print("Confidence:")
print("Train accuracy " + str(confidence(pred_train, Y[:800])))
print("Test accuracy " + str(confidence(pred_test, Y)))

NameError: name 'X' is not defined