In [25]:
# Load packages
import scipy.io
import scipy.sparse
import pandas as pd
from IPython.display import display
import numpy as np

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

# Packages for classification
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.mixture import GaussianMixture
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Packages for NLP
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD # Features from bags of word are sparse.
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer

### Training and Testing

In [26]:
# Train: Return fitted model
def train(classifier, X, y):
    test_size = 0.25
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    model = classifier.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    print('Model Accuracy: {}'.format(accuracy_score(y_test, test_pred)))
    return model
# Test: Return probability matrix 
def test(classifier, X):
    return classifier.predict_proba(X)
# Cross Validation: Return accuracy score of given classifier
def CV(X, y, classifier):
    prob = cross_val_predict(classifier, X, y, cv=3, method='predict_proba', n_jobs=-1)
    pred = np.unique(y)[np.argmax(prob, axis=1)]
    print('CV Accuracy: {}'.format(accuracy_score(y, pred)))
    return None
def stem(text):
    stemmer = SnowballStemmer("english")
    tknzr = TweetTokenizer()
    list_words = tknzr.tokenize(text)
    stem_words = [stemmer.stem(w) for w in list_words if w.isdigit() == False]
    return ' '.join(stem_words)

### Try

In [27]:
# Training Set
var_train = pd.read_csv('training_variants')
txt_train = pd.read_csv('training_text', sep='\|\|', engine='python', header=None, skiprows=1, names=["ID","Text"])
training = pd.merge(var_train, txt_train, on = 'ID')

# Testing Set
var_test2 = pd.read_csv('stage2_test_variants.csv')
txt_test2 = pd.read_csv('stage2_test_text.csv',sep='\|\|', engine='python', header=None, skiprows=1, names=["ID","Text"])
testing2 = pd.merge(var_test2, txt_test2, on = 'ID')

# Stem text
n_train = training.shape[0]
n_test = testing2.shape[0]
for i in range(n_train):
    training['Text'][i] = stem(training['Text'][i])
for i in range(n_test):
    testing2['Text'][i] = stem(testing2['Text'][i])
training.to_csv('training_stemmed.csv', index = False, encoding = 'utf-8')
testing2.to_csv('testing_stemmed.csv', index = False, encoding = 'utf-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [28]:
training = pd.read_csv('training_stemmed.csv', encoding = 'utf-8')
testing2 = pd.read_csv('testing_stemmed.csv', encoding = 'utf-8')

In [29]:
# TF-IDF
tfidf = TfidfVectorizer(analyzer="word", stop_words='english', max_features = 80)
tf_idf = tfidf.fit_transform(training['Text'])
words = tfidf.get_feature_names() # Use the same dictionary as training features
tfidf_test = TfidfVectorizer(analyzer="word", stop_words='english', max_features = 80, vocabulary = words)
tf_idf_test = tfidf_test.fit_transform(testing2['Text'])

# Train Model
X = tf_idf
y = training['Class']
model = XGBClassifier()
model_train = train(model, X, y)

# Prediction
T = tf_idf_test
r = test(model_train, T)
r_ID = np.c_[testing2['ID'],r]
submit = pd.DataFrame(r_ID, columns= ['ID','class1','class2','class3','class4','class5','class6','class7','class8','class9'])
submit['ID'] = submit['ID'].astype(int)
submit.head()
submit.to_csv('submission.csv', index=False)

Model Accuracy: 0.641395908543923


In [18]:
tfidf.get_feature_names()

['activ',
 'al',
 'all',
 'also',
 'an',
 'analysi',
 'and',
 'are',
 'as',
 'assay',
 'associ',
 'at',
 'be',
 'been',
 'between',
 'bind',
 'both',
 'brca',
 'but',
 'by',
 'cancer',
 'case',
 'cell',
 'clinic',
 'data',
 'differ',
 'dna',
 'domain',
 'effect',
 'egfr',
 'et',
 'exon',
 'express',
 'fig',
 'figur',
 'for',
 'from',
 'function',
 'gene',
 'has',
 'have',
 'human',
 'identifi',
 'in',
 'includ',
 'increas',
 'indic',
 'inhibitor',
 'interact',
 'is',
 'it',
 'kinas',
 'level',
 'line',
 'may',
 'mutant',
 'mutat',
 'not',
 'observ',
 'of',
 'on',
 'one',
 'or',
 'other',
 'our',
 'patient',
 'phosphoryl',
 'protein',
 'ras',
 'report',
 'residu',
 'resist',
 'respons',
 'result',
 'sequenc',
 'show',
 'shown',
 'signal',
 'signific',
 'structur',
 'studi',
 'suggest',
 'tabl',
 'than',
 'that',
 'the',
 'these',
 'this',
 'to',
 'tumor',
 'two',
 'typ',
 'use',
 'variant',
 'was',
 'we',
 'were',
 'which',
 'wild',
 'with']