In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 

from collections import Counter

**Functions**

In [13]:
def classes_plot(targets, title):
    class_count = targets.value_counts()
    sns.barplot(class_count.index, class_count.values)
    plt.title(title)
    plt.ylabel('Number of Occurrences')
    plt.xlabel('Class', fontsize=12)

In [None]:
def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))

In [None]:
def confusion_matrix_plot(class_labels, range1=6, range2=6):
    df_cm = pd.DataFrame(cm, range(range1), range(range2))
    plt.figure(figsize = (12,10))
    sns.set(font_scale=1.4)#for label size
    sns.heatmap(df_cm, annot=True, annot_kws={"size": 12}, fmt='g', xticklabels = class_labels, yticklabels = class_labels)
    plt.xlabel("Predicted Class")
    plt.ylabel("True Class")
    plt.title("Confusion Matrix")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
def gini_plot(class_name, words, values):
    plt.barh(words, values)
    plt.title('Gini Impurity ' + class_name)

In [None]:
def train_split_for_class_compare(class1, class2):
    find_list = [class1, class2]
    idx_list = []
    new_y_train = []
    for idx, num in enumerate(y_train_list):
      if num in find_list:
        idx_list.append(idx)
        new_y_train.append(num)

    new_X_train = []
    for idx, char in enumerate(X_train):
      if idx in idx_list:
        new_X_train.append(char)
        
    return new_X_train, new_y_train

In [None]:
def test_split_for_class_compare(class1, class2):
    find_list = [class1, class2]
    idx_list = []
    new_y_test = []
    for idx, num in enumerate(y_test_list):
      if num in find_list:
        idx_list.append(idx)
        new_y_test.append(num)

    new_X_test = []
    for idx, char in enumerate(X_test):
      if idx in idx_list:
        new_X_test.append(char)
        
    return new_X_test, new_y_test

In [None]:
def split_vectorize(text, target):
    X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=.3, random_state=88, stratify=target)
    vectorizer = CountVectorizer(min_df=.1, max_df=.9, stop_words=stops)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test, vectorizer 

In [None]:
def rf_feature_importance():
    cols = np.array(vectorizer.get_feature_names())
    sorted_idx = rf.feature_importances_.argsort()[-10:]
    words = cols[sorted_idx]
    values = rf.feature_importances_[sorted_idx]
    return words, values

**Useful lists**

In [None]:
# custom stop words
stops = ['figure', 'fig', 'al', 'et', 
         '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999',
         '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
         '2010', '2011', '2012', '2013', '2014', '2017',
         '1a', '1b', '1c', '1d', '1e', '1f',
         '2a', '2b', '2c', '2d', '2e', '2f',
         '3a', '3b', '3c', '3d', '3e', '3f',
         '4a', '4b', '4c', '4d', '4e', '4f',
         '5a', '5b', '5c', '5d',
         '6a', '6b', '6c', '6d',
         '7a', '7b', '7c',
         'wang', 'zhang']

In [None]:
# classes lists
classes_9 = ['Likely LOF', 'Likely GOF', 'Neutral', 'LOF', 'Likely Neutral', 'Inconclusive', 'GOF', 'Likely COF', 'COF']
classes_7 = ['Likely LOF', 'Likely GOF', 'Neutral', 'LOF', 'Likely Neutral', 'Inconclusive', 'GOF']
classes_6 = ['Likely LOF', 'Likely GOF', 'LOF', 'Likely Neutral', 'Inconclusive', 'GOF']

**Read in target data**

In [2]:
data = pd.read_csv("data/training_variants.csv",index_col='ID')

In [15]:
data.head()

Unnamed: 0_level_0,Gene,Variation,Class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,FAM58A,Truncating Mutations,1
1,CBL,W802*,2
2,CBL,Q249E,2
3,CBL,N454D,3
4,CBL,L399V,4


In [4]:
print(data.tail()

Unnamed: 0_level_0,Gene,Variation,Class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3316,RUNX1,D171N,4
3317,RUNX1,A122*,1
3318,RUNX1,Fusions,1
3319,RUNX1,R80C,4
3320,RUNX1,K83E,4


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3321 entries, 0 to 3320
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Gene       3321 non-null   object
 1   Variation  3321 non-null   object
 2   Class      3321 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 103.8+ KB


**Read in text data**

In [7]:
text = pd.read_csv('data/training_text.csv', sep = '\|\|', engine = 'python', names = ['ID','Text'], skiprows = 1, index_col = 'ID')

In [8]:
text.head()

Unnamed: 0_level_0,Text
ID,Unnamed: 1_level_1
0,Cyclin-dependent kinases (CDKs) regulate a var...
1,Abstract Background Non-small cell lung canc...
2,Abstract Background Non-small cell lung canc...
3,Recent evidence has demonstrated that acquired...
4,Oncogenic mutations in the monomeric Casitas B...


In [9]:
text.tail()

Unnamed: 0_level_0,Text
ID,Unnamed: 1_level_1
3316,Introduction Myelodysplastic syndromes (MDS) ...
3317,Introduction Myelodysplastic syndromes (MDS) ...
3318,The Runt-related transcription factor 1 gene (...
3319,The RUNX1/AML1 gene is the most frequent targe...
3320,The most frequent mutations associated with le...


In [10]:
text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3321 entries, 0 to 3320
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    3316 non-null   object
dtypes: object(1)
memory usage: 51.9+ KB


**Pre-process data**

In [12]:
# combine data sets
merged = data.merge(text, how='left', on='ID')

# drop where text is missing
merged = merged.dropna(how='any', subset=['Text'])

# drop classes 3, 8 & 9 - per Dan and Juliana
merged = merged[merged.Class != 8]
merged = merged[merged.Class != 9]
merged = merged[merged.Class != 3]

# add dummy variables to data frame for each existing class
for i in range(1,10):
    class_name = 'Class' + '_' + str(i) 
    merged[class_name] = np.where(merged['Class'] == i, 1, 0)

drop if all values in a column are 0 (classes 3,8,9)
merged = merged.loc[:, (merged != 0).any(axis=0)]    

In [16]:
merged.head()

Unnamed: 0_level_0,Gene,Variation,Class,Text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [17]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3321 entries, 0 to 3320
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Gene       3321 non-null   object
 1   Variation  3321 non-null   object
 2   Class      3321 non-null   int64 
 3   Text       3316 non-null   object
dtypes: int64(1), object(3)
memory usage: 129.7+ KB


In [18]:
# put the text variable into a list
text_list = merged.Text.tolist()
len(text_list)

3321

Create target variables

In [19]:
# overall target variable
y = merged['Class']
type(y)

pandas.core.series.Series

In [20]:
y_1 = merged['Class_1']
y_2 = merged['Class_2']
y_4 = merged['Class_4']
y_5 = merged['Class_5']
y_6 = merged['Class_6']
y_7 = merged['Class_7']

KeyError: 'Class_1'