<a id='sec0'></a>
# Text Analysis
- Importing Data
- <a href='#sec1'>Exemplary Text Analysis for Row903</a>
- <a href='#sec2'>Write function to get gene-ish words list and mutation type table</a>
- <a href='#sec3'>Compiling the entire text-ome - testing</a>
- <a href='#sec4'>Compiling the entire text-ome - full mutation table</a>
- <a href='#sec5'>Compiling the entire text-ome - full gene-like words table</a>
- <a href='#sec6'>Compiling the entire gene-ome - full gene table (not genome)</a>
- <a href='#sec7'>Convert Mutation_Types in Class file</a>
- <a href='#sec8'>Combined All!</a>
- <a href='#sec9'>Test with Random Forest</a>
- <a href='#sec10'>Test with Simple SVM</a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

sns.set_context("paper")
%matplotlib inline

<b>Importing train_text</b>

In [2]:
# Variants table: one row per ID with its Gene, Variation and Class label
class_train = pd.read_csv('train_variants')
# Text table: "ID||Text" lines. The separator is a regex, so it is written as a raw
# string -- "\|" inside a plain string literal is an invalid escape sequence.
text_train = pd.read_csv("train_text", sep=r"\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [3]:
class_train.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [4]:
text_train.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


<a id='sec1'></a>
# Exemplary Text Analysis for Row903 (<a href='#sec0'>Back To Top</a>)

In [9]:
txt1 = text_train.iloc[903, 1]

In [10]:
class_train.iloc[903, :]

ID                           903
Gene                      PDGFRA
Variation    KIF5B-PDGFRA Fusion
Class                          7
Name: 903, dtype: object

In [11]:
word_tokens = word_tokenize(txt1)
word_tokens = np.array(word_tokens)

In [12]:
print('initial leng %d' % len(word_tokens))

initial leng 6402


In [13]:
# Drop English stop words from the token array (the membership test is case-sensitive)
stop_words = set(stopwords.words('english'))
txt1_words = [token for token in word_tokens if token not in stop_words]
print('After removing stop words %d' % len(txt1_words))

After removing stop words 4632


In [14]:
df1 = pd.DataFrame(txt1_words)
df1.columns = ['tokens']
df1.head()

Unnamed: 0,tokens
0,We
1,identified
2,two
3,patients
4,(


In [15]:
gene_ish_pattern = r"[A-Z]{2,7}"

In [16]:
# get gene-ish words in a simple list
gene_ish_words1 = [word for word in txt1_words if re.match(gene_ish_pattern, word)]

In [17]:
len(gene_ish_words1)

445

In [18]:
gene_ish_words1

['BCR-ABL',
 'FIP1L1-PDGFRA',
 'STRN-PDGFRA',
 'ETV6-PDGFRA',
 'PDGFRA',
 'FIP1L1-PDGFRA',
 'CEL',
 'CEL',
 'PDGFRA',
 'PDGFRB',
 'FGFR1',
 'JAK2',
 'PDGFRA',
 'PDGFRB',
 'PDGFRA',
 'BCR',
 'FIP1L1',
 'KIF5B',
 'CDK5RAP2',
 'CEL',
 'CEL',
 'MPD',
 'MPD',
 'XY',
 'XY',
 'PCR',
 'RT-PCR',
 'PDGFRA',
 'PCR',
 'STRN-PDGFRA',
 'DNA',
 'STRN.Fusion.Ex6.1F',
 'PDGFRA.Fusion.Ex12.2R',
 'ETV6-PDGFRA',
 'ETV6.Fusion.Ex6.1F',
 'PDGFRA.Fusion.Ex12.2R',
 'MRD',
 'FISH',
 'FISH',
 'BAC',
 'PDGFRA',
 'ETV6',
 'BAC',
 'DNA',
 'UK',
 'BACs',
 'RP11-24O10',
 'PDGFRA',
 'RP11-434C1',
 'ETV6',
 'PCR',
 'PDGFRA',
 'BCR-ABL',
 'FIP1L1-PDGFRA',
 'RT-PCR',
 'FISH',
 'PDGFRA',
 'PDGFRA',
 'PDGFRA',
 'PCR',
 'STRN-PDGFRA',
 'PCR',
 'PCR',
 'PCR',
 'PDAI12-R4',
 'DNA',
 'PCR',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'DNA',
 'WW-like',
 'WW',
 'WD40',
 'WW',
 'STRN-PDGFRA',
 'STRN',
 'STRN',
 'NM_003162',
 'PDGFRA',
 'DNA',
 'DNA',
 'PDGFRA-STRN',
 'PCR',
 'PDGFRA',
 'FIP1L1-PDGFRA',
 'PDGFRA',
 'AG',
 'STRN',
 'ETV6-PD

# Some useful regex and lists

<b>Keep on adding words that are too common in 'commoners' list!</b>

In [82]:
commoners = ['RT', 'PCR', 'RT-PCR', 'DNA', 'cDNA', 'RNA', 'mRNA', 'protein', 'cell', 'cancer', 'CHIP', 'FISH']

<b>Keep words with '-positive' and '-negative'</b>

In [60]:
[word for word in txt1_words if '-positive' in word.lower()]

['FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive']

In [61]:
[word for word in txt1_words if '-negative' in word.lower()]

['FIP1L1-PDGFRA-negative', 'FIP1L1-PDGFRA-negative']

<b>Detect gene fusions</b>

In [84]:
fuse_pattern = r"[A-Z]{2,7}-[A-Z]{2,7}"
[word for word in txt1_words \
 if word not in commoners \
 if re.search(fuse_pattern, word)]

['BCR-ABL',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'BCR-ABL',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'PDGFRA-STRN',
 'PDGFRA-ETV6',
 'BCR-ABL',
 'BCR-ABL',
 'BCR-ABL',
 'BCR-ABL',
 'BCR-PDGFRA',
 'BCR-ABL',
 'PDGFRA-KIF5B']

# Method 1: Replace periods, commas, hyphens, brackets with a space, and then tokenize

In [71]:
tokens = word_tokenize(txt1)
tokens = np.array(tokens)
print('initial length %d' % len(tokens))

initial length 6402


In [72]:
# Blank out the punctuation that glues tokens together so that word_tokenize splits on it.
# Strings are immutable, so txt1 itself is never modified and no defensive copy is needed.
# NOTE: '_' and '-' are blanked as well; this should only be done after suffixes such as
# '_pos' / '-pos' have been checked.
blank_chars = '"' + ".'_-=\n()[]{}"
txt1_white = txt1.translate(str.maketrans(blank_chars, ' ' * len(blank_chars)))
txt1_white = txt1_white.replace('\\n', ' ')   # literal two-character sequence: backslash + 'n'
# Collapse runs of spaces last, once every replacement has been made
txt1_white = re.sub(' +', ' ', txt1_white)

In [73]:
tokens_white = word_tokenize(txt1_white)
tokens_white = np.array(tokens_white)
print('initial length %d' % len(tokens_white))

initial length 6146


In [74]:
tokens_white = [word for word in tokens_white if not word in stop_words]
tokens_white = [word for word in tokens_white if not word in commoners]
print('After removing stop-words & commoners %d' % len(tokens_white))

After removing stop-words & commoners 4223


In [75]:
# Remove nucleotide sequences
# Remove nucleotide sequences
# NOTE(review): re.search matches anywhere in the token, so every token that contains
# four or more consecutive letters from A/C/T/G/U is dropped, not only pure sequences
# -- e.g. a token such as 'GATA3' would be removed too; confirm this is intended.
nts = r"[ACTGU]{4,}"
tokens_white = [word for word in tokens_white if not re.search(nts, word)]
print('After removing DNA/RNA seqs %d' % len(tokens_white))

After removing DNA/RNA seqs 4203


This time, switch to `re.search` to pick up the pattern anywhere in a word, not just at its beginning.

In [76]:
gene_ish_words_white = [word for word in tokens_white if re.search(gene_ish_pattern, word)]
print('# of gene-ish words: %d' % len(gene_ish_words_white))

# of gene-ish words: 441


In [77]:
gene_ish_words_white

['BCR',
 'ABL',
 'FIP1L1',
 'PDGFRA',
 'STRN',
 'PDGFRA',
 'ETV6',
 'PDGFRA',
 'PDGFRA',
 'FIP1L1',
 'PDGFRA',
 'CEL',
 'CEL',
 'PDGFRA',
 'PDGFRB',
 'FGFR1',
 'JAK2',
 'PDGFRA',
 'PDGFRB',
 'PDGFRA',
 'BCR',
 'FIP1L1',
 'KIF5B',
 'CDK5RAP2',
 'CEL',
 'CEL',
 'MPD',
 'MPD',
 'XY',
 'XY',
 'PDGFRA',
 'STRN',
 'PDGFRA',
 'STRN',
 'PDGFRA',
 'ETV6',
 'PDGFRA',
 'ETV6',
 'PDGFRA',
 'MRD',
 'BAC',
 'PDGFRA',
 'ETV6',
 'BAC',
 'UK',
 'BACs',
 'RP11',
 'PDGFRA',
 'RP11',
 'ETV6',
 'PDGFRA',
 'BCR',
 'ABL',
 'FIP1L1',
 'PDGFRA',
 'PDGFRA',
 'PDGFRA',
 'PDGFRA',
 'STRN',
 'PDGFRA',
 'HaeIII',
 'PDAI12',
 'HaeIII',
 'STRN',
 'PDGFRA',
 'STRN',
 'PDGFRA',
 'WW',
 'WW',
 'WD40',
 'WW',
 'ΔWW',
 'STRN',
 'PDGFRA',
 'STRN',
 'STRN',
 'NM',
 'PDGFRA',
 'PDGFRA',
 'STRN',
 'PDGFRA',
 'FIP1L1',
 'PDGFRA',
 'PDGFRA',
 'AG',
 'STRN',
 'ETV6',
 'PDGFRA',
 'ETV6',
 'NM',
 'PDGFRA',
 'PDGFRA',
 'ETV6',
 'ETV6',
 'PDGFRA',
 'ETV6',
 'PDGFRA',
 'ETV6',
 'PDGFRA',
 'BACs',
 'RP11',
 'RP11',
 'ETV6',
 'PDGFRA',

In [None]:
# Do the same with pd.DF
gene_ish_words = df1[df1['tokens'].str.match(gene_ish_pattern)]
print(len(gene_ish_words))

In [None]:
gene_table = gene_ish_words.groupby('tokens').size().reset_index()
gene_table.columns = ['tokens', 'appearances']

In [None]:
gene_table.sort_values('appearances', ascending=False).head(15)

In [None]:
mutation_patterns = ['Truncation', 'Deletion', 'Promoter','Amplification', 'Epigenetic', 'Frame', 'Overexpression',
                     'Duplication', 'Insertion','Subtype', 'Fusion', 'Splice', 'Wildtype']

In [None]:
# One row per mutation keyword. Pass the list itself as the index: wrapping it in
# another list ([mutation_patterns]) makes pandas build a one-level MultiIndex instead.
mutation_table = pd.DataFrame(index=mutation_patterns)
mutation_table['appearances'] = 0

In [None]:
for pattern in mutation_patterns:
    appearance = len(df1[df1['tokens'].str.contains(pattern, case=False)])
    mutation_table.loc[pattern, 'appearances'] = appearance

In [None]:
mutation_table

<a id='sec2'></a>
# Write function to get gene-ish words list and mutation type table (<a href='#sec0'>Back To Top</a>)

In [None]:
def process_text1(text, print_on=False):
    '''
    Tokenize a text and strip punctuation tokens, numbers and English stop words.

    INPUT:
    ======
    text : str
        The raw text to be analyzed
    print_on : bool
        If True, print the token count before and after stop-word removal

    OUTPUT:
    =======
    words : list
        The remaining tokens, in their original order

    '''
    # Punctuation tokens to drop (the hyphen is deliberately not listed)
    punctuation = ['.', ',', '(', ')', '[', ']', '=', '+', '>', '<', ':', ';', '%']

    # Tokenize, then drop punctuation and purely numeric tokens in a single pass
    kept_tokens = [
        token for token in word_tokenize(text)
        if token not in punctuation and not token.isnumeric()
    ]

    # Stop-word matching is case-sensitive
    english_stop_words = set(stopwords.words('english'))
    words = [token for token in kept_tokens if token not in english_stop_words]

    # Optional report of how many tokens the stop-word filter removed
    if print_on:
        print('Length Before removing stop words %d' % len(kept_tokens))
        print('Length After removing stop words %d' % len(words))

    return words

In [None]:
# Check if it works
txt2 = process_text1(txt1)
txt2

In [None]:
def get_gene_like_words(tokenized_text, gene_list=None):
    '''
    Get gene-name-like words from a list of tokenized words

    A word is gene-like when it starts with at least two upper-case letters.
    When gene_list is given, every gene-like word that contains one of the
    listed genes is replaced by that gene; if several listed genes occur in
    the same word, the longest one wins (ties: the one listed first). This
    keeps a short symbol such as 'AR' from overwriting a longer gene that
    contains it, such as 'ARAF'.

    INPUT:
    ======
    tokenized_text : list
        A list of tokenized words
    gene_list : list of str, optional
        Known gene names used to normalise the gene-like words.
        Empty strings are ignored.

    OUTPUT:
    =======
    gene_like_words : list
        A list of gene name like words in the tokenized list, in their
        original order (one entry per gene-like word)
    '''
    gene_ish_pattern = r"[A-Z]{2,7}"
    # re.match anchors at the start of the word only
    gene_like_words = [word for word in tokenized_text if re.match(gene_ish_pattern, word)]

    if gene_list is None:
        return gene_like_words

    # Longest genes first; sorted() is stable, so equally long genes keep gene_list order
    genes_by_length = sorted((gene for gene in gene_list if gene), key=len, reverse=True)

    # Resolve each distinct word once -- texts repeat the same words many times
    resolved = {}
    for word in set(gene_like_words):
        resolved[word] = next((gene for gene in genes_by_length if gene in word), word)

    return [resolved[word] for word in gene_like_words]

In [None]:
glike_words = get_gene_like_words(txt2)
glike_words

In [None]:
def create_mutation_words_table(tokenized_text, normed=False):
    '''
    Create table for words to describe the mutation types from a list of
    tokenized words

    For every mutation keyword, count the tokens that contain it
    (case-insensitive substring match).

    INPUT:
    ======
    tokenized_text : list
        a list of tokenized words
    normed : False, 'mutation_types' or 'total_text'
        False (default)  : raw counts, plus a 'Total' entry with their sum
        'mutation_types' : counts divided by the sum of all keyword counts
        'total_text'     : counts divided by the number of tokens
        In both normalised modes the counts are left as zeros when the
        denominator is zero. Any other value triggers a warning and falls
        back to raw counts.

    OUTPUT:
    =======
    table : dict
        Maps each mutation keyword to its (optionally normalised) count
    '''
    import warnings  # local import: only needed for the fallback warning below

    # List of words for mutation types
    mutation_patterns = ['truncation', 'deletion', 'promoter','amplification', 'epigenetic', 'frame', 'overexpression',
                     'duplication', 'insertion','subtype', 'fusion', 'splice', 'wildtype']

    # Lower-case every token once instead of once per keyword
    lowered = [word.lower() for word in tokenized_text]
    appearances = [sum(pattern in word for word in lowered) for pattern in mutation_patterns]

    if normed == 'mutation_types':
        appearances = np.array(appearances)
        total = np.sum(appearances)
        if total != 0:
            appearances = appearances / total
        return dict(zip(mutation_patterns, appearances))

    if normed == 'total_text':
        appearances = np.array(appearances)
        # Same zero guard as above: an empty text has no tokens to divide by
        if len(tokenized_text) != 0:
            appearances = appearances / len(tokenized_text)
        return dict(zip(mutation_patterns, appearances))

    if normed not in (False, None):
        # A misspelled mode would otherwise silently yield raw counts
        warnings.warn("Unrecognised normed=%r; expected False, 'mutation_types' or "
                      "'total_text'. Returning raw counts." % (normed,), stacklevel=2)

    table = dict(zip(mutation_patterns, appearances))
    table['Total'] = np.sum(appearances)
    return table

In [None]:
create_mutation_words_table(txt2, normed='mutation_types')

<a id='sec3'></a>
# Compiling the entire text-ome - testing (<a href='#sec0'>Back To Top</a>)

In [None]:
txt3 = text_train.iloc[150, 1]

In [None]:
class_train.iloc[150, :]

In [None]:
txt3

In [None]:
textome1 = txt1 + ' ' + txt3

In [None]:
tokens1 = process_text1(txt1)
tokens2 = process_text1(txt3)
tokens_agg = process_text1(textome1)

<b>Create Mutation Table</b>

In [None]:
mut_table1 = create_mutation_words_table(tokens1)
mut_table2 = create_mutation_words_table(tokens2)
mut_table_agg = create_mutation_words_table(tokens_agg)
mut_table = pd.DataFrame([mut_table1, mut_table2, mut_table_agg])

In [None]:
mut_table

In [None]:
# `normed` takes a mode name; normed=True is not a recognised mode and yields raw counts
mut_table1 = create_mutation_words_table(tokens1, normed='mutation_types')
mut_table2 = create_mutation_words_table(tokens2, normed='mutation_types')
mut_table = pd.DataFrame([mut_table1, mut_table2])

In [None]:
mut_table

<b>Create a sparse matrix for gene-ish words space</b>

In [None]:
genes = list(class_train['Gene'].unique())

In [None]:
glike_words1 = get_gene_like_words(tokens1, gene_list=genes)
glike_words2 = get_gene_like_words(tokens2, gene_list=genes)
glike_words_agg = get_gene_like_words(tokens_agg, gene_list=genes)

In [None]:
from collections import Counter

In [None]:
c1 = dict(Counter(glike_words1))
c2 = dict(Counter(glike_words2))

In [None]:
gene_table = pd.DataFrame()

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
gene_table = pd.concat([gene_table, pd.DataFrame([c1])], ignore_index=True)

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
gene_table = pd.concat([gene_table, pd.DataFrame([c2])], ignore_index=True)

In [None]:
gene_table

<a id='sec4'></a>
# Compiling the entire text-ome - full mutation table (<a href='#sec0'>Back To Top</a>)

In [None]:
text_train.head()

Create a whole list of dictionaries first and then convert to DF

In [None]:
%%time
# Build one keyword-count dict per text; they are converted to a DataFrame in one go afterwards
mut_words_list = []
for text in text_train['Text']:
    tokens = process_text1(text)
    # Mode name spelled out correctly: a misspelled mode falls back to raw counts
    mut_words = create_mutation_words_table(tokens, normed='mutation_types')
    mut_words_list.append(mut_words)

In [None]:
full_mutation_table = pd.DataFrame(mut_words_list)

In [None]:
full_mutation_table

Alternative: start from an empty DataFrame and append one new row per text.

In [None]:
full_mutation_table2 = pd.DataFrame()

In [None]:
%%time
# Row-by-row growth, kept only for comparison with the list-of-dicts approach
for i in range(len(text_train)):
    text = text_train.loc[i, 'Text']
    tokens = process_text1(text)
    mut_words = create_mutation_words_table(tokens, normed='mutation_types')
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
    full_mutation_table2 = pd.concat([full_mutation_table2, pd.DataFrame([mut_words])], ignore_index=True)

In [None]:
full_mutation_table2

In [None]:
full_mutation_table.equals(full_mutation_table2)

The two methods gave identical results and were equally fast. Since the CPU seemed to heat up more with the row-by-row approach, I'll use the list-of-dictionaries method.

<a id='sec5'></a>
# Compiling the entire text-ome - full gene-like words table (<a href='#sec0'>Back To Top</a>)

In [None]:
text_train.head()

In [None]:
genes = list(class_train['Gene'].unique())

In [None]:
%%time
glike_words_list = []
for i in range(len(text_train)):
    text = text_train.loc[i, 'Text']
    tokens = process_text1(text)
    glike_words = get_gene_like_words(tokens, gene_list=genes)
    c = dict(Counter(glike_words))
    glike_words_list.append(c)

In [None]:
glike_words_table = pd.DataFrame(glike_words_list)

In [None]:
glike_words_table

<a id='sec6'></a>
# Compiling the entire gene-ome - full gene table (not genome)(<a href='#sec0'>Back To Top</a>)
- This is NOT the gene-like words from the text
- This shows which gene is annotated for each ID in the 'variants' file

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Integer-encode the gene of every row, then expand the codes into a dense indicator matrix
X_gene = np.array(class_train.Gene)
X_gene_int = LabelEncoder().fit_transform(X_gene.ravel()).reshape(-1, 1)  # reshaped to a single column
X_gene_bin = OneHotEncoder().fit_transform(X_gene_int).toarray()  # .toarray() converts the encoder output to a dense array

In [None]:
X_gene_int

In [None]:
full_gene_table = pd.DataFrame(X_gene_bin)

In [None]:
full_gene_table

In [None]:
full_gene_table.loc[:, 39].head(10)

<a id='sec7'></a>
# Convert Mutation_Types in Class file (<a href='#sec0'>Back To Top</a>)
- Define convert_mutation_type
- Use the label encoding to make it a sparse matrix

In [None]:
def convert_mutation_type(data):
    '''
    Derive a coarse 'mutation_type' from the 'Variation' column and return a
    new DataFrame containing it. The input DataFrame is not modified.

    Input
    =====
    data : DataFrame
        The train or test data containing Variant information. Must have the
        columns 'ID', 'Gene' and 'Variation'; 'Class' is optional.

    Output
    ======
    data : DataFrame
        A new DataFrame with the columns 'ID', 'Gene', 'Variation',
        'mutation_type' (and 'Class' when present in the input). Variations
        that match no rule and are not already one of the major types
        (including missing values) are labelled 'Others'.
    '''
    # Work on a copy so the caller's DataFrame does not silently gain a column
    data = data.copy()
    variation = data['Variation']

    # Start from the raw Variation text; the rules below overwrite it where they match
    data['mutation_type'] = variation

    # Define regex pattern for point mutants
    point_mutation_pattern = \
        r"[ARNDCEQGHILKMFPSTWYV]{1}[0-9]{1,4}[ARNDCEQGHILKMFPSTWYV*]?$"

    # Define new mutation types
    major_types = ['Truncation', 'Point Mutation', 'Deletion', 'Promoter Mutations',
       'Amplification', 'Epigenetic', 'Frame Shift', 'Overexpression',
       'Deletion-Insertion', 'Duplication', 'Insertion',
       'Gene Subtype', 'Fusion', 'Splice', 'Copy Number Loss', 'Wildtype']

    def has(pattern, case=False):
        # Boolean mask of variations containing `pattern`; missing values count as no match
        return variation.str.contains(pattern, case=case, na=False)

    is_delins = has('delins')

    # Convert the Variant information to mutation types.
    # Rules are applied in order, so a later rule overrides an earlier one when both match.
    rules = [
        (variation.str.match(point_mutation_pattern, na=False), 'Point Mutation'),
        (has('missense'), 'Point Mutation'),
        (has('fusion'), 'Fusion'),
        (has('deletion'), 'Deletion'),
        (has('del') & ~is_delins, 'Deletion'),
        (has('ins') & ~is_delins, 'Insertion'),
        (has('del') & is_delins, 'Deletion-Insertion'),
        (has('dup'), 'Duplication'),
        (has('trunc'), 'Truncation'),
        (has('fs'), 'Frame Shift'),
        (has('splice'), 'Splice'),
        (has('exon'), 'Point Mutation'),
        (has('EGFR')
         | has('AR', case=True)
         | has('MYC-nick', case=True)
         | has('TGFBR1', case=True)
         | has('CASP8L', case=True), 'Gene Subtype'),
        (has('Hypermethylation') | has('Epigenetic'), 'Epigenetic'),
    ]
    for mask, label in rules:
        data.loc[mask, 'mutation_type'] = label

    # Whatever is still not one of the major types becomes 'Others'
    data.loc[~data['mutation_type'].isin(major_types), 'mutation_type'] = 'Others'

    # rearrange order of columns
    if 'Class' in data.columns:
        data = data[['ID', 'Gene', 'Variation', 'mutation_type', 'Class']]
    else:
        data = data[['ID', 'Gene', 'Variation', 'mutation_type']]

    return data

In [None]:
new_table = convert_mutation_type(class_train)

In [None]:
X_mtype = np.array(new_table['mutation_type'])
X_mtype_int = LabelEncoder().fit_transform(X_mtype.ravel()).reshape(-1, 1)
X_mtype_bin = OneHotEncoder().fit_transform(X_mtype_int).toarray()

In [None]:
X_mtype_int

In [None]:
full_mtype_table = pd.DataFrame(X_mtype_bin)

In [None]:
full_mtype_table

<a id='sec8'></a>
# Combined All! (<a href='#sec0'>Back To Top</a>)

In [None]:
full_mutation_table = full_mutation_table.fillna(value=0)
glike_words_table = glike_words_table.fillna(value=0)
full_gene_table = full_gene_table.fillna(value=0)
full_mtype_table = full_mtype_table.fillna(value=0)

In [None]:
features = pd.concat([full_mutation_table, 
                      glike_words_table,
                      full_gene_table,
                      full_mtype_table],
                      axis=1)

In [None]:
features.shape

In [None]:
class_train.Class.shape

<a id='sec9'></a>
# Test with Random Forest (<a href='#sec0'>Back To Top</a>)

In [None]:
X = np.array(features).astype(float)
y = np.array(class_train.Class).astype(int).ravel()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [None]:
# Fixed random_state so the split (and the accuracy reported below) is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
%%time
# Fixed random_state so the fitted forest is the same on every run
rfc = RandomForestClassifier(n_estimators=50, max_depth=30, random_state=42)
rfc.fit(X_train, y_train)

In [None]:
y_pred = rfc.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred))

<a id='sec10'></a>
# Test with Simple SVM (<a href='#sec0'>Back To Top</a>)

In [None]:
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC

In [None]:
X_scale = scale(X)

In [None]:
# Fixed random_state so the split (and the accuracy reported below) is reproducible across runs
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_scale, y, test_size=0.2, random_state=42)

In [None]:
%%time
# Fixed random_state so the fitted model is the same on every run
clf = LinearSVC(random_state=42)
clf.fit(X_train2, y_train2)

In [None]:
y_pred2 = clf.predict(X_test2)

In [None]:
print(accuracy_score(y_test2, y_pred2))