### IMPORT LIBRARIES AND DATA

In [1]:
import random
import time
import warnings
import zipfile
from collections import Counter
from itertools import permutations, combinations, combinations_with_replacement
from itertools import product

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import chi2, SelectKBest, f_classif, mutual_info_classif
from sklearn.metrics import accuracy_score, f1_score, make_scorer

from scipy.stats import spearmanr, kruskal, ks_2samp
from scipy.cluster import hierarchy
from sklearn.inspection import permutation_importance

from Bio.Seq import Seq

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [2]:
# Forward slashes are accepted on every OS; the original '\\' separators only resolve on Windows.
train = pd.read_csv('data/train_values.csv', index_col= 'sequence_id')
labels = pd.read_csv('data/train_labels.csv', index_col= 'sequence_id')

In [3]:
# Position of the largest value in each row of the label matrix, stored under the column 'num'.
sparse_labels = pd.DataFrame({'num': labels.values.argmax(axis = 1)}, index= labels.index)

In [4]:
# Column position -> label name, in the column order of `labels`.
labels_dict = dict(enumerate(labels.columns))

In [5]:
# Translate each integer position in 'num' back to its label name via labels_dict.
sparse_labels['cat'] = sparse_labels['num'].apply(lambda x: labels_dict[x])

In [6]:
# Attach the label name to the feature frame (both frames are indexed by sequence_id).
train['target'] = sparse_labels['cat']

In [7]:
# Target vector: the label names as a plain array.
y = sparse_labels['cat'].values

### FUNCTIONS

In [8]:
def dna_sequence(data, n):
    """Count frame-0, non-overlapping n-mers in every sequence of ``data``.

    Each sequence is cut into consecutive chunks of length ``n`` starting at
    position 0; a trailing remainder shorter than ``n`` is discarded. The
    chunks are then counted per sequence.

    Args:
        data (pandas.DataFrame): frame with a 'sequence' column of strings.
        n (int): chunk length, must be >= 1.

    Returns:
        pandas.DataFrame: one row per sequence (labelled 0..len(data)-1 in
        iteration order) and one float column per n-mer over the alphabet
        A/C/G/N/T, in sorted order. n-mers that do not occur in a sequence
        are 0 (not NaN); chunks containing any other character are ignored.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    # product() enumerates each n-mer exactly once; sorting makes the column
    # order reproducible from run to run.
    kmers = sorted(''.join(p) for p in product('ACGNT', repeat=n))

    rows = []
    for dna in tqdm(data['sequence']):
        # Last index that still starts a full chunk. No extra "- 1" here:
        # with n == 1 that would silently drop the final base.
        usable = len(dna) - (len(dna) % n)
        counts = Counter(dna[i: i + n] for i in range(0, usable, n))
        rows.append([counts.get(kmer, 0) for kmer in kmers])

    # Build the frame once instead of inserting one column per sequence.
    return pd.DataFrame(rows, columns=kmers, dtype=float)

In [11]:
n2 = dna_sequence(train, 2)

100%|███████████████████████████████████████████████████████████████████████████| 63017/63017 [06:36<00:00, 158.75it/s]


In [12]:
n3 = dna_sequence(train, 3)

100%|███████████████████████████████████████████████████████████████████████████| 63017/63017 [06:24<00:00, 163.70it/s]


In [13]:
n4 = dna_sequence(train, 4)

100%|███████████████████████████████████████████████████████████████████████████| 63017/63017 [08:09<00:00, 128.63it/s]


In [83]:
# Put the 2-mer and 3-mer count tables side by side.
# dna_sequence() labels its rows 0..N-1, so the sequence_id index is restored from `train`.
ngram_seq = pd.concat([n2, n3], axis = 1)
ngram_seq.index = train.index

In [16]:
def swap_dna(dnastring):
    """Translate a DNA string into one-letter amino-acid codes.

    The string is read in frame 0, three bases at a time; a trailing partial
    codon is ignored. Stop codons become '*', and any triplet that is not one
    of the 64 upper-case A/C/G/T codons becomes 'X'.

    Args:
        dnastring (str): nucleotide sequence.

    Returns:
        str: one character per complete codon.
    """
    # Standard genetic code, laid out in T, C, A, G order for each codon position.
    bases = 'TCAG'
    residues = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
    triplets = [first + second + third
                for first in bases for second in bases for third in bases]
    codon_to_residue = dict(zip(triplets, residues))

    # Start offsets of every complete codon.
    complete = len(dnastring) - (len(dnastring) % 3)
    return "".join(
        codon_to_residue.get(dnastring[start:start + 3], "X")
        for start in range(0, complete, 3)
    )

In [17]:
def top10_accuracy_scorer(estimator, X, y):
    """A custom scorer that evaluates a model on whether the correct label is in
    the top 10 most probable predictions.

    Args:
        estimator (sklearn estimator): The fitted model to evaluate; must
            expose ``predict_proba`` and ``classes_``.
        X (numpy array): The validation data.
        y (array-like): The ground truth labels (ndarray, list or Series).

    Returns:
        float: Proportion of rows whose true label is among the 10 most
               probable classes. When the model knows fewer than 10 classes,
               all of them count as "top". Higher is better.
    """
    # predict the probabilities across all possible labels for the given rows
    probas = estimator.predict_proba(X)

    # argpartition raises if asked for more elements than there are classes
    top_k = min(10, probas.shape[1])

    # Indices of the top_k most probable classes per row (the last top_k after
    # partitioning). argpartition is O(n) and we do not need them sorted.
    # Documentation: https://numpy.org/doc/1.18/reference/generated/numpy.argpartition.html
    top_idx = np.argpartition(probas, -top_k, axis=1)[:, -top_k:]

    # index into the classes list using those indices to get the class names
    top_preds = np.asarray(estimator.classes_)[top_idx]

    # asarray lets callers pass a list or a pandas Series, not only an ndarray
    y_true = np.asarray(y).reshape(-1, 1)

    # a row is a hit when any of its top predictions equals the true label
    return (top_preds == y_true).any(axis=1).mean()

In [18]:
def details(feature):
    """Print the number of all-zero rows and plot two per-column summaries.

    For every column the grouped bar chart shows the column sum as a
    percentage of the row count, and the column variance as a percentage of
    the summed variances. Both are rounded to two decimals and written above
    their bars.

    Args:
        feature (pandas.DataFrame): numeric columns to summarise.
    """
    labels = feature.columns
    positions = np.arange(len(labels))
    bar_width = 0.40

    share = 100 * feature.sum(axis = 0) / len(feature)
    variances = feature.var()
    percentage = list(round(share, 2))
    variation = list(round(100 * variances / variances.sum(), 2))

    # Rows in which every column equals 0.
    untouched = feature[(feature == 0).all(axis = 1)]
    print(f'{len(untouched)} values are in none of the categories')

    fig, ax = plt.subplots(figsize = (20,8))
    rect1 = ax.bar(positions - bar_width/2, percentage, bar_width, label = 'percentage')
    rect2 = ax.bar(positions + bar_width/2, variation, bar_width, label = 'variation')
    ax.set_ylabel('%')
    ax.set_title('Percentage and Variation')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation = 90)
    ax.legend()

    # Write each bar's value just above it (3 points vertical offset).
    for bars in (rect1, rect2):
        for bar in bars:
            top = bar.get_height()
            ax.annotate('{}'.format(top),
                        xy=(bar.get_x() + bar.get_width() / 2, top),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')

    plt.show()

In [19]:
def get_ngram_features(n, data, replacement = True):
    """Count occurrences of every length-``n`` string over A/C/G/N/T per sequence.

    Counting uses ``Series.str.count``, i.e. non-overlapping matches anywhere
    in the sequence (no reading frame).

    Args:
        n (int): n-gram length.
        data (pandas.DataFrame): frame with a 'sequence' column of strings.
        replacement (bool): when exactly ``False`` only n-grams without a
            repeated letter are generated; otherwise all 5**n n-grams.

    Returns:
        pandas.DataFrame: indexed like ``data``, one column per n-gram in
        sorted order (empty when no n-gram exists, e.g. n > 5 without
        repetition).
    """
    dna = ['A', 'C', 'G', 'N', 'T']

    if replacement == False:
        grams = permutations(dna, n)
    else:
        # product() yields each n-gram once; permutations(dna * n, n) built the
        # same set by enumerating a far larger number of duplicates.
        grams = product(dna, repeat=n)

    # Sorted so the column order is stable between runs and between train/test.
    ngrams = sorted(''.join(g) for g in grams)

    columns = [data['sequence'].str.count(gram).rename(gram) for gram in tqdm(ngrams)]
    if not columns:
        return pd.DataFrame(index = data.index)

    # One concat instead of thousands of single-column inserts.
    return pd.concat(columns, axis = 1)

In [20]:
def make_none_column(data):
    """Add a '<name>_none' indicator for rows where every column equals 0.

    The new column is named after ``data.name`` and holds 1.0 for rows in
    which all other columns are 0, else 0.0. ``data`` is modified in place
    and also returned.

    Args:
        data (pandas.DataFrame): frame carrying a ``name`` attribute.

    Returns:
        pandas.DataFrame: the same ``data`` object with the extra column.
    """
    new_column = f'{data.name}_none'

    # Leave out an indicator from an earlier call, so re-running this is a
    # no-op instead of resetting the column to all zeros.
    source = data.drop(columns = new_column, errors = 'ignore')

    # Row-wise mask, assigned by position: one vectorised step, and rows that
    # share an index label no longer flag each other.
    data[new_column] = (source == 0).all(axis = 1).to_numpy(dtype = float)

    return data

In [21]:
def get_ones(x):
    """Cap a value at 1: anything greater than 1 becomes 1, everything else is returned unchanged."""
    return 1 if x > 1 else x

In [22]:
def multiple_columns(data):
    """Collapse rows with two categories set into one '<name>_multi_2' flag.

    A row qualifies when, for at least one pair of columns, the two values
    sum to exactly 2. Qualifying rows have all their original columns set to
    0 and receive 1.0 in the new '<name>_multi_2' column. If no row
    qualifies the column is not added.

    Args:
        data (pandas.DataFrame): indicator columns, with a ``name`` attribute
            used as the prefix of the new column. Not modified.

    Returns:
        pandas.DataFrame: a new frame with the original columns (qualifying
        rows zeroed) and, when any row qualified, the '<name>_multi_2' column
        appended last.
    """
    multi_name = f'{data.name}_multi_2'
    result = data.copy()

    # Positional row mask: vectorised, and rows that share an index label
    # cannot flag each other the way label-based .loc writes did.
    multi_hit = np.zeros(len(data), dtype = bool)
    for pair in combinations(data.columns, 2):
        multi_hit |= (data[list(pair)].sum(axis = 1) == 2).to_numpy()

    if not multi_hit.any():
        return result

    result.loc[multi_hit, :] = float(0)
    result[multi_name] = multi_hit.astype('float64')

    return result

In [67]:
def distribution_info(data):
    """Pairwise two-sample Kolmogorov-Smirnov p-values between columns.

    Args:
        data (pandas.DataFrame): numeric columns to compare.

    Returns:
        pandas.DataFrame: square frame indexed by column name on both axes.
        Cell (a, b) holds the KS p-value rounded to 3 decimals for every pair
        where a comes before b in column order; all other cells stay NaN.
    """
    names = data.columns
    n_cols = len(names)
    p_values = pd.DataFrame(index= names, columns= names)

    print(f'Starting {(n_cols * (n_cols - 1))/2} iterations')

    for left, right in tqdm(combinations(names, 2)):
        outcome = ks_2samp(data[left].values, data[right].values)
        # element 1 of the result is the p-value
        p_values.loc[left, right] = round(outcome[1], 3)

    return p_values

In [60]:
def similar_dist(p_value, distribution_data):
    """Collect the column pairs whose stored value exceeds ``p_value``.

    Args:
        p_value (float): threshold; only cells strictly greater are kept.
        distribution_data (pandas.DataFrame): square frame such as the one
            returned by distribution_info().

    Returns:
        dict: ``{(row_name, column_name): value}`` for every qualifying cell,
        in row-major order. Names are taken from the frame's columns.
    """
    labels = distribution_data.columns
    size = len(labels)
    # True where the cell survives the threshold (NaN cells never do).
    above = distribution_data[distribution_data > p_value].notna().values

    return {
        (labels[r], labels[c]): distribution_data.iloc[r, c]
        for r in range(size)
        for c in range(size)
        if above[r, c]
    }

In [25]:
def remove_similar_dict(p_value, distribution_data, stats_data):
    """Pick, from every pair of similarly distributed columns, the one to drop.

    Pairs come from similar_dist(p_value, distribution_data); within each
    pair the column with the lower value in ``stats_data['f']`` is selected.

    Args:
        p_value (float): threshold forwarded to similar_dist().
        distribution_data (pandas.DataFrame): pairwise p-value frame.
        stats_data (pandas.DataFrame): per-column statistics with an 'f'
            column, indexed by column name.

    Returns:
        set: names of the columns to drop.
    """
    similar_pairs = similar_dist(p_value, distribution_data)
    return {
        stats_data.loc[list(pair), 'f'].sort_values().index[0]
        for pair in similar_pairs
    }

In [57]:
def f_chi2_score(data, y, p_value = False):
    """Per-column ANOVA F and chi-squared results against the target.

    Args:
        data (pandas.DataFrame): feature columns.
        y (array-like): class labels.
        p_value: when equal to ``False`` the test statistics are returned
            (with progress messages printed); otherwise the p-values.

    Returns:
        pandas.DataFrame: indexed by column name, with columns 'f' and 'chi2'.
    """
    want_scores = p_value == False
    # f_classif / chi2 return (statistic, p-value); choose which element to keep.
    pick = 0 if want_scores else 1

    if want_scores:
        print('Getting F-scores')
    stats_data = pd.DataFrame(f_classif(data.values, y)[pick], index= data.columns, columns= ['f'])

    if want_scores:
        print('Getting Chi2-scores')
    stats_data['chi2'] = pd.DataFrame(chi2(data.values, y)[pick], index = data.columns)

    return stats_data

### CATEGORIES

In [27]:
# Column-name groups of `train`, one list per categorical attribute.
# The slicing cells below use them to cut `train` / `test` into per-category blocks.

# Bacterial resistance markers.
bacterial_resistance = ['bacterial_resistance_ampicillin',
       'bacterial_resistance_chloramphenicol',
       'bacterial_resistance_kanamycin',
       'bacterial_resistance_spectinomycin',
       'bacterial_resistance_other']

# Copy number.
copy_number = ['copy_number_high_copy', 'copy_number_low_copy', 'copy_number_unknown']

# Growth strain.
growth_strain = ['growth_strain_ccdb_survival', 'growth_strain_dh10b', 
       'growth_strain_neb_stable', 'growth_strain_dh5alpha',
       'growth_strain_other', 'growth_strain_stbl3', 'growth_strain_top10',
       'growth_strain_xl1_blue']

# Growth temperature.
growth_temp = ['growth_temp_37', 'growth_temp_30', 'growth_temp_other']

# Selectable markers.
selectable_markers = ['selectable_markers_blasticidin',
       'selectable_markers_his3', 'selectable_markers_hygromycin',
       'selectable_markers_leu2', 'selectable_markers_neomycin',
       'selectable_markers_other', 'selectable_markers_puromycin',
       'selectable_markers_trp1', 'selectable_markers_ura3',
       'selectable_markers_zeocin']

# All species columns; the variable is named after the first entry of the list.
species_budding = ['species_budding_yeast', 'species_fly',
       'species_human', 'species_mouse', 'species_mustard_weed',
       'species_other', 'species_synthetic',
       'species_zebrafish', 'species_nematode', 'species_rat']

In [28]:
# One frame per category. Explicit copies: later cells add and drop columns on
# these frames in place, which must not be tied to (or warn about) `train`.
bacterial_resistance_df = train[bacterial_resistance].copy()
copy_number_df = train[copy_number].copy()
growth_strain_df = train[growth_strain].copy()
growth_temp_df = train[growth_temp].copy()
selectable_markers_df = train[selectable_markers].copy()
species_budding_df = train[species_budding].copy()

In [29]:
# Tag each frame with its category prefix; make_none_column() and
# multiple_columns() read `.name` to build the names of the columns they add.
bacterial_resistance_df.name = 'bacterial_resistance'
copy_number_df.name = 'copy_number'
growth_strain_df.name = 'growth_strain'
growth_temp_df.name = 'growth_temp'
selectable_markers_df.name = 'selectable_markers'
species_budding_df.name = 'species_budding'

In [30]:
# Collapse rows with two categories set into a '<name>_multi_2' flag.
# Only bacterial_resistance and growth_strain are processed; the other calls are disabled.
#selectable_markers_df = multiple_columns(selectable_markers_df)
#species_budding_df = multiple_columns(species_budding_df)
#copy_number_df = multiple_columns(copy_number_df)
bacterial_resistance_df = multiple_columns(bacterial_resistance_df)
growth_strain_df = multiple_columns(growth_strain_df)
#growth_temp_df = multiple_columns(growth_temp_df)

In [31]:
# multiple_columns() returns a new frame, so the `.name` tags are applied again
# before make_none_column() reads them.
bacterial_resistance_df.name = 'bacterial_resistance'
copy_number_df.name = 'copy_number'
growth_strain_df.name = 'growth_strain'
growth_temp_df.name = 'growth_temp'
selectable_markers_df.name = 'selectable_markers'
species_budding_df.name = 'species_budding'

In [32]:
# Add a '<name>_none' indicator for rows where no column of the block is set.
# Only selectable_markers and species_budding are processed; the other calls are disabled.
selectable_markers_df = make_none_column(selectable_markers_df)
species_budding_df = make_none_column(species_budding_df)
#copy_number_df = make_none_column(copy_number_df)
#bacterial_resistance_df = make_none_column(bacterial_resistance_df)
#growth_strain_df = make_none_column(growth_strain_df)
#growth_temp_df = make_none_column(growth_temp_df)

In [33]:
# Drop one column per block; only the growth_temp drop is active.
# NOTE(review): the reason for dropping 'growth_temp_other' is not visible here
# (possibly to remove a redundant one-hot column) -- confirm.
#bacterial_resistance_df.drop(['bacterial_resistance_spectinomycin'], axis = 1, inplace = True)
#growth_strain_df.drop(['growth_strain_dh10b'], axis = 1, inplace = True)
growth_temp_df.drop(['growth_temp_other'], axis = 1, inplace = True)
#selectable_markers_df.drop(['selectable_markers_his3'], axis = 1, inplace = True)
#species_budding_df.drop(['species_rat'], axis = 1, inplace = True)

In [34]:
# Assemble the categorical feature matrix from the per-category frames (column-wise).
concat = [copy_number_df, growth_strain_df, growth_temp_df,
          species_budding_df, bacterial_resistance_df, selectable_markers_df]
features = pd.concat(concat, axis = 1)

# Merge three selectable-marker columns into one indicator: the row sum capped
# at 1 by get_ones(). The three source columns are then removed.
features_drop_1 = ['selectable_markers_leu2', 'selectable_markers_his3', 'selectable_markers_trp1']
features['selectable_markers_merge'] = features[features_drop_1].sum(axis = 1).apply(get_ones)
features.drop(features_drop_1, axis = 1, inplace = True)

### Data Analysis

In [None]:
details(copy_number_df)

In [None]:
# Heatmap of absolute Pearson correlations between the n-gram features
# (upper triangle, diagonal included, is masked out).
# NOTE(review): `ngram` is only assigned in the later "Train Test data" section,
# so this cell fails on a fresh top-to-bottom run -- confirm the intended order.
abs_corr = abs(ngram.corr())
# Builtin bool: the np.bool alias no longer exists in current NumPy.
mask = np.triu(np.ones_like(abs_corr, dtype=bool))
plt.figure(figsize=(70,70))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(abs_corr, cmap = cmap, square=True, mask = mask, vmax= 1, vmin= 0, annot = True);

In [None]:
# Same plot with absolute Spearman rank correlations.
spearman = pd.DataFrame(abs(spearmanr(ngram)[0]), columns= ngram.columns, index= ngram.columns)
# Builtin bool: the np.bool alias no longer exists in current NumPy.
mask = np.triu(np.ones_like(spearman, dtype=bool))
plt.figure(figsize=(70,70))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(spearman, cmap = cmap, square=True, mask = mask, vmax= 1, vmin= 0, annot = True);

In [None]:
# Show only the pairs (strict upper triangle) with |Spearman| >= 0.985.
plt.figure(figsize = (20,20))
# Builtin bool: the np.bool alias no longer exists in current NumPy.
sns.heatmap(spearman[spearman.where(np.triu(spearman, k =1).astype(bool)) >= 0.985])

### DATA EXTRACTION FROM SEQUENCE FEATURE

In [71]:
ngram_1 = get_ngram_features(1, train, True)





  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A[A[A[A



 20%|████████████████▊                                                                   | 1/5 [00:01<00:06,  1.56s/it][A[A[A[A


192it [00:22, 25.36it/s][A[A[A



 40%|█████████████████████████████████▌                                                  | 2/5 [00:07<00:08,  2.83s/it][A[A[A[A



 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:13<00:07,  3.75s/it][A[A[A[A



 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:19<00:04,  4.48s/it][A[A[A[A



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:25<00:00,  5.06s/it][A[A[A[A


In [None]:
# 2-gram substring counts for the training sequences.
ngram_2 = get_ngram_features(2, train, True)

# NOTE(review): `ngram_2_drop` is only assigned in the later "Numerical variable
# analysis" section, so this cell depends on out-of-order execution.
ngram_2.drop(ngram_2_drop, axis = 1, inplace = True)

In [None]:
# Load the cached 3-gram counts instead of recomputing them with
# get_ngram_features(3, train, True). Forward slash keeps the path portable.
ngram_3 = pd.read_csv('output/ngram_3_variables.csv', index_col= 0)

# NOTE(review): `ngram_3_drop` is only assigned in a later section of the notebook.
ngram_3.drop(ngram_3_drop, axis = 1, inplace = True)

In [69]:
#ngram_4 = get_ngram_features(4, train, False)
ngram_4 = pd.read_csv('output\\ngram_4_variables.csv', index_col= 0)



3000it [02:50, 19.93it/s][A[A

ngram_4_drop = pd.read_csv('ngram_4_drop.csv', index_col= 0)

ngram_4.drop(ngram_4_drop.iloc[:,0].tolist(), axis = 1, inplace = True)

In [None]:
# Load the cached 5-gram counts instead of recomputing them with
# get_ngram_features(5, train, False). Forward slash keeps the path portable.
ngram_5 = pd.read_csv('output/ngram_5_variables.csv', index_col= 0)

# Column names to discard, stored in the first data column of the CSV.
ngram_5_drop = pd.read_csv('ngram_5_drop.csv', index_col= 0)

ngram_5.drop(ngram_5_drop.iloc[:,0].tolist(), axis = 1, inplace = True)

### Numerical variable analysis

In [None]:
# Dendrogram: Ward clustering of the Spearman correlation matrix (left) next to
# the correlation matrix re-ordered by the dendrogram leaves (right).
# NOTE(review): `n` is not defined anywhere in the visible notebook; presumably a
# numeric feature frame -- confirm.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 12))
corr = spearmanr(n).correlation
corr_linkage = hierarchy.ward(corr)
dendro = hierarchy.dendrogram(
    corr_linkage, labels=n.columns, ax=ax1, leaf_rotation=90
)
# one tick per dendrogram leaf
dendro_idx = np.arange(0, len(dendro['ivl']))

# rows and columns of the correlation matrix in dendrogram leaf order
ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(dendro['ivl'], rotation='vertical')
ax2.set_yticklabels(dendro['ivl'])
fig.tight_layout()
plt.show()

In [51]:
# 2-mer feature pruning: pairwise KS p-values, per-column F / chi2 statistics,
# then from every pair with p > 0.05 the column with the lower F statistic is marked for removal.
n2_dist = distribution_info(n2)
n2_score = f_chi2_score(n2, y)
ngram_2_drop = remove_similar_dict(0.05, n2_dist, n2_score)

Starting 300.0 iterations


In [81]:
n2.drop(ngram_2_drop, inplace = True, axis = 1)

In [56]:
n3_dist = distribution_info(n3)
n3_score = f_chi2_score(n3, y)
ngram_3_drop = remove_similar_dict(0.05, n3_dist, n3_score)

4it [00:00, 38.45it/s]

Starting 7750.0 iterations


7750it [03:24, 37.97it/s]


In [82]:
n3.drop(ngram_3_drop, inplace = True, axis = 1)

In [None]:
#n5 = pd.read_csv('output\\n5.csv', index_col= 0)
#n5_score = pd.read_csv('output\\n5_score.csv', index_col= 0)
#ngram_5_drop = remove_similar_dict(0.05, n5, n5_score)

In [None]:
pd.DataFrame(ngram_4_drop).to_csv('ngram_4_drop.csv')

In [None]:
# NOTE(review): this cell overwrites `n2` (the 2-mer counts) with its own p-value
# matrix and scores `ngram` rather than `n2`; it looks like an older variant of
# the n2_dist / n2_score cell above -- confirm whether it should be removed.
n2 = distribution_info(n2)
n2_score = f_chi2_score(ngram, y, False)
ngram_2_drop = remove_similar_dict(0.05, n2, n2_score)

In [None]:
n2.drop(ngram_2_drop, axis = 1, inplace = True)

In [None]:
# Correlated-variables remover: for every feature pair whose absolute Spearman
# correlation is >= 0.99, mark the member with the lower F statistic for dropping.

spearman = pd.DataFrame(abs(spearmanr(ngram)[0]), columns= ngram.columns, index= ngram.columns)
# Strict upper triangle only, so each pair is seen once and the diagonal is ignored.
# ('Bool' is not a dtype name current NumPy accepts; the builtin bool is.)
kr = spearman.where(np.triu(spearman, k = 1).astype(bool))

names = spearman.columns
df = kr[kr >= 0.99].notna()

# np.nonzero walks the mask row by row, i.e. the same pair order as nested index loops.
combination = {
    (names[i], names[j]): kr.iloc[i, j]
    for i, j in zip(*np.nonzero(df.values))
}

# NOTE(review): `n0_score` is not defined anywhere in the visible notebook; it is
# presumably an f_chi2_score() table indexed by feature name -- confirm.
drop_list = [
    n0_score.loc[list(pair), 'f'].sort_values().index[0]
    for pair in combination
]

In [None]:
try_drop = set(drop_list)

In [None]:
try_drop

### Train Test data

In [None]:
ngram = pd.read_csv('output//ngram_final.csv')

In [85]:
ngram = pd.concat([ngram_1, ngram_seq], axis = 1)

In [86]:
# Min-max scale the n-gram counts. The scaler is fitted on the training
# features and reused later to transform the test features.
scaler = MinMaxScaler()
scaler.fit(ngram)
n_scaled = scaler.transform(ngram)

In [87]:
# Final design matrix: categorical features first, scaled n-gram features after.
X = np.concatenate([features.values, n_scaled], axis = 1)
X.shape

(63017, 130)

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 45)

#X_val, X_test, y_val, y_test = train_test_split(X_dev, y_dev,test_size = 0.5, random_state = 7)

In [89]:
# 6-fold shuffled CV. shuffle / random_state must be passed by keyword in current scikit-learn.
fold = KFold(n_splits=6, shuffle=True, random_state=78)

### Scoring selection

# Feature importances from an extra-trees model fitted on the training split.
tree = ExtraTreesClassifier()
tree.fit(X_train, y_train)

# X is [features | scaled ngram], so there is one importance per column of both
# frames, in that order. Indexing with ngram.columns alone has the wrong length.
feature_names = list(features.columns) + list(ngram.columns)
chi_score_tree = pd.DataFrame(tree.feature_importances_, index= feature_names)

plt.bar(feature_names, tree.feature_importances_)
plt.xticks(rotation = 90);

### Models

In [90]:
#rf = RandomForestClassifier()
knn = KNeighborsClassifier(n_neighbors= 40, weights= 'distance')
#bag = BaggingClassifier(knn, 10, bootstrap_features = True)
#nb= MultinomialNB(alpha= 0.6)

In [91]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=40, p=2,
                     weights='distance')

In [92]:
top10_accuracy_scorer(knn, X_test, y_test)

0.878133925737861

# Scorers for cross-validation: macro-averaged F1 and plain accuracy.
f1_scorer = make_scorer(f1_score, average = 'macro')
accuracy_scorer = make_scorer(accuracy_score) 

# NOTE(review): this multi-metric dict is built but not passed to cross_validate
# below, which scores with top10_accuracy_scorer only -- confirm which is intended.
scoring = {'f1': f1_scorer,
          'accuracy_score': accuracy_scorer,
          'top_10': top10_accuracy_scorer}

# Cross-validate the KNN model on the full data with the folds in `fold`.
cross_val = cross_validate(knn, X, y, cv = fold, scoring = top10_accuracy_scorer, verbose = 2)

# Mean top-10 accuracy over the folds.
cross_val['test_score'].mean()

cross_val['test_score'].mean()

### Testing Data

In [None]:
# Forward slashes are accepted on every OS; the original '\\' separators only resolve on Windows.
test = pd.read_csv('data/test_values.csv', index_col= 'sequence_id')
submission = pd.read_csv('data/submission_format_3TFRxH6.csv', index_col= 'sequence_id')

In [None]:
# One frame per category for the test set. Explicit copies: later cells add and
# drop columns on these frames in place.
bacterial_resistance_test = test[bacterial_resistance].copy()
copy_number_test = test[copy_number].copy()
growth_strain_test = test[growth_strain].copy()
growth_temp_test = test[growth_temp].copy()
selectable_markers_test = test[selectable_markers].copy()
species_budding_test = test[species_budding].copy()

In [None]:
bacterial_resistance_test.name = 'bacterial_resistance'
copy_number_test.name = 'copy_number'
growth_strain_test.name = 'growth_strain'
growth_temp_test.name = 'growth_temp'
selectable_markers_test.name = 'selectable_markers'
species_budding_test.name = 'species_budding'

In [None]:
# Same preprocessing as the training cell, which applies multiple_columns() to
# both bacterial_resistance and growth_strain. Test rows must be transformed
# identically or the model sees differently encoded features.
#selectable_markers_test = multiple_columns(selectable_markers_test)
#species_budding_test = multiple_columns(species_budding_test)
#copy_number_test = multiple_columns(copy_number_test)
bacterial_resistance_test = multiple_columns(bacterial_resistance_test)
growth_strain_test = multiple_columns(growth_strain_test)
#growth_temp_test = multiple_columns(growth_temp_test)

In [None]:
bacterial_resistance_test.name = 'bacterial_resistance'
copy_number_test.name = 'copy_number'
growth_strain_test.name = 'growth_strain'
growth_temp_test.name = 'growth_temp'
selectable_markers_test.name = 'selectable_markers'
species_budding_test.name = 'species_budding'

In [None]:
selectable_markers_test = make_none_column(selectable_markers_test)
species_budding_test = make_none_column(species_budding_test)
#copy_number_test = make_none_column(copy_number_test)
#bacterial_resistance_test = make_none_column(bacterial_resistance_test)
#growth_strain_test = make_none_column(growth_strain_test)
#growth_temp_test = make_none_column(growth_temp_test)

In [None]:
growth_temp_test.drop(['growth_temp_other'], axis = 1, inplace = True)

In [None]:
concat_test = [copy_number_test, growth_strain_test, growth_temp_test,
          species_budding_test, bacterial_resistance_test, selectable_markers_test]

features_test = pd.concat(concat_test, axis = 1)

In [None]:
features_test['selectable_markers_merge'] = features_test[features_drop_1].sum(axis = 1).apply(get_ones)
features_test.drop(features_drop_1, axis = 1, inplace = True)

In [None]:
# Substring-count n-gram features (n = 1..5, n-grams without repeated letters) for the test set.
# NOTE(review): the training matrix `ngram` is built from get_ngram_features(1, train, True)
# plus the frame-based dna_sequence() counts in `ngram_seq`. These test features are counted
# differently and, for n >= 2, lack n-grams with repeated letters such as 'AA' -- confirm that
# the later selection n_test[ngram.columns] really yields features comparable to training.
ngram_1_test = get_ngram_features(1, test, False)
#ngram_1_test.drop(ngram_1_drop, axis = 1, inplace = True)

ngram_2_test = get_ngram_features(2, test, False)
#ngram_2_test.drop(ngram_2_drop, axis = 1, inplace = True)

ngram_3_test = get_ngram_features(3, test, False)
#ngram_3_test.drop(ngram_3_drop, axis = 1, inplace = True)

ngram_4_test = get_ngram_features(4, test, False)

ngram_5_test = get_ngram_features(5, test, False)

In [None]:
n_test = pd.concat([ngram_1_test, ngram_2_test, ngram_3_test, ngram_4_test, ngram_5_test], axis = 1)

In [None]:
n_test = n_test[ngram.columns]

In [None]:
assert list(n_test.columns) == list(ngram.columns)

In [None]:
ngram_test = scaler.transform(n_test)

In [None]:
# Align the test columns to the training layout.
# multiple_columns() leaves out '<name>_multi_2' when no row qualifies, so such an
# indicator can exist in train but not in test. It is then legitimately all zero.
# Any other missing column is a real preprocessing mismatch and must fail loudly.
missing_in_test = features.columns.difference(features_test.columns)
assert all(str(col).endswith('_multi_2') for col in missing_in_test), \
    f'test features lack training columns: {list(missing_in_test)}'
features_test = features_test.reindex(columns = features.columns, fill_value = 0.0)

In [None]:
test_X = np.concatenate([features_test.values, ngram_test], axis = 1)
test_X.shape

In [None]:
assert X.shape[1] == test_X.shape[1]

In [None]:
predictions_knn = knn.predict_proba(test_X)

In [None]:
predictions_knn.shape

In [None]:
# Element-wise comparison of the model's class order with the submission columns.
# NOTE(review): `er` is not used afterwards in the visible notebook, and the two
# sequences presumably differ in length when some labels never occur in
# training -- confirm this check does what is intended.
er = knn.classes_ == submission.columns

In [None]:
submission_knn = pd.DataFrame(predictions_knn, columns= knn.classes_, index= submission.index)

In [None]:
# Labels that never occur in the training target get no column from
# predict_proba; add them with probability 0 so the file matches the
# submission format.
known_classes = set(knn.classes_)
test_col = [col for col in submission.columns if col not in known_classes]

# Derive the expected count from the submission format instead of hard-coding 1314.
num_not_included = len(submission.columns) - predictions_knn.shape[1]
assert len(test_col) == num_not_included

non_included_col = np.zeros((predictions_knn.shape[0], num_not_included))
non_include_df = pd.DataFrame(non_included_col, columns = test_col, index = submission.index)

# Append the zero columns, then restore the column order of the submission format.
submission_final = pd.concat([submission_knn, non_include_df], axis = 1)
submission_final = submission_final[submission.columns]

In [None]:
# Write the completed frame (all submission columns, in submission order).
# `submission_knn` only holds the classes the model was trained on.
submission_final.to_csv('submission_final.csv')

In [None]:
# Pack the submission CSV into a deflate-compressed zip archive.
with zipfile.ZipFile('submission_new.zip', 'w', zipfile.ZIP_DEFLATED) as f:
    f.write('submission_final.csv')

In [None]:
idx = [0,2,6,7,8,9,10,11,14,18,19,20,21,23,24,25,26,27,29,30,32,33,34,37,39,41,42,43,45,46,48]

title = ['copy_number_high_copy', 'copy_number_low_copy', 'copy_number_unknown','copy_number_none', 'growth_strain_ccdb_survival',
'growth_strain_dh10b', 'growth_strain_neb_stable','growth_strain_dh5alpha', 'growth_strain_other', 'growth_strain_stbl3','growth_strain_top10', 'growth_strain_xl1_blue',
'growth_strain_multi_2', 'growth_strain_none', 'growth_temp_37','growth_temp_30', 'growth_temp_other', 'growth_temp_none',
'species_budding_yeast', 'species_fly', 'species_human','species_mouse', 'species_mustard_weed', 'species_other',
'species_synthetic', 'species_zebrafish', 'species_nematode','species_rat', 'species_budding_multi_2', 'species_budding_none',
'bacterial_resistance_ampicillin','bacterial_resistance_chloramphenicol','bacterial_resistance_kanamycin', 'bacterial_resistance_spectinomycin',
'bacterial_resistance_other', 'bacterial_resistance_multi_2','bacterial_resistance_none', 'selectable_markers_blasticidin',
'selectable_markers_his3', 'selectable_markers_hygromycin','selectable_markers_leu2', 'selectable_markers_neomycin',
'selectable_markers_other', 'selectable_markers_puromycin','selectable_markers_trp1', 'selectable_markers_ura3',
'selectable_markers_zeocin', 'selectable_markers_multi_2','selectable_markers_none']

In [None]:
features = features.iloc[:,idx]

In [None]:
# Names of the selected feature columns: `title` entries at the positions listed in `idx`.
inc = [title[position] for position in idx]

In [None]:
# Flag the rows of `features_score` whose label is one of the selected feature names.
# NOTE(review): `features_score` is not defined anywhere in the visible notebook;
# presumably a per-feature score table indexed by feature name -- confirm.
for i in features_score.index:
    if i in inc:
        features_score.loc[i, 'included'] = 'y'

In [None]:
features_score.drop('included', inplace = True, axis = 1)