# Notebook to add the leaked test labels to the training set and save the result

In [1]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
pd.options.display.max_colwidth = 50
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss

# Creating the base datasets

In [2]:
# Load the new test variants and their texts, then join them on ID.
# The text file is split on a literal "||"; the separator is a regex, so it is
# written as a raw string to avoid the invalid "\|" escape sequence that a
# plain string literal triggers a warning for. The pattern itself is unchanged.
new_test = pd.read_csv('..//bases/new_test_variants.csv')
new_test_texts = pd.read_csv(
    '..//bases/new_test_text.csv',
    sep=r"\|\|",
    engine='python',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
    encoding="utf-8",
)
# Left join: every variant row is kept even if no text row matches its ID.
new_test_final = pd.merge(new_test, new_test_texts, how="left", on="ID")

In [3]:
def _class_number(column_name, prefix="class"):
    """Return the integer class id encoded in a column name such as "class7".

    The prefix is removed only when it is actually present. The previous
    ``str.lstrip('class')`` stripped any leading characters from the set
    {c, l, a, s} rather than the prefix, and left the label as a string.
    """
    if column_name.startswith(prefix):
        column_name = column_name[len(prefix):]
    return int(column_name)


# One row per leaked test ID; every non-ID column holds a per-class score.
leaks = pd.read_csv('..//bases/s1_add_train.csv')
# idxmax(axis=1) yields the name of the highest-scoring column for each row;
# convert it to an integer label so it does not end up as a string next to the
# numeric labels that `train.Class` is compared against elsewhere.
leak_classes = leaks.drop("ID", axis=1).idxmax(axis=1).map(_class_number)
leaks_1 = pd.DataFrame([leaks["ID"], leak_classes])
# Transpose so that IDs and labels become columns instead of rows.
leaks_2 = leaks_1.T
leaks_2.columns = ["ID", "Class"]

In [4]:
# Read the training and test variant tables, in that order.
train, test = (
    pd.read_csv(variants_path)
    for variants_path in ('..//bases/training_variants', '..//bases/test_variants')
)

In [5]:
# Options shared by both text files: rows are "ID||Text", the first line is
# skipped and the column names are supplied explicitly. The separator is a
# regex (hence engine='python'); it is a raw string so that "\|" is not parsed
# as an invalid string escape sequence. The pattern itself is unchanged.
text_read_options = dict(
    sep=r"\|\|",
    engine='python',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
    encoding="utf-8",
)
train_texts = pd.read_csv('..//bases/training_text', **text_read_options)
test_texts = pd.read_csv('..//bases/test_text', **text_read_options)

In [6]:
# Attach the text to each variant row; a left join keeps every variant row
# even when no text row shares its ID.
train = train.merge(train_texts, how='left', on='ID')
test = test.merge(test_texts, how='left', on='ID')

In [7]:
# Test rows whose ID has a leaked label, joined with those labels
# (default inner join on the columns the two frames share).
leaked_test_rows = test[test.ID.isin(leaks_2.ID)]
leaks_3 = pd.merge(leaks_2, leaked_test_rows)

# Join once more against the matching rows of the raw text table,
# again on whatever columns the two frames have in common.
leaked_text_rows = test_texts[test_texts.ID.isin(leaks_3.ID)]
leaks_final = pd.merge(leaks_3, leaked_text_rows)

In [8]:
# Stack the labelled leak rows underneath the original training rows.
# Both inputs keep their own index and ID values at this point.
new_train = pd.concat([train,leaks_final]) # adding the first-stage leaks

In [9]:
# Renumber the combined rows 0..n-1 and reuse that numbering as the ID column,
# replacing the IDs carried over from the two source frames.
row_count = len(new_train)
new_train.index = range(row_count)
new_train['ID'] = new_train.index

# Split into a variants table and a text table and write each to its own CSV.
variants_columns = ['Class', 'Gene', 'ID', 'Variation']
text_columns = ['ID', 'Text']
new_train_variants = new_train[variants_columns]
new_train_text = new_train[text_columns]
for frame, target in (
    (new_train_variants, "..//bases/new_training_variants.csv"),
    (new_train_text, "..//bases/new_training_text.csv"),
):
    frame.to_csv(target, index=False, encoding="utf8")

In [10]:
# Preview the combined training frame (original train rows followed by the leak rows).
new_train

Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V
5,4,CBL,5,Oncogenic mutations in the monomeric Casitas B...,V391I
6,5,CBL,6,Oncogenic mutations in the monomeric Casitas B...,V430M
7,1,CBL,7,CBL is a negative regulator of activated recep...,Deletion
8,4,CBL,8,Abstract Juvenile myelomonocytic leukemia (JM...,Y371H
9,4,CBL,9,Abstract Juvenile myelomonocytic leukemia (JM...,C384R


In [28]:
# All of the leaks are in the test set! Checked interactively with:
#   len(set(leaks_final.Variation).intersection(set(new_test.Variation)))
#   len(leaks_final)
new_test

Unnamed: 0,ID,Gene,Variation
0,1,CHEK2,H371Y
1,2,AXIN2,Truncating Mutations
2,3,WNT4,E216G
3,4,SUCLA2,G118R
4,5,BRAF,T599insTT
5,6,CHEK2,E239K
6,7,CHST3,T141M
7,8,RNF6,G244D
8,9,SPAST,C448Y
9,10,AKT1,R328A


# Extra: finding test examples that also appear in train

In [11]:
# Inner join on (Gene, Variation): one row per test/train pair that shares both.
# The clashing ID columns come out as ID_x (test) and ID_y (train).
match_keys = ['Gene', 'Variation']
merge_match = pd.merge(new_test, new_train, on=match_keys)
merge_match

Unnamed: 0,ID_x,Gene,Variation,Class,ID_y,Text
0,1,CHEK2,H371Y,4,3547,The incidence of breast cancer is increasing i...
1,2,AXIN2,Truncating Mutations,1,3624,An unselected series of 310 colorectal carcino...
2,5,BRAF,T599insTT,7,3625,Pilocytic astrocytoma (PA) is emerging as a tu...
3,10,AKT1,R328A,7,3477,The protein kinase v-akt murine thymoma viral ...
4,12,KIT,S709F,2,3512,Gastrointestinal stromal tumors (GIST) are cha...
5,13,BRCA2,V211I,1,3657,Although most BRCA sequence variants are clear...
6,14,ALK,F1174C,7,3662,"In the era of personalized medicine, understan..."
7,21,PTPN11,N308D,7,3440,Noonan syndrome is a developmental disorder wi...
8,24,AKT2,BCAM-AKT2 Fusion,7,3675,High-grade serous ovarian cancer (HGSC) is the...
9,26,MET,V1220I,7,3432,Point mutations emerge as one of the rate-limi...


In [30]:
# One-hot encode the class of every matched row: one indicator column per
# distinct value found in merge_match.Class.
class_dummy = pd.get_dummies(merge_match.Class)
class_dummy

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,1,0,0


In [44]:
submission = pd.read_csv('..//bases/stage2_sample_submission.csv')

# Relative frequency of each class (1..9) among the training rows. The
# denominator is taken from `train` itself instead of a hard-coded row count,
# so the values stay correct if the training frame changes.
class_probs = [(train['Class'] == class_id).mean() for class_id in range(1, 10)]

# Baseline prediction: the same probability, 1/9, for each of the nine class
# columns in every row.
for class_id in range(1, 10):
    submission['class' + str(class_id)] = 1/9



In [45]:
# Temporarily number the submission rows 0..n-1 in the ID column.
submission.ID = range(len(submission))
# Turn the test IDs of the matched rows into row positions by subtracting 1.
# NOTE(review): assumes test IDs start at 1 and submission rows follow test ID order - confirm.
Index_leak = merge_match.ID_x - 1
# Overwrite every column after ID (the class columns) of the matched rows
# with the one-hot labels, positionally.
submission.iloc[Index_leak, 1:] = class_dummy.values
# Show the rows that were just overwritten.
submission.iloc[Index_leak, 1:]

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
20,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [52]:
# Set every column of the first row (ID included) to 1/9; this replaces
# whatever was previously written into row 0.
# NOTE(review): row 0 is among the leak rows displayed above, so its one-hot
# label is discarded here - confirm this is intended.
submission.iloc[0] = 1/9
# Restore the real test IDs (this also repairs the ID overwritten in row 0).
submission.ID = new_test.ID  # needs to start from 1
submission

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,1,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
1,2,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,3,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
3,4,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
4,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
5,6,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
6,7,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
7,8,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
8,9,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
9,10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000


In [54]:
# Write the submission to the current working directory, without the index column.
submission.to_csv("leak_submission.csv", index=False)

In [59]:
# Number of matched rows (one entry per row of merge_match).
len(Index_leak)

367

In [62]:
# Log loss of the uniform 1/9 baseline on the matched rows, i.e. ln(9), about 2.197.
# The number of prediction rows is derived from merge_match instead of being
# hard-coded, so it cannot drift out of sync with the number of labels.
n_classes = 9
uniform_probs = [[1 / n_classes] * n_classes] * len(merge_match)
log_loss(merge_match.Class, uniform_probs)

2.1972245773362191

# Extra: analyzing the test set

In [36]:
# Index labels of the test rows that were NOT matched to a training row.
# The leak positions are collected into a set once; the previous version
# rebuilt list(Index_leak) for every index element (accidentally quadratic).
leak_positions = set(Index_leak)
new_test_index = [item for item in new_test_final.index if item not in leak_positions]

In [41]:
# Keep only the unmatched test rows. new_test_index holds index labels but is
# used positionally here; the two coincide as long as new_test_final has the
# default 0..n-1 index.
test_all = new_test_final.iloc[new_test_index]

In [55]:
# Preview the test rows that have no (Gene, Variation) match in new_train.
test_all

Unnamed: 0,ID,Gene,Variation,Text
2,3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
3,4,SUCLA2,G118R,Regulated progression through the cell cycle ...
5,6,CHEK2,E239K,The nuclei that laboratories solution p53 KIT ...
6,7,CHST3,T141M,Myeloid differentiation 88 (MyD88) is the key ...
7,8,RNF6,G244D,Human ESCCs 2 occur frequently worldwide (1) ....
8,9,SPAST,C448Y,large were of activity growth this product tol...
10,11,SCN4A,V445M,Endometrial carcinoma is the most common gynec...
14,15,ERBB2,G746S,The protein-kinase family is the most frequent...
15,16,TP53,Y234S,Among the best-studied therapeutic targets in ...
16,17,RAB27A,A87P,"Introduction In recent years, a better unders..."


In [65]:
# Number of distinct texts among the unmatched test rows.
# (The original line had an unbalanced ")" and could not run; the recorded
# integer output shows that len(...) was intended.)
len(set(test_all.Text))  # I also checked, and no text appears in train.

564

In [67]:
# Show the combined training frame again for comparison with the test rows above.
new_train

Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V
5,4,CBL,5,Oncogenic mutations in the monomeric Casitas B...,V391I
6,5,CBL,6,Oncogenic mutations in the monomeric Casitas B...,V430M
7,1,CBL,7,CBL is a negative regulator of activated recep...,Deletion
8,4,CBL,8,Abstract Juvenile myelomonocytic leukemia (JM...,Y371H
9,4,CBL,9,Abstract Juvenile myelomonocytic leukemia (JM...,C384R


In [66]:
# Number of distinct genes among the unmatched test rows.
len(set(test_all.Gene))

167