## Import Libraries

In [None]:
!pip install py7zr

In [None]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
import xgboost

import string
import py7zr

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling to the matplotlib figures drawn below.
sns.set()

## Loading data

In [None]:
# Load the stage-1 competition files.
#
# The text files separate ID and Text with a literal "||", so they are parsed
# with the python engine and a regex separator. The first line of each text
# file is skipped and explicit column names are supplied instead.
# The separator is a raw string: '\|' is not a valid escape sequence in a
# normal string literal and triggers a warning on current Python versions.
df_train = pd.read_csv(
    '../input/msk-redefining-cancer-treatment/training_text.zip',
    engine='python',
    sep=r'\|\|',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
).set_index('ID')
df_train2 = pd.read_csv('../input/msk-redefining-cancer-treatment/training_variants.zip').set_index('ID')

df_test = pd.read_csv(
    '../input/msk-redefining-cancer-treatment/test_text.zip',
    engine='python',
    sep=r'\|\|',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
).set_index('ID')
df_test2 = pd.read_csv('../input/msk-redefining-cancer-treatment/test_variants.zip').set_index('ID')

# Preview the training text table.
df_train.head()

In [None]:
# Preview the first rows of the training variants table.
df_train2.head()

In [None]:
# Attach each sample's text to its variant record (inner join on ID) and
# replace missing values with empty strings, for both train and test.
train = df_train2.merge(df_train, how='inner', on='ID').fillna('')
test = df_test2.merge(df_test, how='inner', on='ID').fillna('')
train.head()

In [None]:
# (rows, columns) of the merged training frame.
train.shape

In [None]:
# Preview the merged test frame built from test_text / test_variants.
test.head()

In [None]:
# Unpack the two stage-2 test archives into the current working directory.
for archive_path in (
    '../input/msk-redefining-cancer-treatment/stage2_test_text.csv.7z',
    '../input/msk-redefining-cancer-treatment/stage2_test_variants.csv.7z',
):
    with py7zr.SevenZipFile(archive_path, mode='r') as archive:
        archive.extractall()

In [None]:
# Replace the test frames with the stage-2 files extracted into the working
# directory, then rebuild the merged test frame from them.
#
# The text file separates ID and Text with a literal "||"; the separator is a
# raw-string regex because '\|' is not a valid escape sequence in a normal
# string literal and triggers a warning on current Python versions.
df_test = pd.read_csv(
    './stage2_test_text.csv',
    engine='python',
    sep=r'\|\|',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
).set_index('ID')
df_test2 = pd.read_csv('./stage2_test_variants.csv').set_index('ID')

# Inner join on ID; missing values become empty strings.
test = pd.merge(df_test2, df_test, how='inner', on='ID').fillna('')
test.head()

In [None]:
# (rows, columns) of the merged stage-2 test frame.
test.shape

## Preprocessing

In [None]:
# Bar chart of the share of training samples that falls into each class.
class_share = train['Class'].value_counts(sort=False) / train.shape[0]
class_share.plot(kind='bar')
# Render the figure.
plt.show()

In [None]:
# Show the punctuation characters that preprocessing() strips from the text.
string.punctuation

In [None]:
# Inspect NLTK's English stop-word list.
np.array(stopwords.words('english'))

In [None]:
# Keep the English stop words in a set for fast membership tests.
stop_words = set(stopwords.words('english')) 

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocessing(text):
    """Return *text* lower-cased with ASCII punctuation removed.

    Args:
        text: The raw document string.

    Returns:
        The normalised string. Whitespace and every character that is not in
        ``string.punctuation`` are left untouched.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

In [None]:
# First 1000 characters of the raw text for the row labelled 0.
train['Text'].loc[0][:1000]

In [None]:
# The same sample after preprocessing(), for a side-by-side comparison.
preprocessing(train['Text'].loc[0])[:1000]

## TF-IDF

In [None]:
# TF-IDF over unigrams and bigrams, keeping at most 500 vocabulary terms.
tfidf = TfidfVectorizer(min_df=1, ngram_range=(1, 2), max_features=500)

In [None]:
# Learn the vocabulary and IDF weights on the training text only, then reuse
# them for the test text; both results are converted to dense arrays.
train_tfidf_sparse = tfidf.fit_transform(train['Text'].values)
text_train = train_tfidf_sparse.toarray()
text_test = tfidf.transform(test['Text'].values).toarray()

In [None]:
# (samples, TF-IDF features) of the dense training matrix.
text_train.shape

In [None]:
# Wrap the dense TF-IDF arrays in DataFrames that share the index of the
# frames they were computed from.
train2 = pd.DataFrame(data=text_train, index=train.index)
test2 = pd.DataFrame(data=text_test, index=test.index)

## SVD and dimensionality reduction

In [None]:
# Compress the TF-IDF features into a small number of SVD components. The
# decomposition is fitted on the training matrix and applied unchanged to the
# test matrix.
n_components = 70
component_names = [f'component №{i}' for i in range(1, n_components + 1)]

svd_truncated = TruncatedSVD(n_components=n_components, n_iter=40, random_state=42)
truncated_train = pd.DataFrame(svd_truncated.fit_transform(train2), columns=component_names)
truncated_test = pd.DataFrame(svd_truncated.transform(test2), columns=component_names)

# Both frames keep a default RangeIndex here rather than the index of
# train2 / test2.
truncated_train.head()

In [None]:
# Preview the SVD components computed for the test set.
truncated_test.head()

## One hot encoding

In [None]:
# Stack train and test so that the one-hot encoding of Gene and Variation
# produces the same columns for both, then discard the raw text column.
stacked = [train, test]
all_data = pd.concat(stacked).reset_index(drop=True)
all_data = pd.get_dummies(all_data, columns=['Gene', 'Variation'], drop_first=True)
all_data = all_data.drop(columns='Text')
all_data.head()

## Add features from SVD

In [None]:
# `all_data` is concat([train, test]) with its index reset, so its first
# len(train) rows are the training rows and the remainder are the test rows.
# Split by position rather than by label: looking rows up through the old
# train.index only works when those labels happen to be exactly 0..n-1.
n_train = len(train)

train = all_data.iloc[:n_train]
test = all_data.iloc[n_train:]

# Row labels of the test part, kept under the name `ind`.
ind = test.index.tolist()

# Give the SVD frames the row labels of the slice they belong to so that
# join() matches rows one-to-one.
truncated_train.index = train.index
truncated_test.index = test.index

train = train.join(truncated_train)
test = test.join(truncated_test)

train.shape, test.shape

In [None]:
# Preview the training frame after the SVD components were joined in.
train.head()

In [None]:
# Preview the test frame after the SVD components were joined in.
test.head()

## Train and Test

In [None]:
# Features are every column except the target.
X = train.drop(columns='Class')
# Shift the class labels down by one so that they start at 0
# (num_class is 9, i.e. ids 0..8).
Y = train['Class'].to_numpy() - 1

# The test frame has a Class column as well; drop it to get the test features.
X_test = test.drop(columns='Class')

In [None]:
# No hold-out split is made; the whole training set is used for fitting.
# Alternative with a 20% validation split:
# X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train = X.copy()
Y_train = Y.copy()

## XGBoost

In [None]:
# Booster settings for a 9-class problem that outputs one probability per
# class and is evaluated with multi-class log loss.
params = dict(
    objective='multi:softprob',
    eval_metric='mlogloss',
    learning_rate=0.1,
    max_depth=5,
    num_class=9,
    nthread=4,
    seed=42,
)

In [None]:
# Pack the training features and labels into XGBoost's DMatrix container.
matrix = xgboost.DMatrix(X_train, label=Y_train)

# Stratified 3-fold cross-validation for up to 400 rounds, with early stopping
# after 50 rounds; progress is printed every 100 rounds.
cv_options = dict(
    num_boost_round=400,
    nfold=3,
    stratified=True,
    early_stopping_rounds=50,
    verbose_eval=100,
    show_stdv=True,
)
xgb_grid = xgboost.cv(params, matrix, **cv_options)

In [None]:
# Number of boosting rounds recorded in the cross-validation history.
len(xgb_grid['test-mlogloss-mean'])

In [None]:
# Refit on the full training matrix for as many rounds as the
# cross-validation history contains.
best_num_rounds = len(xgb_grid['test-mlogloss-mean'])
xgb_classifier = xgboost.train(params, matrix, num_boost_round=best_num_rounds)

In [None]:
# Pack the test features into a DMatrix (no labels) for prediction.
matrix_test = xgboost.DMatrix(X_test)

In [None]:
# Predict on the test matrix; the result is laid out as one column per class
# when the submission frame is built.
pred = xgb_classifier.predict(matrix_test)

In [None]:
# One probability column per class, named class1..class9.
class_columns = [f'class{i}' for i in range(1, 10)]
submit = pd.DataFrame(pred, columns=class_columns)

# Recover the test IDs from the same merge that produced the test frame and
# put them in front as the ID column.
test_ids = pd.merge(df_test2, df_test, how='inner', on='ID').fillna('').index
submit.insert(loc=0, column='ID', value=test_ids)
submit

In [None]:
# Write the submission file without the DataFrame's own index column.
submit.to_csv('submission.csv', index=False)