In [None]:
import pandas as pd 
import numpy as np

## Librerías sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
## Load the competition's training and test tables (the test table is the one
## used for the submission) and key both by PetID.
train, test = (
    pd.read_csv(csv_path).set_index("PetID")
    for csv_path in (
        '../input/petfinder-adoption-prediction/train/train.csv',
        '../input/petfinder-adoption-prediction/test/test.csv',
    )
)

In [None]:
## Text processing of the Description column: Latent Semantic Analysis
## (TF-IDF followed by truncated SVD). The resulting dense vectors are the
## text representation for train and test.

# Keep the PetID indices so the feature matrices can be re-keyed later.
train_id = train.index
test_id = test.index

# Missing descriptions become the literal token "none" so the vectorizer
# always receives a string.
train_desc = train.Description.fillna("none").values
test_desc = test.Description.fillna("none").values

# use_idf / smooth_idf / sublinear_tf are documented as booleans. Newer
# scikit-learn releases validate parameter types and reject the integer 1,
# so pass real booleans.
tfv = TfidfVectorizer(min_df=3, max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
        stop_words='english')

# Fit TF-IDF on the training descriptions only (single pass via
# fit_transform), then apply the learned vocabulary and IDF weights to test.
X = tfv.fit_transform(train_desc)
X_test = tfv.transform(test_desc)
print("X (tfidf):", X.shape)
print("X_test (tfidf):", X_test.shape)

# Reduce the sparse TF-IDF matrices to 200 dense components. The SVD is
# fitted on train only; both splits then go through the same `transform`
# call so they share one projection.
svd = TruncatedSVD(n_components=200,
                   random_state=987)
svd.fit(X)
X = svd.transform(X)
X_test = svd.transform(X_test)
print("X (svd):", X.shape)
print("X_test (svd):", X_test.shape)

In [None]:
# Wrap the train SVD matrix in a DataFrame indexed by PetID. The column count
# is read from the matrix itself rather than hard-coded, so the cell keeps
# working if the number of SVD components is changed.
X = pd.DataFrame(
    X,
    index=train_id,
    columns=['svd_{}'.format(i) for i in range(X.shape[1])],
)
X.head()

In [None]:
# Wrap the test SVD matrix in a DataFrame indexed by PetID. The column count
# is read from the matrix itself rather than hard-coded, so the cell keeps
# working if the number of SVD components is changed.
X_test = pd.DataFrame(
    X_test,
    index=test_id,
    columns=['svd_{}'.format(i) for i in range(X_test.shape[1])],
)
X_test.head()

In [None]:
# Persist the SVD text features for both splits as parquet files in the
# current working directory (train first, then test).
for _features, _out_path in (
    (X, "train_text_features.parquet"),
    (X_test, "test_text_features.parquet"),
):
    _features.to_parquet(_out_path)

### Prueba con transformers -------------------------------------------------------------

In [None]:
# The embedding cells below import the `sentence_transformers` package.
# Uncomment the next line to install/upgrade it if it is not available:
# ! pip install -U sentence-transformers

In [None]:
## Load the training and test tables (test is used for the submission),
## keyed by PetID. These are the same CSV files read by the earlier load cell.
train, test = (
    pd.read_csv(csv_path).set_index("PetID")
    for csv_path in (
        '../input/petfinder-adoption-prediction/train/train.csv',
        '../input/petfinder-adoption-prediction/test/test.csv',
    )
)

## Pet identifiers (the PetID index of each table)
train_id, test_id = train.index, test.index

## Description texts used as model input; missing values become "none"
train_desc, test_desc = (
    frame["Description"].fillna("none").values for frame in (train, test)
)

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-embedding checkpoint to load.
SBERT_CHECKPOINT = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(SBERT_CHECKPOINT)

In [None]:
# Encode the description texts of each split with the sentence-transformer
# model (train first, then test).
embedding_train, embedding_test = (
    model.encode(descriptions) for descriptions in (train_desc, test_desc)
)

In [None]:
# Wrap each embedding matrix in a DataFrame indexed by PetID, with one
# SBERT_<i> column per embedding dimension.
X_train_bert = pd.DataFrame(
    embedding_train,
    index=train_id,
    columns=list(map('SBERT_{}'.format, range(embedding_train.shape[1]))),
)
X_test_bert = pd.DataFrame(
    embedding_test,
    index=test_id,
    columns=list(map('SBERT_{}'.format, range(embedding_test.shape[1]))),
)

In [None]:
# Persist the SBERT text features for both splits as parquet files in the
# current working directory (train first, then test).
for _features, _out_path in (
    (X_train_bert, "train_text_features_SBERT.parquet"),
    (X_test_bert, "test_text_features_SBERT.parquet"),
):
    _features.to_parquet(_out_path)