In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re

In [2]:
# Load the three tab-separated splits, in train / valid / test order.
train, valid, test = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (
        '../data/train.txt',
        '../data/valid.txt',
        '../data/test.txt',
    )
)

In [3]:
# Preprocessing: 1. punctuation -> space  2. runs of digits -> "0"  3. lowercase

# Both helpers are built once here rather than on every call, because
# preprocessing() is applied to every row of every split.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))
_DIGIT_RUN = re.compile(r'[0-9]+')


def preprocessing(text: str) -> str:
    """Normalise a piece of text for vectorisation.

    Steps, in order:
      1. Every character in ``string.punctuation`` is replaced by a single
         space (the string length is preserved).
      2. Every maximal run of ASCII digits is collapsed to the single
         character ``'0'``.
      3. The result is lowercased.

    Because punctuation is replaced before digits are collapsed, a value such
    as ``"2.5"`` becomes ``"0 0"`` rather than ``"0"``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.translate(_PUNCT_TO_SPACE)
    text = _DIGIT_RUN.sub('0', text)
    return text.lower()

In [4]:
# Normalise the Title column of every split with the same function.
for split_df in (train, valid, test):
    split_df['Title'] = split_df['Title'].apply(preprocessing)

In [5]:
# Fit the TF-IDF vocabulary on the training titles only, then reuse the
# fitted vectorizer for the validation and test titles.
# NOTE(review): scikit-learn's default token_pattern keeps only tokens of two
# or more characters, so the single-character '0' placeholder produced by
# preprocessing() probably never becomes a feature - confirm this is intended.
vectorizer = TfidfVectorizer(min_df=10)

train_sparse = vectorizer.fit_transform(train['Title'].values)
valid_sparse, test_sparse = (
    vectorizer.transform(split_df['Title'].values)
    for split_df in (valid, test)
)

vocab = vectorizer.get_feature_names_out()

# Densify each matrix into a DataFrame with one column per vocabulary term.
X_train, X_valid, X_test = (
    pd.DataFrame(sparse_matrix.toarray(), columns=vocab)
    for sparse_matrix in (train_sparse, valid_sparse, test_sparse)
)

# Drop the sparse intermediates; only the dense frames are used afterwards.
del train_sparse, valid_sparse, test_sparse

In [6]:
# Write each feature matrix as a tab-separated file (row index included).
for features, out_path in (
    (X_train, '../data/train_vec.txt'),
    (X_valid, '../data/valid_vec.txt'),
    (X_test, '../data/test_vec.txt'),
):
    features.to_csv(out_path, sep='\t')