In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the raw reviews CSV into a DataFrame.
dataset_reviews = pd.read_csv('../data/raw/dataset_reviews.csv')

# Keep only the label column ('rating') and the text column ('reviewContent').
# .copy() makes `dataset` an independent frame rather than a view of the raw data.
selected_columns = ['rating', 'reviewContent']
dataset = dataset_reviews.loc[:, selected_columns].copy()

In [3]:
# Drop every row that has a missing rating or a missing review text.
# Reassigning instead of using inplace=True keeps the step explicit: the cell
# produces a new frame from its input rather than mutating it behind the name.
dataset = dataset.dropna()

# Sanity check: per-column count of remaining nulls (expected to be all zeros).
dataset.isnull().sum()

rating           0
reviewContent    0
dtype: int64

In [4]:
# Split the review text (features) and the ratings (labels) into train and
# test subsets. The subsets are serialized, vectorized and serialized again
# in the cells that follow.

# NOTE(review): test_size=0.8 holds out 80% of the rows for testing and leaves
# only 20% for training. The value is kept unchanged here; confirm that this is
# intended and not a mix-up with the more common test_size=0.2.
#
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split; without it the pickled x_* / y_* artifacts differ on every run.
x_train, x_test, y_train, y_test = train_test_split(
    dataset['reviewContent'],
    dataset['rating'],
    test_size=0.8,
    random_state=42,
)

In [5]:
# Persist the raw (not yet vectorized) text splits to the interim data folder.
interim_outputs = {
    '../data/interim/x_train_1.pkl': x_train,
    '../data/interim/x_test_1.pkl': x_test,
}
for destination, text_split in interim_outputs.items():
    text_split.to_pickle(destination)

In [6]:
# Turn the review text into TF-IDF feature matrices.
vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training text only ...
x_train_vec = vectorizer.fit_transform(raw_documents=x_train)

# ... and the already-fitted vectorizer is then applied to the test text.
x_test_vec = vectorizer.transform(raw_documents=x_test)

In [7]:
# Persist the fitted vectorizer and both TF-IDF matrices with joblib.
joblib_outputs = (
    (vectorizer, '../models/vectorizer_1.pkl'),
    (x_train_vec, '../data/processed/x_train_vec_1.pkl'),
    (x_test_vec, '../data/processed/x_test_vec_1.pkl'),
)
for artifact, destination in joblib_outputs:
    joblib.dump(artifact, destination)

# The label splits are written with their own to_pickle method.
label_outputs = (
    (y_train, '../data/processed/y_train_1.pkl'),
    (y_test, '../data/processed/y_test_1.pkl'),
)
for labels, destination in label_outputs:
    labels.to_pickle(destination)