## Data Preprocessing

In [45]:
# Import libraries for data preparation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Serialization and configuration helpers (pickle, json, joblib, yaml)
import pickle
import json
import joblib
import yaml

import nltk
nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
import nltk
import re
import src.util as util

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
# Relative path of the YAML configuration file (a file path, despite the "_dir" suffix).
params_dir = "config/config.yaml"

In [47]:
def load_params(param_dir):
    """Read a YAML configuration file and return its parsed content.

    Parameters
    ----------
    param_dir : str or path-like
        Path of the YAML file to read (a file path, despite the name).

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the document.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so that parsing does not depend on the
    # platform's default locale encoding.
    with open(param_dir, 'r', encoding='utf-8') as file:
        params = yaml.safe_load(file)

    return params

In [48]:
# Parse the YAML config. In this notebook the result is only displayed;
# the later cells read their paths from config_data instead.
params = load_params(params_dir)

In [49]:
# Show the parsed configuration.
params

{'raw_dataset_dir': 'dataset/1 - raw data/',
 'train_set_path': ['dataset/2 - processed/X_train.pkl',
  'dataset/2 - processed/y_train.pkl'],
 'valid_set_path': ['dataset/2 - processed/X_valid.pkl',
  'dataset/2 - processed/y_valid.pkl'],
 'test_set_path': ['dataset/2 - processed/X_test.pkl',
  'dataset/2 - processed/y_test.pkl'],
 'train_bow_set_path': ['dataset/3 - final/X_train_bow.pkl',
  'dataset/3 - final/y_train_encoded.pkl'],
 'valid_bow_set_path': ['dataset/3 - final/X_valid_bow.pkl',
  'dataset/3 - final/y_valid_encoded.pkl'],
 'test_bow_set_path': ['dataset/3 - final/X_test_bow.pkl',
  'dataset/3 - final/y_test_encoded.pkl'],
 'train_tfidf_set_path': ['dataset/3 - final/X_train_tfidf.pkl',
  'dataset/3 - final/y_train_encoded.pkl'],
 'valid_tfidf_set_path': ['dataset/3 - final/X_valid_tfidf.pkl',
  'dataset/3 - final/y_valid_encoded.pkl'],
 'test_tfidf_set_path': ['dataset/3 - final/X_test_feng.pkl',
  'dataset/3 - final/X_test_tfidf.pkl'],
 'model_wordwvec': 'model/word2vec

In [50]:
# Load the project configuration through the helper in src.util;
# every path used in the cells below is looked up in this object.
config_data = util.load_config()

## Read Data

In [51]:
# Each "*_set_path" entry is indexed as [0] -> feature file, [1] -> label file.
X_train, y_train = (
    util.pickle_load(config_data["train_set_path"][0]),
    util.pickle_load(config_data["train_set_path"][1]),
)

X_valid, y_valid = (
    util.pickle_load(config_data["valid_set_path"][0]),
    util.pickle_load(config_data["valid_set_path"][1]),
)

X_test, y_test = (
    util.pickle_load(config_data["test_set_path"][0]),
    util.pickle_load(config_data["test_set_path"][1]),
)

In [52]:
# Preview the training features as loaded (before any text cleaning).
X_train

Unnamed: 0,title
5544,Imax 84532 Tenor Large Recycled Glass Vase
3120,Darice 6202-113 10-Light 5-Point Star Tree Top...
1387,5 Piece Full Size Frozen Bedding Set Includes ...
1075,New Star Foodservice Knives (Set of 12)
13665,Mr &amp; Mrs T Bold &amp; Spicy Bloody Mary Mi...
...,...
19690,"Aviditi MD24126 Multi-Depth Corrugated Box, 24..."
15580,Perfume Studio&reg; Graduated Glass Dropper Bo...
10534,"Brother HL-L2340DW Compact Laser Printer, Mono..."
2793,Hello Kitty Wastebasket - Garbage Can


In [53]:
# Preview the training labels as loaded.
y_train

5544              Home & Kitchen
3120              Home & Kitchen
1387              Home & Kitchen
1075              Home & Kitchen
13665     Grocery & Gourmet Food
                  ...           
19690            Office Products
15580    Industrial & Scientific
10534            Office Products
2793              Home & Kitchen
7250              Home & Kitchen
Name: category, Length: 8052, dtype: object

In [54]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is available, then keep the English
# entries in a set for the membership tests done in preprocess_text.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

def preprocess_text(df, column_name, stopword_set=None):
    """Return a copy of ``df`` whose ``column_name`` text has been cleaned.

    Each value has every character other than ASCII letters, digits and
    whitespace removed, is lower-cased, split on whitespace, filtered against
    the stopword collection, and re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is left unmodified.
    column_name : str
        Name of the text column to clean.
    stopword_set : collection of str, optional
        Words to drop after lower-casing. When omitted, the module-level
        ``stop_words`` is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``column_name`` holds the cleaned text.
        Missing values in that column become empty strings.
    """
    if stopword_set is None:
        stopword_set = stop_words

    def _clean(text):
        # Same two steps as before: strip symbols and lower-case first, then
        # drop stopwords and collapse whitespace runs to single spaces.
        normalised = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
        return ' '.join(word for word in normalised.split() if word not in stopword_set)

    # Work on a copy: mutating the caller's frame in place silently changes
    # DataFrames that were already displayed or are reused elsewhere.
    cleaned = df.copy()
    # Missing values become '' instead of making re.sub raise a TypeError.
    cleaned[column_name] = cleaned[column_name].fillna('').astype(str).map(_clean)

    return cleaned

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
# Clean the training titles: strip special characters, lower-case, drop stopwords.
X_train = preprocess_text(X_train, "title")

In [56]:
# Clean the test titles with preprocess_text.
X_test = preprocess_text(X_test, "title")

In [57]:
# Clean the validation titles with preprocess_text.
X_valid = preprocess_text(X_valid, "title")

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Learn the bag-of-words vocabulary from the training titles only.
count_vectorizer = CountVectorizer().fit(X_train['title'])

# Persist the fitted vectorizer at the path configured under "count_vectorizer".
with open(config_data["count_vectorizer"], 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

# Disabled alternative kept from the original cell:
# count_vectorizer = joblib.load(config_data["count_vectorizer"])

# Encode the training, test and validation titles with that vocabulary.
X_train_bow, X_test_bow, X_valid_bow = (
    count_vectorizer.transform(titles)
    for titles in (X_train['title'], X_test['title'], X_valid['title'])
)


# Later on, the stored CountVectorizer can be loaded and applied to new data
# like this:

# Load the CountVectorizer model
# loaded_count_vectorizer = joblib.load('count_vectorizer_model.pkl')

# Use the CountVectorizer model to transform new data
# new_data = ["Contoh teks baru untuk di-encode"]
# X_new_bow = loaded_count_vectorizer.transform(new_data)

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Initialise the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training titles only
tfidf_vectorizer.fit(X_train['title'])

# Persist the fitted TF-IDF vectorizer. This must dump `tfidf_vectorizer`:
# dumping `count_vectorizer` here would store the bag-of-words model in the
# file configured under "tfidf_vectorizer".
with open(config_data["tfidf_vectorizer"], 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

# tfidf_vectorizer = joblib.load(config_data["tfidf_vectorizer"])

# Transform the training, test and validation titles
X_train_tfidf = tfidf_vectorizer.transform(X_train['title'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['title'])
X_valid_tfidf = tfidf_vectorizer.transform(X_valid['title'])


    
# Later on, the stored TfidfVectorizer can be loaded and applied to new data
# like this:
# Load the TfidfVectorizer model
# loaded_tfidf_vectorizer = joblib.load('tfidf_vectorizer_model.pkl')
# X_new_tfidf = loaded_tfidf_vectorizer.transform(new_data)

In [60]:
"""
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# One-Hot Encoding (BoW) pada kolom 'title'
count_vectorizer = CountVectorizer()
X_train_bow = count_vectorizer.fit_transform(X_train['title'])
X_test_bow = count_vectorizer.transform(X_test['title'])
X_valid_bow = count_vectorizer.transform(X_valid['title'])

# TF-IDF pada kolom 'title'
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['title'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['title'])
X_valid_tfidf = tfidf_vectorizer.transform(X_valid['title'])
"""

"\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n\n# One-Hot Encoding (BoW) pada kolom 'title'\ncount_vectorizer = CountVectorizer()\nX_train_bow = count_vectorizer.fit_transform(X_train['title'])\nX_test_bow = count_vectorizer.transform(X_test['title'])\nX_valid_bow = count_vectorizer.transform(X_valid['title'])\n\n# TF-IDF pada kolom 'title'\ntfidf_vectorizer = TfidfVectorizer()\nX_train_tfidf = tfidf_vectorizer.fit_transform(X_train['title'])\nX_test_tfidf = tfidf_vectorizer.transform(X_test['title'])\nX_valid_tfidf = tfidf_vectorizer.transform(X_valid['title'])\n"

In [61]:
# Inspect the TF-IDF matrix built from the training titles.
X_train_tfidf

<8052x20797 sparse matrix of type '<class 'numpy.float64'>'
	with 79263 stored elements in Compressed Sparse Row format>

In [62]:
# Inspect the bag-of-words matrix built from the training titles.
X_train_bow

<8052x20797 sparse matrix of type '<class 'numpy.int64'>'
	with 79263 stored elements in Compressed Sparse Row format>

In [63]:
# Show the raw (not yet encoded) training labels.
y_train

5544              Home & Kitchen
3120              Home & Kitchen
1387              Home & Kitchen
1075              Home & Kitchen
13665     Grocery & Gourmet Food
                  ...           
19690            Office Products
15580    Industrial & Scientific
10534            Office Products
2793              Home & Kitchen
7250              Home & Kitchen
Name: category, Length: 8052, dtype: object

In [64]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only, then reuse the learned
# class -> integer mapping for the test and validation labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded, y_valid_encoded = (
    le.transform(labels) for labels in (y_test, y_valid)
)

In [67]:
from sklearn.preprocessing import LabelEncoder

# Create a fresh encoder and refit it on the training labels.
# NOTE(review): this repeats the fit done in the preceding cell.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Categories in the order the fitted encoder stores them.
encoded_classes = list(le.classes_)

# Report the integer code assigned to each category.
print("Kategori yang diubah ke bilangan bulat:")
category_codes = zip(encoded_classes, le.transform(encoded_classes))
for kategori, kode in category_codes:
    print(f"{kategori} -> {kode}")


Kategori yang diubah ke bilangan bulat:
Electronics -> 0
Grocery & Gourmet Food -> 1
Home & Kitchen -> 2
Industrial & Scientific -> 3
Office Products -> 4
Tools & Home Improvement -> 5


## Save Data

In [65]:
# Persist the bag-of-words matrices together with their encoded labels.
# For every split the configured pair is used as [0] -> features, [1] -> labels.
for split_key, features, labels in (
    ("train_bow_set_path", X_train_bow, y_train_encoded),
    ("valid_bow_set_path", X_valid_bow, y_valid_encoded),
    ("test_bow_set_path", X_test_bow, y_test_encoded),
):
    util.pickle_dump(features, config_data[split_key][0])
    util.pickle_dump(labels, config_data[split_key][1])

In [66]:
# Persist the TF-IDF matrices together with their encoded labels.
# For every split the configured pair is used as [0] -> features, [1] -> labels.
# NOTE(review): the configuration printed earlier in this notebook lists
# "test_tfidf_set_path" as [X_test_feng.pkl, X_test_tfidf.pkl]. If config_data
# holds the same values, the test labels below are written to X_test_tfidf.pkl
# -- verify config.yaml.
for split_key, features, labels in (
    ("train_tfidf_set_path", X_train_tfidf, y_train_encoded),
    ("valid_tfidf_set_path", X_valid_tfidf, y_valid_encoded),
    ("test_tfidf_set_path", X_test_tfidf, y_test_encoded),
):
    util.pickle_dump(features, config_data[split_key][0])
    util.pickle_dump(labels, config_data[split_key][1])

In [None]:
# Initialise and fit the TfidfVectorizer
def fit_tfidf_vectorizer(X_train, save_path=None):
    """Fit a TfidfVectorizer on the ``'title'`` column and pickle it to disk.

    Parameters
    ----------
    X_train : mapping-like with a ``'title'`` entry (e.g. a DataFrame)
        Training data; only ``X_train['title']`` is used for fitting.
    save_path : str or path-like, optional
        Destination of the pickled vectorizer. When omitted, the path stored
        under ``config_data["tfidf_vectorizer"]`` is used, so one-argument
        calls keep working and calls that pass the path explicitly are
        accepted as well.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer.
    """
    if save_path is None:
        save_path = config_data["tfidf_vectorizer"]

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(X_train['title'])

    # Store the fitted TfidfVectorizer
    with open(save_path, 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)

    return tfidf_vectorizer

# Read a pickled TfidfVectorizer back from a file
def load_tfidf_vectorizer(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only point this at files
    produced by this project.
    """
    with open(file_path, 'rb') as source:
        return pickle.load(source)


# Fit a LabelEncoder and persist it
def fit_label_encoder(y_train, save_path):
    """Fit a LabelEncoder on ``y_train``, pickle it to ``save_path`` and return it.

    Parameters
    ----------
    y_train : array-like
        Labels the encoder is fitted on.
    save_path : str or path-like
        File the fitted encoder is pickled into (opened in binary write mode).

    Returns
    -------
    LabelEncoder
        The fitted encoder.
    """
    label_encoder = LabelEncoder().fit(y_train)

    # Write the fitted encoder to disk before handing it back.
    with open(save_path, 'wb') as sink:
        pickle.dump(label_encoder, sink)

    return label_encoder

# Read a pickled LabelEncoder back from a file
def load_label_encoder(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only point this at files
    produced by this project.
    """
    with open(file_path, 'rb') as source:
        return pickle.load(source)

# Fit the label encoder on the training labels and store it at the path
# configured under "label_encoder".
label_encoder = fit_label_encoder(y_train, config_data["label_encoder"])

# Encode the training, test and validation labels with the fitted encoder.
y_train_encoded, y_test_encoded, y_valid_encoded = map(
    label_encoder.transform, (y_train, y_test, y_valid)
)

# Fit the TF-IDF vectorizer on the training titles. fit_tfidf_vectorizer writes
# the fitted model to config_data["tfidf_vectorizer"] by itself; the original
# call passed that path as an extra positional argument that the function's
# definition did not declare, which raises a TypeError.
tfidf_vectorizer = fit_tfidf_vectorizer(X_train)

# Load the vectorizer back from the file so the transforms below exercise the
# persisted artifact rather than the in-memory object.
tfidf_vectorizer = load_tfidf_vectorizer(config_data["tfidf_vectorizer"])

# Transform the training data
X_train_tfidf = tfidf_vectorizer.transform(X_train['title'])
# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test['title'])
# Transform the validation data
X_valid_tfidf = tfidf_vectorizer.transform(X_valid['title'])