In [2]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import save_npz

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
# Read the cleaned dataset, then separate the text column (model input)
# from the label column (target).
dataset_path = "../Datasets/final_dataset_cleaned.csv"
df = pd.read_csv(dataset_path)
X, y = df["text"], df["label"]

# Quick sanity report: overall shape and how many rows each class has.
print("Dataset size:", df.shape)
print("Class distribution:\n", y.value_counts())


Dataset size: (39906, 2)
Class distribution:
 label
1    19995
0    19911
Name: count, dtype: int64


In [4]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", X_train.shape)
print("Test size :", X_test.shape)


Train size: (31924,)
Test size : (7982,)


In [5]:
# TF-IDF settings collected in one mapping so they are easy to scan and tune.
tfidf_settings = {
    "max_features": 10000,  # keep at most 10,000 vocabulary terms
    "ngram_range": (1, 2),  # use unigrams and bigrams
    "min_df": 5,            # drop terms found in fewer than 5 documents
    "max_df": 0.8,          # drop terms found in more than 80% of documents
    "sublinear_tf": True,   # scale term frequency as 1 + log(tf)
}
tfidf = TfidfVectorizer(**tfidf_settings)


In [6]:
# Fit the vocabulary and IDF weights on the training split only; the test
# split is transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Report the resulting (rows, features) shape of each matrix.
for caption, matrix in (
    ("TF-IDF train shape:", X_train_tfidf),
    ("TF-IDF test shape :", X_test_tfidf),
):
    print(caption, matrix.shape)


TF-IDF train shape: (31924, 10000)
TF-IDF test shape : (7982, 10000)


In [7]:
# Persist the fitted vectorizer. joblib.dump does not create missing parent
# directories, so make sure the target folder exists first (a fresh checkout
# may not have it). The dump stays the last expression so the cell still
# displays the list of written file names.
Path("../Models").mkdir(parents=True, exist_ok=True)
joblib.dump(tfidf, "../Models/tfidf_vectorizer.pkl")

['../Models/tfidf_vectorizer.pkl']

In [9]:
# Write the sparse TF-IDF matrices (.npz) and the label arrays (.npy) to disk.
# `save_npz`, `np` and `Path` come from the import cell at the top of the
# notebook, so nothing is imported mid-notebook here.

# Neither save_npz nor np.save creates missing directories, so create the
# output folder up front instead of failing with FileNotFoundError.
Path("../Models/Features").mkdir(parents=True, exist_ok=True)

save_npz("../Models/Features/X_train_tfidf.npz", X_train_tfidf)
save_npz("../Models/Features/X_test_tfidf.npz", X_test_tfidf)

# Labels are stored as plain NumPy arrays (the pandas index is dropped).
np.save("../Models/Features/y_train.npy", y_train.values)
np.save("../Models/Features/y_test.npy", y_test.values)
