In [None]:
from transformers import BertTokenizer
import pandas as pd


# Shared tokenizer instance, created once when this module is imported and
# used by tokenize_text() below. Uses the "bert-base-uncased" checkpoint.
# NOTE(review): from_pretrained presumably fetches/caches the vocabulary on
# first use, so importing this module may touch the network — confirm.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize_text(train_texts, test_texts):
    """Encode the train and test texts with the module-level ``tokenizer``.

    Both inputs must expose ``.tolist()`` (e.g. a pandas Series); each is
    converted to a plain list and tokenized in its own call with the same
    settings: truncation enabled, padding enabled, ``max_length=128`` and
    PyTorch tensors as the return format.

    Args:
        train_texts: Training texts; ``.tolist()`` is called on it.
        test_texts: Test texts; ``.tolist()`` is called on it.

    Returns:
        A ``(train_encodings, test_encodings)`` tuple holding whatever the
        tokenizer returns for each split.
    """
    # One settings dict keeps the two calls in lock-step.
    # NOTE(review): with padding=True each call is presumably padded to the
    # longest sequence of that call, so the two splits may end up with
    # different sequence lengths — confirm against the tokenizer docs.
    shared_kwargs = {
        "truncation": True,
        "padding": True,
        "max_length": 128,
        "return_tensors": "pt",
    }

    encoded_train = tokenizer(train_texts.tolist(), **shared_kwargs)
    encoded_test = tokenizer(test_texts.tolist(), **shared_kwargs)
    return encoded_train, encoded_test


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

def load_data(file_path: str) -> pd.DataFrame:
    """Read the CSV file at ``file_path`` and return it as a DataFrame.

    Args:
        file_path: Path passed straight through to ``pandas.read_csv``.

    Returns:
        The parsed DataFrame, using ``read_csv`` defaults.
    """
    frame = pd.read_csv(file_path)
    return frame

def basic_cleaning(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` whose ``review`` column has been normalised.

    The following steps are applied to ``review``, in order:

    1. lower-case the text;
    2. replace HTML-like tags (``<...>``) with a single space;
    3. remove URLs (``http`` followed by any non-whitespace run);
    4. collapse whitespace runs to one space and strip both ends.

    Tags are replaced with a space rather than deleted so that words
    separated only by markup (``"great<br />film"``) do not fuse into a
    single token; the whitespace collapse in step 4 removes any surplus
    spaces this introduces.

    Args:
        data: Frame containing a string ``review`` column. It is not
            modified; all other columns are carried over unchanged.

    Returns:
        A new DataFrame with the cleaned ``review`` column.

    Raises:
        KeyError: If ``data`` has no ``review`` column.
    """
    cleaned = (
        data["review"]
        .str.lower()
        # Space, not "", so neighbouring words stay separated.
        .str.replace(r"<.*?>", " ", regex=True)
        .str.replace(r"http\S+", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    # assign() builds a new frame, leaving the caller's DataFrame untouched.
    return data.assign(review=cleaned)

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Run the preprocessing pipeline on ``data``.

    Currently this is a single step: the frame is handed to
    ``basic_cleaning`` and its result is returned as-is.

    Args:
        data: Frame to preprocess; forwarded unchanged to ``basic_cleaning``.

    Returns:
        Whatever ``basic_cleaning`` returns for ``data``.
    """
    cleaned_frame = basic_cleaning(data)
    return cleaned_frame

def vectorize_data(X_train, X_test):
    """Turn the train and test texts into TF-IDF feature matrices.

    A ``TfidfVectorizer`` (``max_features=20000``, ``min_df=5``,
    ``max_df=0.8``) is fitted on ``X_train`` only and then applied to
    ``X_test``, so the test split does not influence the fitted vectorizer.
    The fitted vectorizer itself is not returned.

    Args:
        X_train: Training texts used to fit and transform.
        X_test: Test texts, transformed with the already-fitted vectorizer.

    Returns:
        A ``(X_train_vec, X_test_vec)`` tuple of the transformed splits.
    """
    tfidf = TfidfVectorizer(max_features=20000, min_df=5, max_df=0.8)

    # Fit on the training split first; the test split is only transformed.
    train_matrix = tfidf.fit_transform(X_train)
    test_matrix = tfidf.transform(X_test)
    return train_matrix, test_matrix


def encode_data(y_train, y_test):
    """Encode the train and test labels with a shared ``LabelEncoder``.

    The encoder is fitted on ``y_train`` only and then used to transform
    ``y_test``. The fitted encoder itself is not returned.

    Args:
        y_train: Training labels used to fit the encoder.
        y_test: Test labels, transformed with the already-fitted encoder.
            NOTE(review): labels absent from ``y_train`` are presumably
            rejected by ``transform`` — confirm against scikit-learn docs.

    Returns:
        A ``(y_train_enc, y_test_enc)`` tuple of encoded labels.
    """
    label_encoder = LabelEncoder()
    train_codes = label_encoder.fit_transform(y_train)
    test_codes = label_encoder.transform(y_test)
    return train_codes, test_codes


In [None]:
from src.preprocessing import (
    load_data,
    preprocess_data,
    vectorize_data,
    encode_data
)
from src.train import tokenize_text
from sklearn.model_selection import train_test_split

def main():
    """Run the end-to-end data preparation pipeline.

    Loads ``data/IMDB_Dataset.csv``, preprocesses it, makes a stratified
    80/20 train/test split of the ``review`` and ``sentiment`` columns
    (``random_state=42``), then builds TF-IDF features, encoded labels and
    tokenizer encodings for both splits.

    NOTE(review): every computed artefact is a local that is neither
    returned nor stored, so the function currently returns ``None``.
    """
    csv_path = "data/IMDB_Dataset.csv"
    frame = preprocess_data(load_data(csv_path))

    reviews = frame["review"]
    labels = frame["sentiment"]

    # Stratify on the labels so both splits keep the same class balance.
    reviews_train, reviews_test, labels_train, labels_test = train_test_split(
        reviews,
        labels,
        test_size=0.20,
        random_state=42,
        shuffle=True,
        stratify=labels,
    )

    # Three independent views of the same split: TF-IDF features, encoded
    # labels, and tokenizer encodings of the review texts.
    tfidf_train, tfidf_test = vectorize_data(reviews_train, reviews_test)
    labels_train_enc, labels_test_enc = encode_data(labels_train, labels_test)
    tokens_train, tokens_test = tokenize_text(reviews_train, reviews_test)

# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
