In [2]:
import os

os.chdir("../")
%pwd

'/home/siddhu/Desktop/Movie-Recommendation-System'

In [10]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from dataclasses import dataclass
from pathlib import Path
from src.movieRecommendation.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.movieRecommendation.utils.common import read_yaml, create_directories

# word_tokenize needs the Punkt models ("punkt" on older NLTK releases,
# "punkt_tab" on newer ones) and WordNetLemmatizer needs the "wordnet" corpus.
# Fetch them as well so the notebook also runs on a machine with no NLTK data.
for _resource in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(_resource, quiet=True)

# Kept as the last expression so the cell still displays this call's result.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/siddhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
@dataclass
class DataPreparationConfig:
    """Settings consumed by the data-preparation stage.

    Both fields are filled in by ``ConfigurationManager.get_data_preparation_config``
    from the ``data_preparation`` section of the YAML configuration.
    """

    # Directory that receives the stage output (``prepared.csv``).
    root_dir: Path
    # Directory that holds the stage input (``transformed.csv``).
    data_path: Path

In [12]:
class ConfigurationManager:
    """Load the project YAML files and build per-stage configuration objects."""

    def __init__(
        self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH
    ):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_preparation_config(self) -> DataPreparationConfig:
        """Return the settings for the data-preparation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataPreparationConfig`` built from the ``data_preparation``
            section of the configuration file.
        """
        section = self.config.data_preparation
        create_directories([section.root_dir])
        # The dataclass declares both fields as Path, so convert the values
        # read from YAML instead of passing them through untouched.
        return DataPreparationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )

In [13]:
class DataPreparation:
    """Clean the ``concat_description`` text column and save the result.

    ``prepare`` reads ``transformed.csv`` from ``config.data_path``, runs each
    description through the cleaning steps defined below and writes the frame,
    with an added ``cleaned_description`` column, to ``prepared.csv`` in
    ``config.root_dir``.
    """

    # Matches a single ASCII digit; compiled once for every call.
    _DIGIT_PATTERN = re.compile(r"[0-9]")

    def __init__(self, config: "DataPreparationConfig"):
        """Store the configuration and build the reusable NLTK helpers.

        Args:
            config: Paths of the input directory and the output directory.
        """
        self.config = config
        self.lemmatizer = WordNetLemmatizer()
        # Built once here rather than once per row: loading the stop-word list
        # and constructing the tokenizer are loop-invariant.
        self.stop_words = frozenset(stopwords.words("english"))
        self.punctuation_tokenizer = RegexpTokenizer(r"[\w-]+")

    def make_lower_case(self, text):
        """Return ``text`` converted to lower case."""
        return text.lower()

    def remove_stop_words(self, text):
        """Drop English stop words from ``text``.

        The text is split on whitespace; the words that are not in
        ``self.stop_words`` are joined back with single spaces.
        """
        return " ".join(
            word for word in text.split() if word not in self.stop_words
        )

    def remove_numbers(self, text):
        """Delete every ASCII digit from ``text``; other characters are kept."""
        return self._DIGIT_PATTERN.sub("", text)

    def remove_punctuation(self, text):
        """Keep only runs of word characters and hyphens, joined by single spaces."""
        return " ".join(self.punctuation_tokenizer.tokenize(text))

    def lemmatize_text(self, text):
        """Tokenize ``text`` and lemmatize each lower-cased token."""
        return " ".join(
            self.lemmatizer.lemmatize(token.lower())
            for token in word_tokenize(text)
        )

    def prepare(self):
        """Run the cleaning pipeline and write ``prepared.csv``.

        Reads ``transformed.csv`` from ``config.data_path``, derives
        ``cleaned_description`` from ``concat_description`` and saves the
        resulting frame (without the index) to ``config.root_dir``.
        """
        csv_path = os.path.join(self.config.data_path, "transformed.csv")
        df = pd.read_csv(csv_path)

        # Missing descriptions are read as NaN (a float), which would fail on
        # ``.lower()``; treat them as empty text instead.
        descriptions = df["concat_description"].fillna("").astype(str)

        cleaned = (
            descriptions
            .apply(self.make_lower_case)
            .apply(self.remove_punctuation)
            .apply(self.remove_numbers)
            .apply(self.lemmatize_text)
            .apply(self.remove_stop_words)
        )

        # assign() returns a new frame, so the frame read from disk is untouched.
        df_cleaned = df.assign(cleaned_description=cleaned)
        df_cleaned.to_csv(
            os.path.join(self.config.root_dir, "prepared.csv"), index=False
        )

In [14]:
# Run the data-preparation stage end to end.
# Constructing ConfigurationManager reads the config and params YAML files and
# creates the artifacts root directory.
config = ConfigurationManager()
# Also creates the stage's own output directory.
data_preparation_config = config.get_data_preparation_config()
data_preparation = DataPreparation(config=data_preparation_config)
# Reads transformed.csv, adds `cleaned_description`, writes prepared.csv.
data_preparation.prepare()

[2026-02-14 14:57:40,953: INFO: common: YAML file 'config/config.yaml' read successfully.]
[2026-02-14 14:57:40,954: INFO: common: YAML file 'params.yaml' read successfully.]
[2026-02-14 14:57:40,955: INFO: common: Directory 'artifacts' created successfully or already exists.]
[2026-02-14 14:57:40,957: INFO: common: Directory 'artifacts/data_preparation' created successfully or already exists.]
