In [1]:
import os

os.chdir("../")
%pwd

'/home/siddhu/Desktop/Movie-Recommendation-System'

In [2]:
import ast
from dataclasses import dataclass
from pathlib import Path

import pandas as pd

from src.movieRecommendation.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.movieRecommendation.utils.common import read_yaml, create_directories

In [3]:
@dataclass
class DataTransformationConfig:
    """Filesystem locations used by the data-transformation stage.

    Field order matters: it defines the positional order of the generated
    ``__init__``.
    """

    # Directory the stage writes ``transformed.csv`` into.
    root_dir: Path
    # Directory the stage reads ``final.csv`` from.
    data_path: Path

In [17]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_path: Location of the main configuration YAML.
            params_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` whose fields are ``Path`` objects.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])
        # The dataclass declares ``Path`` fields, while the values read from
        # YAML are presumably plain strings; coerce so the annotation holds.
        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
        )

In [20]:
class DataTransformation:
    """Build one text feature per movie from the merged movie table.

    The pipeline reads ``final.csv`` from ``config.data_path``, removes the
    columns listed in ``columns_to_drop``, normalises ``genres`` and
    ``production_companies``, joins the text columns into
    ``concat_description`` (with the genres repeated to weight them) and
    writes ``transformed.csv`` to ``config.root_dir``.

    Missing values are rendered as empty strings rather than the literal
    token ``"nan"``, and the frame-level helpers return new DataFrames
    instead of mutating their argument.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage config and the list of columns to discard.

        Args:
            config: Object exposing ``data_path`` (input directory) and
                ``root_dir`` (output directory).
        """
        self.config = config
        self.columns_to_drop = [
            "positive_users",
            "positive_count",
            "negative_users",
            "negative_count",
            "vote_average",
            "vote_count",
            "status",
            "release_date",
            "revenue",
            "runtime",
            "budget",
            "poster_path",
            "movieId",
            "imdbId",
            "tmdb_id",
            "imdb_id",
            "adult",
            "tmdbId",
        ]

    @staticmethod
    def _is_missing(value):
        """Return True for ``None``, ``pd.NA`` and float NaN scalars."""
        # ``value != value`` is the NaN test; it avoids calling ``pd.isna`` on
        # list-like values, where the result would be an array.
        return (
            value is None
            or value is pd.NA
            or (isinstance(value, float) and value != value)
        )

    def drop_columns(self, df):
        """Return a copy of ``df`` without the columns in ``columns_to_drop``.

        Args:
            df: Input DataFrame; it is left untouched.

        Returns:
            A new DataFrame without the listed columns.

        Raises:
            KeyError: If any listed column is absent from ``df``.
        """
        return df.drop(columns=self.columns_to_drop)

    def clean_genres(self, x):
        """Normalise one ``genres`` cell to a comma-separated string.

        Args:
            x: A list-like of genre names, the string form of such a list
                (e.g. ``"['Action', 'Drama']"``), any other string, or a
                missing value.

        Returns:
            The stripped names joined by ``", "``; an empty string for a
            missing value; ``str(x)`` when the value cannot be parsed or
            iterated.
        """
        if self._is_missing(x):
            return ""
        try:
            # Convert the string representation of a list into a real list.
            if isinstance(x, str):
                x = ast.literal_eval(x)
            return ", ".join(str(item).strip() for item in x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best effort: not a Python literal (or not iterable), so keep the
            # text as it is. Only the exceptions documented for
            # ``ast.literal_eval`` are caught, so KeyboardInterrupt and
            # SystemExit still propagate.
            return str(x)

    def clean_production_companies(self, x):
        """Collapse every company name in a comma-separated cell to one token.

        Spaces inside each name are removed (``"Warner Bros."`` becomes
        ``"WarnerBros."``) and the names are re-joined with ``", "``.

        Args:
            x: Comma-separated company names, or a missing value.

        Returns:
            The normalised string; an empty string for a missing value.
        """
        if self._is_missing(x):
            return ""
        return ", ".join(name.replace(" ", "") for name in str(x).split(","))

    def _as_text(self, series):
        """Return ``series`` as strings, with missing entries as ``""``."""
        as_text = series.map(lambda v: "" if self._is_missing(v) else str(v))
        # The final cast keeps an empty Series string-typed as well.
        return as_text.astype(str)

    def concat_features(self, df):
        """Join the text columns into a single ``concat_description`` column.

        The description is ``overview``, ``keywords``, ``genres``,
        ``production_companies`` and ``original_language``, each followed by
        one space. All source columns except ``genres`` are then dropped.

        Args:
            df: DataFrame holding the five text columns; it is left untouched.

        Returns:
            A new DataFrame with ``concat_description`` appended and the
            consumed columns removed.
        """
        source_columns = [
            "overview",
            "keywords",
            "genres",
            "production_companies",
            "original_language",
        ]
        description = None
        for name in source_columns:
            piece = self._as_text(df[name]) + " "
            description = piece if description is None else description + piece

        # ``genres`` stays because ``weight_description`` still needs it.
        consumed = [name for name in source_columns if name != "genres"]
        return df.assign(concat_description=description).drop(columns=consumed)

    def weight_description(self, row, genre_weight=3):
        """Append the row's genres ``genre_weight`` times to its description.

        Args:
            row: Mapping or Series with ``"genres"`` (comma-separated string)
                and ``"concat_description"``.
            genre_weight: How many times the genre list is repeated.

        Returns:
            ``concat_description``, a space, then the repeated genre tokens
            separated by spaces.
        """
        # Drop empty tokens so a blank genres cell adds nothing but the space.
        genres_list = [g.strip() for g in row["genres"].split(",") if g.strip()]
        genres_weighted = " ".join(genres_list * genre_weight)
        return row["concat_description"] + " " + genres_weighted

    def transform(self):
        """Run the full pipeline from ``final.csv`` to ``transformed.csv``."""
        source_csv = Path(self.config.data_path) / "final.csv"
        target_csv = Path(self.config.root_dir) / "transformed.csv"

        df = self.drop_columns(pd.read_csv(source_csv))
        df["genres"] = df["genres"].apply(self.clean_genres)
        df["production_companies"] = df["production_companies"].apply(
            self.clean_production_companies
        )
        df = self.concat_features(df)
        # Iterating over records avoids the per-row Series construction of
        # ``df.apply(axis=1)`` and also works when the frame has no rows.
        rows = df[["genres", "concat_description"]].to_dict("records")
        df["concat_description"] = [self.weight_description(row) for row in rows]
        df = df.drop(columns=["genres"])
        df.to_csv(target_csv, index=False)

In [21]:
# Run the data-transformation stage: load the YAML-backed configuration,
# pull out the settings for this stage, and hand them to the pipeline.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
# Reads final.csv from the configured data_path and writes transformed.csv
# into the configured root_dir.
data_transformation.transform()

[2026-02-16 09:19:40,550: INFO: common: YAML file 'config/config.yaml' read successfully.]
[2026-02-16 09:19:40,551: INFO: common: YAML file 'params.yaml' read successfully.]
[2026-02-16 09:19:40,551: INFO: common: Directory 'artifacts' created successfully or already exists.]
[2026-02-16 09:19:40,552: INFO: common: Directory 'artifacts/data_transformation' created successfully or already exists.]
