In [1]:
import os 

In [2]:
%pwd

'c:\\Users\\rajat\\Desktop\\Project\\End_to_End_Mobile_Price_Prediction\\research'

In [3]:
# Work from the project root so the relative paths used by later cells resolve.
# The move is skipped once config/config.yaml is reachable from the current
# directory, so re-running this cell does not climb one level further each time.
# NOTE(review): assumes the notebook folder itself has no config/config.yaml.
if not os.path.exists(os.path.join("config", "config.yaml")):
    os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Attributes:
        root_dir: Directory that receives this stage's output files.
        data_path: Location of the CSV file that the stage reads.
    """

    root_dir: Path
    data_path: Path

In [5]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file exposed afterwards as ``self.config``.
            params_filepath: YAML file exposed afterwards as ``self.params``.
            schema_filepath: YAML file exposed afterwards as ``self.schema``.
        """
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        # Files are read in the order listed above, one attribute per file.
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return its settings.

        Returns:
            A ``DataTransformationConfig`` filled from the
            ``data_transformation`` section of the main config file.
        """
        section = self.config.data_transformation

        # The output directory must exist before the stage writes into it.
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [7]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [13]:
class DataTransformation:
    """Cleans the raw CSV into numeric features and writes train/test splits."""

    # Output columns holding the per-lens values parsed from "Camera (MP)".
    _CAMERA_COLUMNS = ("res1", "res2", "res3", "res4")

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage settings; ``data_path`` and ``root_dir`` are read later."""
        self.config = config

    @staticmethod
    def _split_padded(values, separator, width, pad="0"):
        """Split every string on ``separator`` into exactly ``width`` pieces.

        Pieces beyond ``width`` are dropped and missing ones are filled with
        ``pad``, so each row has the same length whatever the input looks like.

        Returns:
            A list with one ``width``-long list of strings per input value.
        """
        rows = []
        for value in values:
            pieces = value.split(separator)[:width]
            rows.append(pieces + [pad] * (width - len(pieces)))
        return rows

    @staticmethod
    def _first_int(series):
        """Return the first run of digits of each string as int (0 if none)."""
        return series.str.extract(r"(\d+)", expand=False).fillna("0").astype(int)

    def clean_data(self):
        """Load the CSV at ``config.data_path`` and return a numeric DataFrame.

        Steps:
            * "Price ($)", "RAM " and "Storage " are reduced to integers.
            * "Camera (MP)" becomes ``n_cameras`` plus ``res1``..``res4``.
            * "Screen Size (inches)" becomes the float column ``screen``.
            * "Brand" is label-encoded and "Model" is dropped.

        Returns:
            The cleaned DataFrame; "RAM ", "Storage " and
            "Battery Capacity (mAh)" are renamed to "RAM", "Storage" and
            "Battery_Capacity".
        """
        df = pd.read_csv(self.config.data_path)

        # Strip thousands separators first: the digit pattern stops at the
        # first non-digit, so "1,199" would otherwise be read as 1.
        df["Price ($)"] = (
            df["Price ($)"]
            .str.replace(",", "", regex=False)
            .str.extract(r"(\d+)", expand=False)
            .astype(int)
        )

        for column in ("RAM ", "Storage "):
            df[column] = df[column].str.replace("GB", "", regex=False).astype(int)

        # One lens per "+"-separated entry; only the first four are kept.
        camera_spec = df["Camera (MP)"]
        df["n_cameras"] = camera_spec.str.count(r"\+") + 1
        lens_parts = self._split_padded(
            camera_spec, "+", width=len(self._CAMERA_COLUMNS)
        )
        for position, column in enumerate(self._CAMERA_COLUMNS):
            lens_text = pd.Series(
                [parts[position] for parts in lens_parts],
                index=df.index,
                dtype=object,
            )
            # Entries without any digits count as 0 for every lens column.
            df[column] = self._first_int(lens_text)

        # Assign the cleaned text to a new Series instead of replacing in place
        # on a column selection, which does not reliably write back to ``df``.
        screen_text = df["Screen Size (inches)"].replace(
            to_replace=r"[^0-9.\-]", value="", regex=True
        )
        # Only the first two dot-separated pieces form the number; anything
        # after a second dot is ignored.
        screen_parts = self._split_padded(screen_text, ".", width=2)
        df["screen"] = pd.Series(
            [f"{whole}.{fraction}" for whole, fraction in screen_parts],
            index=df.index,
            dtype=object,
        ).astype(float)

        df = df.drop(columns=["Camera (MP)", "Screen Size (inches)"])
        df = df.rename(
            columns={
                "RAM ": "RAM",
                "Storage ": "Storage",
                "Battery Capacity (mAh)": "Battery_Capacity",
            }
        )

        df["Brand"] = LabelEncoder().fit_transform(df["Brand"])
        return df.drop(columns="Model")

    def train_test_spliting(self, df, test_size=0.25, random_state=42):
        """Split ``df`` and write ``train.csv`` / ``test.csv`` to ``config.root_dir``.

        Args:
            df: DataFrame to split.
            test_size: Share of rows placed in the test set (default 0.25).
            random_state: Seed passed to ``train_test_split`` (default 42).
        """
        train, test = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
        

In [14]:
# Build the stage settings (this also creates the artifacts directories),
# then run the cleaning step on the raw CSV.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
cleaned_df = data_transformation.clean_data()
# The split step is switched off here, so this cell writes no train.csv / test.csv.
# data_transformation.train_test_spliting(cleaned_df)
# Show the column names of the cleaned frame.
cleaned_df.columns

[2024-01-13 00:40:13,598: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-01-13 00:40:13,607: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-13 00:40:13,619: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-01-13 00:40:13,619: INFO: common: created directory at: artifacts]
[2024-01-13 00:40:13,627: INFO: common: created directory at: artifacts/data_transformation]


Index(['Brand', 'Storage', 'RAM', 'Battery_Capacity', 'Price ($)', 'n_cameras',
       'res1', 'res2', 'res3', 'res4', 'screen'],
      dtype='object')

In [17]:
# Check the dtype and non-null count of every cleaned column.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407 entries, 0 to 406
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             407 non-null    int32  
 1   Storage           407 non-null    int32  
 2   RAM               407 non-null    int32  
 3   Battery_Capacity  407 non-null    int64  
 4   Price ($)         407 non-null    int32  
 5   n_cameras         407 non-null    int64  
 6   res1              407 non-null    int32  
 7   res2              407 non-null    int32  
 8   res3              407 non-null    int32  
 9   res4              407 non-null    int32  
 10  screen            407 non-null    float64
dtypes: float64(1), int32(8), int64(2)
memory usage: 22.4 KB
