In [45]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [46]:
class DataPreprocessor:
    """Load, clean, encode, scale, split and save a passenger CSV.

    The column names used by the methods ('age', 'body', 'cabin', 'boat',
    'home.dest', 'sex', 'embarked', 'survived', ...) are hard-coded, so the
    loaded file must provide them.

    Attributes:
        path: Location handed to ``pd.read_csv`` by ``load_data``.
        data: The working DataFrame; ``None`` until ``load_data`` has run.
        encoders: Fitted ``LabelEncoder`` per encoded column, set by
            ``encode_categorical`` (empty before that).
        scaler: Fitted ``StandardScaler`` set by ``scale_features``
            (``None`` before that).
    """

    # Columns standardised by scale_features() when the caller names none.
    DEFAULT_SCALE_COLUMNS=('pclass','sex','age','sibsp','fare','embarked')
    # File written by save_processed_data() when the caller gives no path.
    DEFAULT_OUTPUT_PATH='processed_data.csv'

    def __init__(self,path):
        """Remember ``path``; nothing is read until ``load_data`` is called."""
        self.path=path
        self.data=None
        self.encoders={}
        self.scaler=None

    def load_data(self):
        """Read the CSV at ``self.path`` into ``self.data``."""
        self.data=pd.read_csv(self.path)

    def handle_missing_values(self):
        """Fill or drop missing values and return the cleaned frame.

        * 'age' and 'body' are filled with their column mean.
        * Rows with a missing 'cabin' or 'boat' are dropped.
        * Missing 'home.dest' becomes the string 'Unknown'.

        Returns:
            ``self.data`` after cleaning (the same object, not a copy).
        """
        # The means are taken over every loaded row, i.e. before rows are dropped.
        for column in ('age','body'):
            self.data[column]=self.data[column].fillna(self.data[column].mean())
        self.data=self.data.dropna(subset=['cabin','boat'])
        self.data['home.dest']=self.data['home.dest'].fillna('Unknown')
        return self.data

    def encode_categorical(self):
        """Replace 'sex' and 'embarked' with integer label codes.

        The fitted encoders are kept in ``self.encoders`` so the codes can be
        mapped back with ``inverse_transform`` or applied to new rows.

        Returns:
            ``self.data`` with both columns encoded.
        """
        sex_encoder=LabelEncoder()
        self.data['sex']=sex_encoder.fit_transform(self.data['sex'])
        embarked_encoder=LabelEncoder()
        # Cast to str first, presumably so the encoder sees a single
        # comparable type even when the column holds missing values.
        self.data['embarked']=embarked_encoder.fit_transform(self.data['embarked'].astype(str))
        self.encoders={'sex':sex_encoder,'embarked':embarked_encoder}
        return self.data

    def split_data(self,test_size : float,random_state : int):
        """Split ``self.data`` into train/test features and 'survived' target.

        Args:
            test_size: Forwarded to ``train_test_split``.
            random_state: Forwarded to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.
        """
        features=self.data.drop('survived',axis=1)
        target=self.data['survived']
        return train_test_split(features,target,test_size=test_size,random_state=random_state)

    def scale_features(self,columns=None):
        """Standardise the given columns of ``self.data`` in place.

        The scaler is fitted on every row currently in ``self.data``; when
        this runs before ``split_data`` the rows that later form the test set
        contribute to the fitted statistics. The fitted scaler is kept in
        ``self.scaler``.

        Args:
            columns: Column names to scale. Defaults to
                ``DEFAULT_SCALE_COLUMNS``.

        Returns:
            ``self.data`` with the selected columns replaced by scaled values.
        """
        selected=list(self.DEFAULT_SCALE_COLUMNS if columns is None else columns)
        self.scaler=StandardScaler()
        self.data[selected]=self.scaler.fit_transform(self.data[selected])
        return self.data

    def save_processed_data(self,path=None):
        """Write ``self.data`` as CSV without the index column.

        Args:
            path: Destination file. Defaults to ``DEFAULT_OUTPUT_PATH``
                ('processed_data.csv' in the current working directory).
        """
        destination=self.DEFAULT_OUTPUT_PATH if path is None else path
        self.data.to_csv(destination,index=False)
   

In [59]:
def main():
    """Run the preprocessing pipeline on 'titanic3.csv' and report each stage.

    Steps, in order: load the CSV, handle missing values, label-encode the
    categorical columns, scale the numeric columns, save the processed frame,
    then split into train/test sets (test_size=0.3, random_state=42). The
    frame is printed after cleaning, encoding and scaling, and the per-column
    null counts are printed at the end.

    Returns:
        ``(data, X_train, X_test, y_train, y_test)`` where ``data`` is the
        fully processed DataFrame.
    """
    # No `if __name__ == '__main__'` guard in here: with it, main() silently
    # returned None when the guard was false and the caller's tuple
    # unpacking failed. Guard the call site instead if that is ever needed.
    d1=DataPreprocessor('titanic3.csv')
    d1.load_data()
    data=d1.handle_missing_values()
    # Outer double quotes: reusing the outer quote character inside an
    # f-string expression is a SyntaxError before Python 3.12.
    print(f"{'*'*50} Cleaned data {'*'*50} \n\n  {data}")
    data=d1.encode_categorical()
    print(f" {'*'*50} Encoded data : {'*'*50} \n\n {data}")
    # scale_features() returns the same frame it updates in place.
    data=d1.scale_features()
    print(f" {'*'*50} Scaled data : {'*'*50} \n\n {data}")
    d1.save_processed_data()
    print('*'*50 , "File has been save" , '*'*50)
    X_train,X_test,y_train,y_test=d1.split_data(0.3,42)
    print('*'*50 , "data set has been  splited " , '*'*50)
    print(data.isnull().sum())
    return data,X_train,X_test,y_train,y_test
        

In [61]:
# Run the pipeline and keep the processed frame and the split at notebook scope.
data, X_train, X_test, y_train, y_test = main()

************************************************** Cleaned data ************************************************** 

        pclass  survived                                               name  \
0          1         1                      Allen, Miss. Elisabeth Walton   
1          1         1                     Allison, Master. Hudson Trevor   
5          1         1                                Anderson, Mr. Harry   
6          1         1                  Andrews, Miss. Kornelia Theodosia   
8          1         1      Appleton, Mrs. Edward Dale (Charlotte Lamson)   
...      ...       ...                                                ...   
1122       3         1                                  Peter, Miss. Anna   
1131       3         1                 Pickard, Mr. Berk (Berk Trembisky)   
1187       3         1                    Sandstrom, Miss. Beatrice Irene   
1188       3         1  Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...   
1189       3         1            

In [54]:
# Display the frame currently bound to `data` (assigned from the main() call).
data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,-0.368850,1,"Allen, Miss. Elisabeth Walton",-0.679795,-0.352764,-0.833587,0,24160,1.478942,B5,0.875612,2,160.809917,"St Louis, MO"
1,-0.368850,1,"Allison, Master. Hudson Trevor",1.471032,-2.234457,0.684170,2,113781,0.764621,C22 C26,0.875612,11,160.809917,"Montreal, PQ / Chesterville, ON"
5,-0.368850,1,"Anderson, Mr. Harry",1.471032,0.920313,-0.833587,0,19952,-0.728837,E12,0.875612,3,160.809917,"New York, NY"
6,-0.368850,1,"Andrews, Miss. Kornelia Theodosia",-0.679795,1.925373,0.684170,0,13502,-0.114628,D7,0.875612,10,160.809917,"Hudson, NY"
8,-0.368850,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",-0.679795,1.255333,2.201928,0,11769,-0.430991,C101,0.875612,D,160.809917,"Bayside, Queens, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,3.699043,1,"Peter, Miss. Anna",-0.679795,-0.293724,0.684170,1,2668,-0.778918,F E69,-1.124307,D,160.809917,Unknown
1131,3.699043,1,"Pickard, Mr. Berk (Berk Trembisky)",1.471032,-0.151752,-0.833587,0,SOTON/O.Q. 392078,-0.949869,E10,0.875612,9,160.809917,Unknown
1187,3.699043,1,"Sandstrom, Miss. Beatrice Irene",-0.679795,-2.228876,0.684170,1,PP 9549,-0.846521,G6,0.875612,13,160.809917,Unknown
1188,3.699043,1,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...",-0.679795,-0.687784,-0.833587,2,PP 9549,-0.846521,G6,0.875612,13,160.809917,Unknown
