Data Ingestion

In [5]:
import pandas as pd
from dataclasses import dataclass

@dataclass
class DataIngestionConfig:
    """Locations of the CSV files consumed by the ingestion step.

    Attributes:
        train_data_path: Path to the training dataset.
        test_data_path: Path to the testing dataset.
    """

    train_data_path: str
    test_data_path: str

class DataIngestion:
    """Reads the training and testing CSV files named by an ingestion config."""

    def __init__(self, config: "DataIngestionConfig"):
        """Keep the configuration that holds the two dataset paths.

        Args:
            config: Object exposing ``train_data_path`` and ``test_data_path``.
        """
        self.ingestion_config = config

    def initiate_data_ingestion(self):
        """Load both datasets with ``pd.read_csv``.

        No cleaning happens here; the DataFrames are returned exactly as read.

        Returns:
            tuple: ``(train_data, test_data)`` as pandas DataFrames.

        Raises:
            Exception: Whatever ``pd.read_csv`` raised (for example
                ``FileNotFoundError``); the message is printed first.
        """
        try:
            train_data = pd.read_csv(self.ingestion_config.train_data_path)
            test_data = pd.read_csv(self.ingestion_config.test_data_path)
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            # Re-raise instead of implicitly returning None: callers unpack the
            # result into two names, so a swallowed error would only resurface
            # as an unrelated "cannot unpack NoneType" TypeError.
            raise

        return train_data, test_data

# Locations of the Titanic training and testing CSV files
train_data_path = r"C:\Users\TRY'S COMPUTERS\Desktop\Titanic\train.csv"
test_data_path = r"C:\Users\TRY'S COMPUTERS\Desktop\Titanic\test.csv"

# Bundle the two paths into a config, then read both datasets through DataIngestion
config = DataIngestionConfig(train_data_path, test_data_path)
data_ingestion = DataIngestion(config)
train_data, test_data = data_ingestion.initiate_data_ingestion()


Data Transformation

In [6]:
import os
import joblib
from dataclasses import dataclass
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

@dataclass
class DataTransformationConfig:
    """Settings for the data transformation step.

    Attributes:
        preprocessor_obj_file_path: File path for the preprocessor object;
            defaults to ``preprocessor.pkl`` inside the ``artifacts`` folder.
    """

    preprocessor_obj_file_path: str = os.path.join("artifacts", "preprocessor.pkl")

class DataTransformation:
    """Builds, fits, and persists the preprocessing pipeline for the datasets."""

    def __init__(self):
        """Create the default configuration holding the preprocessor save path."""
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """Build the (unfitted) preprocessing ``ColumnTransformer``.

        Numerical columns are median-imputed and standard-scaled; categorical
        columns are imputed with the most frequent value and one-hot encoded
        with the first level dropped.

        Returns:
            ColumnTransformer: Unfitted transformer over the listed columns.

        Raises:
            Exception: Any construction error is printed and re-raised.
        """
        try:
            numerical_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
            categorical_columns = ["Sex", "Embarked"]

            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder(drop="first"))
                ]
            )

            # Route each column group through its own pipeline.
            return ColumnTransformer(
                transformers=[
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns)
                ]
            )

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            # Propagate so callers never receive None in place of a transformer.
            raise

    def initiate_data_transformation(self, train_data, test_data):
        """Fit the preprocessor on the training data and transform both sets.

        The fitted preprocessor is saved with joblib to the path held in
        ``self.data_transformation_config.preprocessor_obj_file_path``.

        Args:
            train_data: DataFrame containing the feature columns and the
                ``"Survived"`` target column.
            test_data: DataFrame containing the feature columns (no target).

        Returns:
            tuple: ``(X_train_transformed, y_train, X_test_transformed)``.

        Raises:
            Exception: Any error during preparation, fitting, or saving is
                printed and re-raised.
        """
        try:
            target_column_name = "Survived"

            # Separate the target from the training features; the test data
            # has no target column and is used as-is.
            X_train = train_data.drop(columns=[target_column_name])
            y_train = train_data[target_column_name]
            X_test = test_data

            preprocessor = self.get_data_transformer_object()
            X_train_transformed = preprocessor.fit_transform(X_train)
            X_test_transformed = preprocessor.transform(X_test)

            # joblib.dump does not create missing parent folders, so make sure
            # the target directory exists before saving.
            preprocessor_path = self.data_transformation_config.preprocessor_obj_file_path
            preprocessor_dir = os.path.dirname(preprocessor_path)
            if preprocessor_dir:
                os.makedirs(preprocessor_dir, exist_ok=True)
            joblib.dump(preprocessor, preprocessor_path)

            return X_train_transformed, y_train, X_test_transformed

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            # Re-raise: callers unpack three values, so returning None here
            # would hide the real failure behind an unpacking TypeError.
            raise


# Example usage:
# data_transformation = DataTransformation()
# X_train_transformed, y_train, X_test_transformed = data_transformation.initiate_data_transformation(train_data, test_data)


In [7]:
# Fit the preprocessor on the training data and transform both datasets
data_transformation = DataTransformation()
X_train_transformed, y_train, X_test_transformed = (
    data_transformation.initiate_data_transformation(
        train_data=train_data,
        test_data=test_data,
    )
)


Model Training

In [19]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
# import xgboost as xgb
# from catboost import CatBoostRegressor
from joblib import dump

class ModelTrainer:
    """Tunes several regressors and keeps the one with the lowest validation MSE."""

    def initiate_model_trainer(self, X_train, y_train, save_directory, random_state=42):
        """Grid-search each candidate model, pick the best, and save it.

        The data is split 80/20 into fit and validation parts. Every candidate
        is tuned with 5-fold ``GridSearchCV`` on the fit part, then scored by
        mean squared error on the validation part. The winner is written to
        ``best_model.joblib`` inside ``save_directory``.

        NOTE(review): every candidate is a regressor scored by MSE; if
        ``y_train`` is a binary label, classifiers may be a better fit — confirm.

        Args:
            X_train: Feature matrix.
            y_train: Target values aligned with ``X_train``.
            save_directory: Folder that receives ``best_model.joblib``; it is
                created if it does not exist.
            random_state: Seed for the split and for the randomized estimators,
                so repeated runs select the same model. Defaults to 42, the
                seed the split has always used.

        Returns:
            The fitted estimator with the lowest validation MSE.
        """
        # Local import: keeps the method independent of imports made in other cells.
        import os

        # Hold out a validation set; distinct names keep the caller's full
        # arrays from being shadowed by the split.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=random_state
        )

        # Candidate models as (label, estimator, hyperparameter grid).
        models = [
            ('Random Forest', RandomForestRegressor(random_state=random_state), {
                'n_estimators': [100, 200, 300],
                'max_depth': [None, 10, 20],
            }),
            ('Decision Tree', DecisionTreeRegressor(random_state=random_state), {
                'max_depth': [None, 10, 20],
            }),
            ('Gradient Boosting', GradientBoostingRegressor(random_state=random_state), {
                'n_estimators': [100, 200, 300],
                'max_depth': [3, 4, 5],
            }),
            ('Linear Regression', LinearRegression(), {}),
            # ('XGBoost Regressor', xgb.XGBRegressor(), {
            #     'n_estimators': [100, 200, 300],
            #     'max_depth': [3, 4, 5],
            # }),
            # ('CatBoost Regressor', CatBoostRegressor(), {
            #     'iterations': [100, 200, 300],
            #     'depth': [6, 8, 10],
            # }),
            ('AdaBoost Regressor', AdaBoostRegressor(random_state=random_state), {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 1.0],
            }),
        ]

        best_model = None
        best_mse = float('inf')  # any real MSE will be lower

        for model_name, model, param_grid in models:
            # Tune hyperparameters on the fit split only.
            grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_fit, y_fit)

            # Score the tuned estimator on the held-out validation split.
            candidate = grid_search.best_estimator_
            mse = mean_squared_error(y_val, candidate.predict(X_val))

            if mse < best_mse:
                best_mse = mse
                best_model = candidate

        # joblib's dump does not create missing folders, so ensure the target exists.
        if save_directory:
            os.makedirs(save_directory, exist_ok=True)
        dump(best_model, os.path.join(save_directory, 'best_model.joblib'))

        return best_model

# Train the candidate models and save the winner under the artifacts folder
save_directory = r"C:\Users\TRY'S COMPUTERS\Desktop\Titanic\code\artifacts"
trainer = ModelTrainer()
best_model = trainer.initiate_model_trainer(
    X_train=X_train_transformed,
    y_train=y_train,
    save_directory=save_directory,
)


In [9]:
import pandas as pd
from joblib import load

# Reload the saved best model from the artifacts folder
best_model = load(r"C:\Users\TRY'S COMPUTERS\Desktop\Titanic\code\artifacts\best_model.joblib")

# The test features were already transformed by DataTransformation, so they are
# reused here rather than reloading the preprocessor.
X_new = X_test_transformed
X_new_test = pd.read_csv(r"C:\Users\TRY'S COMPUTERS\Desktop\Titanic\test.csv")

# Continuous predictions from the model
predictions = best_model.predict(X_new)

# Threshold at 0.7: values at or above it become 1, everything else 0
predictions_binary = [int(p >= 0.7) for p in predictions]

# Pair every passenger id from the raw test file with its binary prediction
passenger_ids = X_new_test['PassengerId']
output_df = pd.DataFrame(
    {
        'PassengerId': passenger_ids,
        'Survived': predictions_binary,
    }
)

# Folder that receives the output file (must be writable)
output_directory = r"C:\Users\TRY'S COMPUTERS\Desktop\Titanic\code\artifacts"

# Write the frame to output.csv without the index column
output_path = os.path.join(output_directory, 'output.csv')
output_df.to_csv(output_path, index=False)


In [10]:
# Re-read the raw test CSV and preview its first rows
X_new_test=pd.read_csv(r"C:\Users\TRY'S COMPUTERS\Desktop\Titanic\test.csv")
X_new_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
# Pair every passenger id from the raw test file with its binary prediction
passenger_ids = X_new_test['PassengerId']
output_df = pd.DataFrame(
    {
        'PassengerId': passenger_ids,
        'Survived': predictions_binary,
    }
)


In [12]:
# Preview the first rows of the PassengerId/Survived frame
output_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [15]:
# Folder that receives the output file (must be writable)
output_directory = r"C:\Users\TRY'S COMPUTERS\Desktop\Titanic\code\artifacts"

# Write the frame to output.csv without the index column
output_path = os.path.join(output_directory, 'output.csv')
output_df.to_csv(output_path, index=False)
