In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.simplefilter('ignore')

##pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV

#models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [2]:
class Abalone:

    def __init__(self, train_data = None, test_data = None):
        try:
            self.train_data = train_data
            self.test_data = test_data            
        except Exception as e:
            print({'Error':1, 'Message':f'__init__ function failed due to {e}'})

    def process(self):
        train_prediction = None
        try:
            train_data, test_data = self.reading_data(self.train_data, self.test_data)
            train_info, test_info = self.checking_infomation(train_data, test_data)
            train_null, test_null = self.checking_null(train_data, test_data)
            train_data_values, test_data_values = self.checking_value_counts(train_data, test_data)
            train_data_duplicates, test_data_duplicates = self.checking_duplicates(train_data, test_data)
            train_data_unique, test_data_unique = self.checking_unique(train_data, test_data)
            train_data, test_data = self.dropping_unneccasary_columns(train_data, test_data)
            train_data_columns, test_data_columns = self.checking_columns(train_data, test_data)
            X_train, y_train, X_test = self.splitting_train_and_test_data(train_data, test_data)
            train_categorical_cols, train_numerical_cols, test_categorical_cols, test_numerical_cols = self.splitting_categorical_and_numerical_columns(X_train, y_train, X_test)
            sex_categories = self.categorical_values()
            #train_categorical_cols, test_categorical_cols = self.converting_sex_column(train_categorical_cols, test_categorical_cols)
            preprocessor = self.pipeline(train_categorical_cols, train_numerical_cols, test_categorical_cols, test_numerical_cols, sex_categories)
            X_train_processed, X_test_processed = self.train_and_test_dataframe(X_train, X_test, preprocessor)
            train_prediction = self.model_training(X_train_processed, X_test_processed, y_train, X_test)
        except Exception as e:
            print({'Error':2, 'Message':f'process function failed due to {e}'})
        return train_prediction
    
    def reading_data(self, train_data_path, test_data_path):
        try:
            train_data = pd.read_csv(train_data_path)
            test_data = pd.read_csv(test_data_path)
        except Exception as e:
            print({'Error':3, 'Message':f'reading_data function failed due to {e}'})
            return None, None
        return train_data, test_data
    
    def checking_infomation(self, train_data, test_data):
        try:
            train_info = train_data.info()
            test_info = test_data.info()
        except Exception as e:
            print({'Error':4, 'Message':f'cheking_information function failed due to {e}'})
            return None, None
        return train_info, test_info
    
    def checking_null(self, train_data, test_data):
        try:
            train_null = train_data.isnull().sum()
            test_null = test_data.isnull().sum()
        except Exception as e:
            print({'Error':5, 'Message':f'cheking_null function failed due to {e}'})
            return None, None
        return train_null, test_null
    
    def checking_value_counts(self, train_data, test_data):
        try:
            train_data_values = train_data.value_counts()
            test_data_values = test_data.value_counts()
        except Exception as e:
            print({'Error':6, 'Message':f'cheking_value_counts function failed due to {e}'})
            return None, None
        return train_data_values, test_data_values
    
    def checking_duplicates(self, train_data, test_data):
        try:
            train_data_duplicates = train_data.duplicated().sum()
            test_data_duplicates = test_data.duplicated().sum()
        except Exception as e:
            print({'Error':7, 'Message':f'cheking_duplicates function failed due to {e}'})
            return None, None
        return train_data_duplicates, test_data_duplicates
    
    def checking_unique(self, train_data, test_data):
        try:
            train_data_unique = [train_data[i].unique() for i in train_data.columns]
            test_data_unique = [test_data[i].unique() for i in test_data.columns]

        except Exception as e:
            print({'Error':8, 'Message':f'checking_unique function failed due to {e}'})
            return None, None
        return train_data_unique, test_data_unique
    
    def dropping_unneccasary_columns(self, train_data, test_data):
        try:
            train_data = train_data.drop(['id'], axis=1)
            test_data = test_data.drop(['id'], axis=1)
        except Exception as e:
            print({'Error':9, 'Message':f'dropping_unneccasary_columns function failed due to {e}'})
            return None, None
        return train_data, test_data
    
    def checking_columns(self, train_data, test_data):
        try:
            train_data_columns = train_data.columns
            test_data_columns = test_data.columns
        except Exception as e:
            print({'Error':10, 'Message':f'checking_columns function failed due to {e}'})
            return None, None
        return train_data_columns, test_data_columns

    def splitting_train_and_test_data(self, train_data, test_data):
        try:
            X_train = train_data.drop(['Rings'], axis=1)
            y_train = train_data['Rings']
            X_test = test_data
        except Exception as e:
            print({'Error':11, 'Message':f'splitting_train_and_test_data function failed due to {e}'})
            return None, None, None
        return X_train, y_train, X_test
    
    def splitting_categorical_and_numerical_columns(self, X_train, y_train, X_test):
        try:
            train_categorical_cols = X_train.select_dtypes(include='O').columns
            train_numerical_cols = X_train.select_dtypes(exclude='O').columns
            test_categorical_cols = X_test.select_dtypes(include='O').columns
            test_numerical_cols = X_test.select_dtypes(exclude='O').columns
        except Exception as e:
            print({'Error': 12, 'Message': f'splitting_categorical_and_numerical_columns function failed due to {e}'})
            return None, None, None, None
        return train_categorical_cols, train_numerical_cols, test_categorical_cols, test_numerical_cols

    
    def categorical_values(self):
        try:
            sex_categories = ['F', 'M', 'I']
        except Exception as e:
            print({'Error':13, 'Message':f'categorical_values function failed due to {e}'})
            return None
        return sex_categories
    
    '''def converting_sex_column(self, train_categorical_cols,test_categorical_cols):
        try:
            labelencoder = LabelEncoder()
            train_categorical_cols['Sex'] = labelencoder.fit_transform(train_categorical_cols['Sex'])
            test_categorical_cols['Sex'] = labelencoder.fit_transform(test_categorical_cols['Sex'])

        except Exception as e:
            print({'Error':14, 'Message':f'categorical_values function failed due to {e}'})
            return None
        return train_categorical_cols, test_categorical_cols'''
    
    def pipeline(self, train_categorical_cols, train_numerical_cols, test_categorical_cols, test_numerical_cols, sex_categories):
        try:
            numerical_pipeline = Pipeline(
                steps = [
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaling', StandardScaler(with_mean=False))
                ]
            )

            categorical_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('encoder', OneHotEncoder(categories=[sex_categories], handle_unknown='ignore'))
                ]
            )

            preprocessor = ColumnTransformer([
                ('train_numerical_pipeline', numerical_pipeline, train_numerical_cols),
                ('train_categorical_pipeline', categorical_pipeline, train_categorical_cols),
                ('test_numerical_pipeline', numerical_pipeline, test_numerical_cols),
                ('test_categorical_pipeline', categorical_pipeline, test_categorical_cols)
            ])

        except Exception as e:
            print({'Error':15, 'Message':f'pipeline function failed due to {e}'})
            return None
        return preprocessor
    
    def train_and_test_dataframe(self, X_train, X_test, preprocessor):
        try:
            X_train_processed = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
            X_test_processed = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())
        except Exception as e:
            print({'Error':16, 'Message':f'train_and_test_dataframe function failed due to {e}'})
            return None, None
        return X_train_processed, X_test_processed

    def evaluate_metrics(self, true, predicted):
        try:
            r2 = r2_score(true, predicted)
            mae = mean_absolute_error(true, predicted)
            mse = mean_squared_error(true, predicted)
            #rmsle = np.sqrt(mean_squared_log_error(true, predicted))
        except Exception as e:
            print({'Error': 17, 'Message': f'evaluate_metrics function failed due to {e}'})
            return None, None, None, None
        return r2, mae, mse

    def model_training(self, X_train_processed, X_test_processed, y_train, test_data):
        try:
            models = {
                # 'Linear Regression': LinearRegression(),
                # "Lasso": Lasso(),
                # "Ridge": Ridge(),
                # "ElasticNet": ElasticNet(),
                'Random Forest': RandomForestRegressor(),
                # 'Support Vector Machine': SVR(),
                # 'Decision Tree': DecisionTreeRegressor(),
                # 'Ada Boost': AdaBoostRegressor(),
                # 'Gradient Boost': GradientBoostingRegressor()
            }

            trained_model_list = []
            model_list = []
            r2_list = []

            # Define hyperparameter grids for each model
            param_grids = {
                # 'Linear Regression': {},
                # 'Lasso': {'alpha': [0.1, 1, 10], 'max_iter': [1000, 2000], 'tol': [0.001, 0.0001]},
                # 'Ridge': {'alpha': [0.1, 1, 10], 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 'max_iter': [1000, 2000]},
                # 'ElasticNet': {'alpha': [0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9], 'max_iter': [1000, 2000], 'tol': [0.001, 0.0001]},
                'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt', 'log2'], 'bootstrap': [True, False], 'random_state': [42]},
                # 'Support Vector Machine': {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
                # 'Decision Tree': {'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt', 'log2'], 'random_state': [42]},
                # 'Ada Boost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1], 'loss': ['linear', 'square', 'exponential']},
                # 'Gradient Boost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1], 'max_depth': [3, 5, 7], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['auto', 'sqrt', 'log2'],'random_state': [42]}
            }

            # Perform Grid Search and hyperparameter tuning for each model
            for model_name, model in models.items():
                param_grid = param_grids.get(model_name, {})  # Get hyperparameter grid for the current model
                if param_grid:
                    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
                    grid_search.fit(X_train_processed, y_train)
                    best_params = grid_search.best_params_
                    ##
                    models[model_name].set_params(**best_params)  # Update model with best hyperparameters
                    ##
                    #model.set_params(**best_params)  # Update model with best hyperparameters
                    # Print best_params within the loop 
                    print("Best parameters for", model_name, ":", best_params)
                else:
                    # Print best_params for models without hyperparameter grid
                    print("Best parameters for", model_name, ":", model.get_params())


            for model_name, model in models.items():
                model.fit(X_train_processed, y_train)

                y_train_pred = model.predict(X_train_processed)
                r2, mae, mse = self.evaluate_metrics(y_train, y_train_pred)

                print(model_name)
                model_list.append(model_name)

                print("MODEL TRAINING PERFORMANCE")
                print("R2_SQUARE", r2 * 100)
                print("MAE:", mae)
                print("MAE:", mse)

                r2_list.append(r2)

                y_test_pred = model.predict(X_test_processed)
                #y_test_pred = y_test_pred.round()
                print(y_test_pred)
                print("-" * 35)
                print("\n")

                result = pd.DataFrame()
                test_data_id = pd.read_csv('test.csv')
                result['id'] = test_data_id['id']
                result['Rings'] = y_test_pred
                result.to_csv(f'{model_name}submission.csv', index=False)

        except Exception as e:
            print({'Error': 18, 'Message': f'model_training function failed due to {e}'})
            return None
        return y_train_pred

In [3]:
if __name__ == '__main__':
    train_data_path = 'train.csv'
    test_data_path = 'test.csv'
    y_train_pred = Abalone(train_data=train_data_path, test_data=test_data_path).process()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              90615 non-null  int64  
 1   Sex             90615 non-null  object 
 2   Length          90615 non-null  float64
 3   Diameter        90615 non-null  float64
 4   Height          90615 non-null  float64
 5   Whole weight    90615 non-null  float64
 6   Whole weight.1  90615 non-null  float64
 7   Whole weight.2  90615 non-null  float64
 8   Shell weight    90615 non-null  float64
 9   Rings           90615 non-null  int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 6.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60411 entries, 0 to 60410
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              60411 non-null  int64  
 1   Sex             60411 non-null  object 
 

In [None]:
y_train_pred

array([10.6602141 , 10.61715429,  4.27773277, ...,  7.14080708,
        6.89723363,  7.52806395])