In [57]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

In [5]:
class AutoMl():
    """Split a DataFrame, preprocess its columns and optionally fit and evaluate an estimator.

    Numeric columns are imputed and scaled, object/category columns are imputed with
    their most frequent value and one-hot encoded. Depending on ``model_train`` the
    class either returns the preprocessed train/test data or a fitted
    ``preprocess -> estimator`` pipeline after printing evaluation metrics.
    """

    def __init__(self,df,target,Estimator=None,test_size=0.2,strategy='mean',std_scale=True,model_train=True,type_model='Classification',random_state=10):
        """
        1. df   ::  Data with target column
        2. target ::  target Column Name
        3. Estimator :None :: By default None, here pass the model instance
                            (required when `model_train` is True)
        4. test_size :0.2 :: Default test size 0.2. Change test size using this parameter
        5. strategy :mean :: Default imputation strategy for numeric features, passed to
                   `SimpleImputer` (e.g. 'mean', 'median').
                   Categorical features always use 'most_frequent'.

        6. std_scale : True :: truthy >> StandardScaler used for scaling
                            falsy >> MinMaxScaler used for scaling
        7. model_train: True :: truthy >> fit the model, print the evaluation, return the pipeline
                            falsy >> return the data after preprocessing
                              >> x_train,x_test,y_train,y_test
        8. type_model:'Classification' :: 'Classification' (matched case-insensitively) prints
                   classification metrics; any other value prints regression metrics
        9. random_state :10 :: seed forwarded to `train_test_split`
        -----------------------------------------------------------------------------------------------
        return ::
            1. train_test_data_split >> split the data into x_train,x_test,y_train,y_test
            2. create_model >> after used all preprocessing and model return

        Copyright (c) 2022 Sanket Suresh Bodake
        """
        import warnings
        # NOTE(review): this silences every warning for the whole process (including
        # estimator convergence warnings). Kept because existing notebooks rely on the
        # quiet output; consider scoping it with warnings.catch_warnings() instead.
        warnings.filterwarnings('ignore')
        self.df=df.drop(target,axis=1)
        self.target=df[target]
        self.test_size=test_size
        self.strategy=strategy
        self.scale=std_scale
        self.model_train=model_train
        self.Estimator=Estimator
        self.type_model=type_model
        self.random_state=random_state

    def train_test_data_split(self):
        """Split features and target with `train_test_split`.

        Returns a tuple ``(x_train, x_test, y_train, y_test)`` using the configured
        ``test_size`` and ``random_state``.
        """
        from sklearn.model_selection import train_test_split
        x_train,x_test,y_train,y_test=train_test_split(
            self.df,self.target,test_size=self.test_size,random_state=self.random_state)
        return x_train,x_test,y_train,y_test

    # Original (misspelled) public name, kept so existing callers keep working.
    train_test_data_spilt=train_test_data_split

    def _feature_columns(self):
        """Return ``(numeric_columns, categorical_columns)`` of the feature frame."""
        # 'number' covers every integer/float width, not only int64/float64.
        # timedelta is excluded explicitly because numpy treats it as an integer subtype.
        numeric_feature=list(self.df.select_dtypes(include='number',exclude='timedelta').columns)
        cat_feature=list(self.df.select_dtypes(include=['object','category']).columns)
        return numeric_feature,cat_feature

    def _build_preprocessor(self):
        """Build the unfitted ColumnTransformer and announce which scaler is used."""
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import StandardScaler,MinMaxScaler
        from sklearn.preprocessing import OneHotEncoder
        from sklearn.pipeline import Pipeline
        from sklearn.impute import SimpleImputer

        numeric_feature,cat_feature=self._feature_columns()
        if self.scale:
            print("Standard Scalar ...")
            scaler=StandardScaler(with_mean=True)
        else:
            print("Min-Max Scaler ....")
            scaler=MinMaxScaler()
        # The step is named 'std_scaler' for both scalers so parameter names such as
        # 'numeric_value__std_scaler__...' stay the same as before.
        numeric_pipeline=Pipeline(steps=[('missingvaluehandling',SimpleImputer(strategy=self.strategy)),
                                         ('std_scaler',scaler)])
        categorical_pipeline=Pipeline(steps=[('catmissingvaluehandling',SimpleImputer(strategy="most_frequent")),
                                             ('OneHotencoding',OneHotEncoder(handle_unknown="ignore"))])
        # Columns that are in neither list are not routed to any pipeline.
        return ColumnTransformer(transformers=[('numeric_value',numeric_pipeline,numeric_feature),
                                               ("cat",categorical_pipeline,cat_feature)])

    @staticmethod
    def _report_classification(model,X,y_true):
        """Print accuracy, confusion matrix and classification report for `model` on X."""
        from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
        y_pred=model.predict(X)
        acc_score=accuracy_score(y_true,y_pred)
        print(f"Accuracy Score of Model ::{acc_score}")
        conf_matrix=confusion_matrix(y_true,y_pred)
        print(f"confusion matrix  of Model ::\n{conf_matrix}")
        class_matrix=classification_report(y_true,y_pred)
        print(f"classification report of Model ::\n{class_matrix}")

    @staticmethod
    def _report_regression(model,X,y_true):
        """Print MSE, MAE, RMSE and R2 for `model` on X."""
        from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
        import numpy as np
        y_pred=model.predict(X)
        mse=mean_squared_error(y_true,y_pred)
        print(f"The MSE of Model ::{mse}")
        mae=mean_absolute_error(y_true,y_pred)
        print(f"The MAE of Model ::{mae}")
        rsme=np.sqrt(mse)
        print(f"The RMSE of Model ::{rsme}")
        r2_value=r2_score(y_true,y_pred)
        print(f"The R2 of Model ::{r2_value}")

    def create_model(self):
        """Preprocess the data and, if requested, fit and evaluate the estimator.

        Returns
        -------
        * ``model_train`` truthy: the fitted ``Pipeline`` (steps 'preprcess' and 'model'),
          after printing metrics for the training and the testing split.
        * ``model_train`` falsy: ``(X_train, X_test, y_train, y_test)`` where the X parts
          are the outputs of the preprocessor fitted on the training split only.

        Raises
        ------
        ValueError
            If ``model_train`` is truthy but no ``Estimator`` was supplied.
        """
        # Fail fast: without an estimator the pipeline would be fitted and only then
        # break at predict() with an unrelated-looking error.
        if self.model_train and self.Estimator is None:
            raise ValueError(
                "model_train=True requires an Estimator instance; pass Estimator=<model> "
                "or set model_train=False to get the preprocessed data only.")

        x_train,x_test,y_train,y_test=self.train_test_data_split()
        preprocessor=self._build_preprocessor()

        if not self.model_train:
            # Fit on the training split only so no test information leaks into the
            # imputation / scaling statistics.
            X_train=preprocessor.fit_transform(x_train)
            X_test=preprocessor.transform(x_test)
            return X_train,X_test,y_train,y_test

        from sklearn import set_config
        from sklearn.pipeline import Pipeline
        # The pipeline fits the preprocessor itself, so it is not fitted separately first.
        model_pipe=Pipeline(steps=[('preprcess',preprocessor),
                                   ('model',self.Estimator)])
        model_pipe.fit(x_train,y_train)

        # Case-insensitive so 'classification' does not silently get regression metrics.
        is_classification=str(self.type_model).strip().lower()=='classification'
        evaluate=self._report_classification if is_classification else self._report_regression

        print()
        set_config(display="diagram")
        print('*'*20+'Training Data Evalution','*'*20)
        evaluate(model_pipe,x_train,y_train)
        print('*'*20+'Testing Data Evalution','*'*20)
        evaluate(model_pipe,x_test,y_test)
        return model_pipe


In [6]:
# Classification demo: logistic regression on the salary data.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): machine-specific absolute path; point this at your local copy of the file.
SALARY_CSV = r"F:\New folder\salary.csv"

df=pd.read_csv(SALARY_CSV)
# Encode the target as 0/1 (the mapping keys include a leading space).
df['salary']=df['salary'].replace({' <=50K': 0, ' >50K': 1})

log_model=LogisticRegression()
# Seeded with the same value AutoMl uses by default for the train/test split,
# so the random-forest results in the later cell are reproducible.
rf_model=RandomForestClassifier(random_state=10)

ob=AutoMl(df,'salary',log_model,model_train=True)
ob.create_model()

Standard Scalar ...

********************Training Data Evalution ********************
Accuracy Score of Model ::0.8542690417690417
confusion matrix  of Model ::
[[18456  1335]
 [ 2461  3796]]
classification report of Model ::
              precision    recall  f1-score   support

           0       0.88      0.93      0.91     19791
           1       0.74      0.61      0.67      6257

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.79     26048
weighted avg       0.85      0.85      0.85     26048

********************Testing Data Evalution ********************
Accuracy Score of Model ::0.8472286196837095
confusion matrix  of Model ::
[[4593  336]
 [ 659  925]]
classification report of Model ::
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      4929
           1       0.73      0.58      0.65      1584

    accuracy                           0.85      6513
   macro avg       0.80      

In [None]:
# Preprocessing only: return the transformed train and test data without training a model

In [7]:
# No estimator is passed and model_train is off, so create_model() returns
# the preprocessed split instead of a fitted pipeline.
ob = AutoMl(df, target='salary', model_train=False)
x_train, x_test, y_train, y_test = ob.create_model()

Standard Scalar ...


In [None]:
# Random Forest Classifier

In [8]:
# Random forest with Min-Max scaling and a 10% hold-out split.
ob = AutoMl(
    df,
    target='salary',
    Estimator=rf_model,
    test_size=0.1,
    std_scale=False,
    model_train=True,
)
ob.create_model()

Min-Max Scaler ....

********************Training Data Evalution ********************
Accuracy Score of Model ::0.9999317499317499
confusion matrix  of Model ::
[[22273     0]
 [    2  7029]]
classification report of Model ::
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22273
           1       1.00      1.00      1.00      7031

    accuracy                           1.00     29304
   macro avg       1.00      1.00      1.00     29304
weighted avg       1.00      1.00      1.00     29304

********************Testing Data Evalution ********************
Accuracy Score of Model ::0.8455634019035922
confusion matrix  of Model ::
[[2253  194]
 [ 309  501]]
classification report of Model ::
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      2447
           1       0.72      0.62      0.67       810

    accuracy                           0.85      3257
   macro avg       0.80      

In [None]:
# Regression Model

In [9]:
# Regression demo: linear regression on the Pune rent data.
from sklearn.linear_model import LinearRegression

# NOTE(review): machine-specific absolute path; point this at your local copy of the file.
RENT_CSV = r"F:\New folder\Pune_rent.csv"

df1=pd.read_csv(RENT_CSV)
df1['bedroom']=df1['bedroom'].astype('float64')
# Strip thousands separators before converting. The vectorised .str accessor keeps
# missing prices as NaN instead of raising like a per-row str.replace would.
df1['price']=df1['price'].str.replace(',','',regex=False).astype('float64')
df1['area']=df1['area'].astype('float64')

lin_model=LinearRegression()
ob=AutoMl(df1,'price',lin_model,std_scale=True,model_train=True,test_size=0.1,type_model='Regression')
ob.create_model()

Standard Scalar ...

********************Training Data Evalution ********************
The MSE of Model ::53611760.532456726
The MAE of Model ::3816.0515807078145
The RMSE of Model ::7322.005226197037
The R2 of Model ::0.619960360490876
********************Testing Data Evalution ********************
The MSE of Model ::55626022.609872945
The MAE of Model ::3843.471239551705
The RMSE of Model ::7458.28550069471
The R2 of Model ::0.5954307806492034
