In [4]:
import numpy as np
import pandas as pd

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler


In [30]:
 def get_data_transformer_object(numerical_columns=None, categorical_columns=None):
     """Build the (unfitted) preprocessing transformer for the input features.

     Numerical columns are imputed with the column median and then
     standardized. Categorical columns are imputed with the most frequent
     value and then one-hot encoded.

     Parameters
     ----------
     numerical_columns : list of str, optional
         Columns routed through the numerical pipeline. Defaults to
         ``["writing_score", "reading_score"]``.
     categorical_columns : list of str, optional
         Columns routed through the categorical pipeline. Defaults to
         ``["gender", "race_ethnicity", "parental_level_of_education",
         "lunch", "test_preparation_course"]``.

     Returns
     -------
     ColumnTransformer
         Unfitted transformer; call ``fit_transform`` on the training
         features and ``transform`` on any later data.
     """
     if numerical_columns is None:
         numerical_columns = ["writing_score", "reading_score"]
     if categorical_columns is None:
         categorical_columns = [
             "gender",
             "race_ethnicity",
             "parental_level_of_education",
             "lunch",
             "test_preparation_course",
         ]

     num_pipeline = Pipeline(
         steps=[
             ("imputer", SimpleImputer(strategy="median")),
             ("scaler", StandardScaler()),
         ]
     )

     cat_pipeline = Pipeline(
         steps=[
             ("imputer", SimpleImputer(strategy="most_frequent")),
             # handle_unknown="ignore": a category that appears only in data
             # passed to transform() (e.g. the test split) is encoded as all
             # zeros instead of raising a ValueError.
             ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
         ]
     )

     preprocessor = ColumnTransformer(
         [
             ("num_pipeline", num_pipeline, list(numerical_columns)),
             ("cat_pipelines", cat_pipeline, list(categorical_columns)),
         ]
     )

     return preprocessor

In [32]:
# Read the train and test CSVs from the artifacts directory.
train_df, test_df = map(
    pd.read_csv,
    ('artifacts/train.csv', 'artifacts/test.csv'),
)

In [33]:
# Build the unfitted preprocessing transformer; it is fitted further down.
preprocessing_obj=get_data_transformer_object()

In [34]:
# Name of the target column: dropped from the feature frames and selected
# as the label in the cells below.
target_column_name="math_score"

In [35]:
# Training features: every column of train_df except the target.
input_feature_train_df = train_df.drop(columns=target_column_name)
input_feature_train_df.shape

(800, 7)

In [36]:
# Training target: the single column named by target_column_name.
target_feature_train_df=train_df[target_column_name]

In [37]:
# Same feature/target split as for the training frame, applied to test_df.
target_feature_test_df = test_df[target_column_name]
input_feature_test_df = test_df.drop(columns=target_column_name)

In [38]:
# Fit the preprocessor on the training features only, then apply the
# already-fitted transformer to the test features.
input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
input_feature_test_arr=preprocessing_obj.transform(input_feature_test_df)

In [40]:
# Shape of the transformed training features.
input_feature_train_arr.shape

(800, 19)

In [41]:
# Shape of the transformed test features.
input_feature_test_arr.shape

(200, 19)

In [51]:
# Shape of the training target.
target_feature_train_df.shape

(800,)

In [44]:
# Append the target as the last column of the transformed training features.
train_target_values = np.array(target_feature_train_df)
train_arr = np.column_stack((input_feature_train_arr, train_target_values))

In [61]:
# Inspect the second row of train_arr (transformed features plus target).
train_arr[1]

array([ 0.96470125,  0.9302895 ,  1.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        , 66.        ])

In [53]:
# Display the transformed training feature array.
input_feature_train_arr

array([[ 0.43405338,  0.03079054,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 0.96470125,  0.9302895 ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.1636942 ,  1.34544287,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.02522827, -0.52274728,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [-1.02522827, -1.49143847,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.36268716,  1.48382733,  1.        , ...,  1.        ,
         0.        ,  1.        ]])

In [62]:
# View train_arr (transformed features plus target) as a DataFrame.
pd.DataFrame(train_arr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.434053,0.030791,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,62.0
1,0.964701,0.930290,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,66.0
2,1.163694,1.345443,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,79.0
3,-0.162925,-0.176786,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,61.0
4,-0.759904,-0.384363,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,2.092328,2.106557,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,87.0
796,-0.494580,-0.453555,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,69.0
797,-1.025228,-0.522747,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,53.0
798,-1.025228,-1.491438,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,50.0


In [2]:
# Per-model scores; each value is a two-element list that the cells below
# label as [training_accuracy, test_accuracy].
# NOTE(review): values appear to be pasted from a separate training run - confirm source.
report = {
    'Random Forest': [0.9768311465252886, 0.848122541489547],
    'Decision Tree': [0.9996534669718089, 0.7474299165277655],
    'Gradient Boosting': [0.9050396644022572, 0.8722935106254908],
    'Linear Regression': [0.8743172040139593, 0.8804332983749564],
    'XGBRegressor': [0.9954995444196413, 0.8230898008444014],
    'CatBoosting Regressor': [0.9587006447072537, 0.8523560006768236],
    'AdaBoost Regressor': [0.8500466364086933, 0.8501466656875872],
}

In [5]:
# The DataFrame constructor has no `axis` keyword (passing one raises
# TypeError). To lay the dict out with one row per model, orient it by index.
pd.DataFrame.from_dict(report, orient='index')

In [19]:
# One row per model: the dict keys become the 'models' column and the two
# list positions become the training/test score columns.
score_column_names = {'index': 'models', 0: 'training_accuracy', 1: 'test_accuracy'}
df = (
    pd.DataFrame.from_dict(data=report, orient='index')
    .reset_index()
    .rename(columns=score_column_names)
)

In [20]:
# Display the per-model score table.
df

Unnamed: 0,models,training_accuracy,test_accuracy
0,Random Forest,0.976831,0.848123
1,Decision Tree,0.999653,0.74743
2,Gradient Boosting,0.90504,0.872294
3,Linear Regression,0.874317,0.880433
4,XGBRegressor,0.9955,0.82309
5,CatBoosting Regressor,0.958701,0.852356
6,AdaBoost Regressor,0.850047,0.850147


In [25]:
# Position of the first row whose test_accuracy equals the column maximum.
test_scores = df['test_accuracy'].values
max_accuracy = test_scores.max()
index = np.flatnonzero(test_scores == max_accuracy)[0]
index

3

In [27]:
# Select the row at position `index`, i.e. the row with the highest test_accuracy.
best = df.iloc[index]

In [28]:
# Name of the model in the selected row.
best.models

'Linear Regression'