In [3]:
## importing library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
import pickle

In [4]:
    class Student_Predictor:
        def __init__(self,filepath,target_feature):
            self.filepath = filepath
            self.target_feature = target_feature
            self.model = None
            self.df = None
            self.X_train = None
            self.X_test = None
            self.y_train = None
            self.y_test = None
    
        def load_data(self):
            self.df = pd.read_csv(self.filepath)
            print("Data loaded sucessfully")
    
        def preprocessor(self):
            self.df.fillna(0,inplace=True)
            X = self.df.drop(self.target_feature,axis=1)
            y = self.df[self.target_feature]
            numerical_feature = X.select_dtypes(include=['int64']).columns
            categorical_feature = X.select_dtypes(include=['object']).columns
            self.X_train,self.X_test, self.y_train,self.y_test = train_test_split(X,y,test_size=0.20, random_state=42)
            num_pipeline = Pipeline(
                steps=[
                    ('imputer',SimpleImputer(strategy='median')),
                    ('scaler',StandardScaler())
                ]
            )
            cat_pipeline = Pipeline(
                steps=[
                    ("imputer",SimpleImputer(strategy='most_frequent')),
                    ("one_hot_encoder",OneHotEncoder(handle_unknown='ignore'))
                ]
            )
            preprocessor = ColumnTransformer(
                [
                    ("num",num_pipeline,numerical_feature),
                    ("cat",cat_pipeline,categorical_feature)
                ]
            )
            self.X_train = preprocessor.fit_transform(self.X_train)
            self.X_test = preprocessor.transform(self.X_test)
            self.preprocessor = preprocessor
            print("preprocessor sucessfully completed")
    
        def model_trainer(self):
            self.model = RandomForestRegressor()
            self.model = self.model.fit(self.X_train,self.y_train)
            print("model trained successfully")
    
        def evaluate_model(self):
            prediction = self.model.predict(self.X_test)
            r2 = r2_score(self.y_test,prediction)
            mse = mean_squared_error(self.y_test,prediction)
            mae = mean_absolute_error(self.y_test,prediction)
            print(f"r2 score:{r2}")
            print(f"Mean square error: {mse}")
            print(f"Mean absolute error:{mae}")
    
    
            ## saved model and preprocessor
        def saved_model(self,model_path='model.pkl',preprocessor_path='preprocessor.pkl'):
            with open(model_path,'wb') as model_file:
                pickle.dump(self.model, model_file)
    
            with open(preprocessor_path,'wb') as preprocessor_file:
                pickle.dump(self.preprocessor, preprocessor_file)
                print("saved model and preprocessor")
                
    
            ## load model and precessor 
        def load_model(self,model_path='model.pkl',preprocessor_path='preprocessor.pkl'):
            with open(model_path,'rb') as model_file:
                self.model = pickle.load(model_file)
    
            with open(preprocessor_path,'rb') as preprocessor_file:
                self.preprocessor = pickle.load(preprocessor_file)
        def predict_new_data(self, new_data):
            new_data_df = pd.DataFrame(new_data)
            transformer_preprocessor = self.preprocessor.transform(new_data_df)
            prediction = self.model.predict(transformer_preprocessor)
            print(f"Prediction with new data: {prediction}")
    
    
    
# After model is trained and loaded:
new_data = {
    'gender': ['Male'],
    'race_ethnicity': ['group C'],
    'parental_level_of_education': ['high school'],
    'lunch': ['standard'],
    'test_preparation_course': ['none'],
    'reading_score': [300],
    'writing_score': [70]}
    
        
        
    
    
            


In [5]:
obj = Student_Predictor(filepath="StudentsPer.csv",target_feature="math_score")
obj.load_data()
obj.preprocessor()
obj.model_trainer()
obj.evaluate_model()
#obj.saved_model()
#obj.load_model()
obj.predict_new_data(new_data)

Data loaded sucessfully
preprocessor sucessfully completed
model trained successfully
r2 score:0.8538559177146814
Mean square error: 35.56246715277778
Mean absolute error:4.635891666666667
Prediction with new data: [85.15]


In [59]:
print(obj.df.head())
obj.df['lunch'].value_counts()

   gender race_ethnicity parental_level_of_education  ... math_score reading_score  writing_score
0  female        group B           bachelor's degree  ...         72            72             74
1  female        group C                some college  ...         69            90             88
2  female        group B             master's degree  ...         90            95             93
3    male        group A          associate's degree  ...         47            57             44
4    male        group C                some college  ...         76            78             75

[5 rows x 8 columns]


standard        645
free/reduced    355
Name: lunch, dtype: int64