<a href="https://colab.research.google.com/github/Ovizero01/Machine-Leaning/blob/main/027_Model%20Optimization%20%26%20Experiment%20Tracking%20in%20Machine%20Learning/Student%20performance%20prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#sklearn preprocessing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


#Regression model

from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor



from sklearn.ensemble import VotingRegressor, StackingRegressor


#metrics

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw student-performance CSV from the Colab /content directory.
df = pd.read_csv("/content/bangladesh_student_performance_2018.csv")

In [None]:
# Display the loaded DataFrame as the cell output.
df

# YData Profiling

In [None]:
!pip install ydata-profiling


In [None]:
from ydata_profiling import ProfileReport

# Build an exploratory profiling report of the raw DataFrame and write it
# out as a standalone HTML file.
profile = ProfileReport(
    df,
    title="Bangladesh Student performance prediction",
    explorative=True,
)
profile.to_file("ydata.html")

In [None]:
# Number of columns in the raw data.
df.shape[1]

In [None]:
# Number of rows in the raw data.
df.shape[0]

#simplify column names

In [None]:
# Short snake_case names, listed in the same order as the original columns.
# The assignment requires exactly one name per existing column.
simplified_names = [
    'date',
    'st_gender', 'st_age', 'st_address', 'st_fam_size',
    'pstatus', 'm_edu', 'f_edu', 'm_job', 'f_job',
    'st_relationship', 'smoker', 'tuition_fee', 'time_with_friends',
    'ssc_result', 'hsc_result',
]
df.columns = simplified_names

In [None]:
# Column count after renaming.
len(df.columns)

#drop data

In [None]:
# Remove the 'date' column in place; if it is already gone this is a no-op,
# so the cell can be re-run safely.
df.drop(columns=['date'], errors='ignore', inplace=True)

In [None]:
# Column count after dropping 'date'.
len(df.columns)

#Size

In [None]:
# (rows, columns) of the cleaned DataFrame.
df.shape

# Correlation for Numerical Values

In [None]:
# Correlate every numeric column with the target and list the strongest
# positive relationships first.
numeric_only = df.select_dtypes(include=np.number)
correlation_matrix = numeric_only.corr()
corr_target = correlation_matrix['hsc_result'].sort_values(ascending=False)
print(corr_target)

# Separate X and y

In [None]:
# Target is the HSC result; every remaining column is a predictor.
y = df['hsc_result']
X = df.drop(columns=['hsc_result'])

#Numerical Column and Categorical Columns

In [None]:
# Split the predictors by dtype so each group gets its own preprocessing.
# np.number matches every numeric width (int32, float32, ...) instead of
# only int64/float64, so numeric columns are not silently left out of the
# ColumnTransformer.
numeric_features = X.select_dtypes(include=np.number).columns
# 'category' is accepted next to 'object' so pandas categoricals are routed
# to the categorical branch as well.
categorical_features = X.select_dtypes(include=['object', 'category']).columns

In [None]:
# Show which columns were classified as numeric.
numeric_features

In [None]:
# Show which columns were classified as categorical.
categorical_features

#pipeline

In [None]:
# Numeric branch: fill gaps with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(numeric_steps)

# For categorical feature

In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group through its own transformer and join the outputs.
column_routes = [
    ('num', num_transformer, numeric_features),
    ('cat', cat_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
#split them

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ensemble - boosting, voting, stacking

In [None]:
# Base learners: one linear model and two tree ensembles. The tree models
# share the same size and seed so their results are reproducible.
reg_lr = LinearRegression()
reg_rf = RandomForestRegressor(random_state=42, n_estimators=100)
reg_gb = GradientBoostingRegressor(random_state=42, n_estimators=100)

In [None]:
# Voting ensemble built from all three base learners.
voting_members = [('lr', reg_lr), ('rf', reg_rf), ('gb', reg_gb)]
voting_reg = VotingRegressor(estimators=voting_members)

In [None]:
# Stacking ensemble: the two tree models feed a Ridge meta-learner.
stacking_members = [('rf', reg_rf), ('gb', reg_gb)]
meta_learner = Ridge()
stacking_reg = StackingRegressor(
    estimators=stacking_members,
    final_estimator=meta_learner,
)


# Model Training

In [None]:
# Display name -> estimator for every candidate model.
# The names carry no trailing whitespace, so they can be used verbatim as
# lookup keys and as labels in the results table.
model_to_train = {
    'Linear Regression': reg_lr,
    'Random Forest': reg_rf,
    'Gradient Boosting': reg_gb,
    'Voting Ensemble': voting_reg,
    'Stacking Ensemble': stacking_reg,
}

In [None]:
# Train every candidate behind the shared preprocessor and collect its
# test-split metrics, one row per model.

result = []

for name, model in model_to_train.items():
    # Wrapping the estimator lets it be fitted directly on the raw columns.
    candidate = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    candidate.fit(X_train, y_train)

    # Score on the held-out split.
    predictions = candidate.predict(X_test)
    mse = mean_squared_error(y_test, predictions)

    result.append(
        {
            "Model": name,
            "R2 Score": r2_score(y_test, predictions),
            "RMSE": np.sqrt(mse),
            "MAE": mean_absolute_error(y_test, predictions),
        }
    )

# Rank the candidates with the highest R2 first.
results_df = pd.DataFrame(result).sort_values("R2 Score", ascending=False)

print(results_df)

# Visualization

In [None]:
# results_df is sorted by R2 in descending order, so row 0 is the best model.
best_model_name = results_df.iloc[0]['Model']
best_model_obj = model_to_train[best_model_name]

# Refit the winning estimator behind the shared preprocessor and predict on
# the held-out split.
final_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model_obj)
])

final_pipe.fit(X_train, y_train)
y_final_pred = final_pipe.predict(X_test)

# Plot actual vs. predicted values.
plt.figure(figsize=(8, 6))

sns.scatterplot(x=y_test, y=y_final_pred, alpha=0.6, color='teal')

# Draw the y = x reference line across the observed value range instead of a
# fixed [2, 5] span, so it stays correct when actual or predicted values fall
# outside that window.
line_min = min(y_test.min(), y_final_pred.min())
line_max = max(y_test.max(), y_final_pred.max())
plt.plot([line_min, line_max], [line_min, line_max], color="red", linestyle='--')

plt.xlabel("Actual HSC Result")
plt.ylabel("Predicted HSC result")
# Name the plotted model so the figure is self-explanatory.
plt.title(f"Actual vs Predicted: {best_model_name}")

plt.grid(True)
plt.show()

#Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Random forest behind the shared preprocessor, used for cross-validation.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pipeline = Pipeline([('preprocessor', preprocessor), ('model', forest)])

In [None]:
# cross validation

In [None]:
# 5-fold cross-validation on the training split. The scorer returns negated
# MSE, so negate it back and take the square root to get one RMSE per fold.
cv_scores = cross_val_score(
    rf_pipeline,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
cv_rmse = np.sqrt(np.negative(cv_scores))

print(cv_rmse)

In [None]:
# Average RMSE across the folds.
print(np.mean(cv_rmse))

In [None]:
# Spread of the RMSE across the folds.
print(np.std(cv_rmse))

# Stacking Ensemble

In [None]:
# Stacking ensemble behind the shared preprocessor.
stacking_steps = [
    ('preprocessor', preprocessor),
    ('model', stacking_reg),
]
stacking_pipeline = Pipeline(steps=stacking_steps)

In [None]:
# 5-fold cross-validation of the stacking pipeline, run on all CPU cores.
# Scores are negated MSE, converted below to one RMSE per fold.
cv_scores = cross_val_score(
    estimator=stacking_pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

stacking_rmse = np.sqrt(np.negative(cv_scores))

print(stacking_rmse)

# Grid search cv

In [None]:
# Fresh random-forest pipeline whose 'model' step the searches below tune.
search_steps = [
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42, n_estimators=100)),
]
rf_pipeline = Pipeline(steps=search_steps)

In [None]:
# Exhaustive grid; the 'model__' prefix addresses the pipeline's 'model' step.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Evaluate every grid combination with 5-fold CV on all cores, ranking by
# negated RMSE (higher is better).
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the training split only.
grid_search.fit(X_train,y_train)

In [None]:
# The search scores with 'neg_root_mean_squared_error', so flip the sign to
# report the best cross-validated RMSE as a positive number.
print(-grid_search.best_score_)

In [None]:
# Hyperparameter combination that achieved the best score.
print(grid_search.best_params_)

# Randomized Search cv

In [None]:
from scipy.stats import randint

# Frozen discrete-uniform distribution over the integers 1..9; the upper
# bound of randint is exclusive.
my_dist = randint(low=1, high=10)

# Two single draws, then a batch of five.
print(my_dist.rvs())
print(my_dist.rvs())
print(my_dist.rvs(size=5))

In [None]:
# Sampling distributions for the randomized search; the 'model__' prefix
# addresses the pipeline's 'model' step.
param_dist = {
    # Integers in [100, 499] (randint's upper bound is exclusive).
    'model__n_estimators': randint(100, 500),
    'model__max_depth': [None, 10, 20],
    # Start at 2: an integer min_samples_split below 2 is rejected by
    # scikit-learn, so a draw of 1 would make that candidate's fits fail.
    'model__min_samples_split': randint(2, 10),
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over param_dist with 5-fold CV on all cores, ranked by
# negated RMSE (higher is better).
random_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    # Sample several candidates: with a single iteration the "best" result
    # is just one random draw and nothing is actually compared.
    n_iter=10,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42,  # fixed seed so the sampled candidates are reproducible
)

In [None]:
# Run the randomized search on the training split only.
random_search.fit(X_train,y_train)

In [None]:
# The search scores with 'neg_root_mean_squared_error', so flip the sign to
# report the best cross-validated RMSE as a positive number.
print( -random_search.best_score_ )

In [None]:
# Sampled hyperparameters that achieved the best score.
print( random_search.best_params_ )

# Save Model

In [None]:
import pickle
from sklearn.linear_model import LinearRegression

# Tiny toy dataset for the save/load demo: study values 1..5 map to marks
# of exactly ten times the study value.
study_values = range(1, 6)
X_train_lr = [[value] for value in study_values]
y_train_lr = [10 * value for value in study_values]

model = LinearRegression()

model.fit(X_train_lr, y_train_lr)

In [None]:
# Predict for an input of 6 and unwrap the single value from the result.
model.predict( [ [6] ] )[0]

In [None]:
# Serialize the fitted toy model into the current working directory.
filename = "model.pkl"

with open(filename, mode="wb") as file:
    pickle.dump(model, file)

In [None]:
# Load the toy model back from disk.
# The file was written with a path relative to the working directory, so it
# is read back the same way; an absolute /content/ path only works when the
# notebook happens to run from that directory.
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code embedded in the file.
with open("model.pkl", "rb") as file:
    loaded_model = pickle.load(file)

In [None]:
# Sanity check: the restored model predicts for an input of 7.
loaded_model.predict( [[7 ]] )

#random forest

In [None]:
filename = "random_forest_model.pkl"

# Persist only the winning pipeline (preprocessor + random forest), which the
# search refits on the full training split by default. This keeps the file
# limited to what is needed for prediction and leaves out the search
# bookkeeping, while predict() on the loaded object behaves the same.
with open(filename, "wb") as file:
    pickle.dump(random_search.best_estimator_, file)

In [None]:
# Read the model back with the same working-directory-relative path it was
# saved under, so the cell also works outside a /content working directory.
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code embedded in the file.
with open("random_forest_model.pkl", "rb") as file:
    rf_loaded_model = pickle.load(file)

In [None]:
# Predict on the held-out test split with the model restored from disk.
rf_loaded_model.predict(X_test)

# MLFLOW

In [None]:
!pip install mlflow

# Let's Discover MLflow


In [None]:
import mlflow

In [None]:
# Store runs on the local filesystem and group them under one experiment.
mlflow.set_tracking_uri("file:///content/mlruns")
mlflow.set_experiment("test_run")

# Smoke test: log placeholder parameters and metrics in a single run.
with mlflow.start_run(run_name="Dummy_test"):
    # parameters
    mlflow.log_param("model_type", "fake_model_v1")
    mlflow.log_param("Learning Rate", 0.001)

    # metrics
    mlflow.log_metric("Accuracy", 0.95)
    mlflow.log_metric("Loss", 0.05)



# RF using MLFLOW

In [None]:
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Train one random-forest pipeline and record its hyperparameters and
# train/test RMSE as a single MLflow run.
mlflow.set_experiment("Student performance using rf")

# Hyperparameters passed to the forest and logged verbatim to MLflow.
my_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42
}

simple_rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**my_params))
])


#start mlflow

with mlflow.start_run(run_name="Single_rf"):

    mlflow.log_params(my_params)

    mlflow.log_param("model_type", "RandomForestRegressor")

    #train
    simple_rf_pipeline.fit(X_train, y_train)

    y_train_pred = simple_rf_pipeline.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

    # Metric keys use the same snake_case form ("train_rmse" / "test_rmse")
    # so the two values line up when compared in the tracking UI.
    mlflow.log_metric("train_rmse", train_rmse)

    #test

    y_test_pred = simple_rf_pipeline.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    mlflow.log_metric("test_rmse", test_rmse)







In [None]:
# Show the test-split RMSE computed in the MLflow run above.
test_rmse