In [25]:
import pandas as pd
import matplotlib.pyplot as plt # To create the plot
import seaborn as sns
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# baseline model
from sklearn.dummy import DummyRegressor

# This is for regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

# This is for Pipeline
from sklearn.pipeline import Pipeline 
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# we used for the Grid search model
from sklearn.model_selection import GridSearchCV 

### Read the dataset

In [3]:
# Load the cleaned Tadawul stock data into a DataFrame.
# NOTE(review): absolute, user-specific path — this cell only runs on a machine
# with the file at exactly this location; consider a configurable/relative path.
tadawul_stcks = pd.read_csv('/Users/fatimah/Desktop/Tadawul_stcks_clean.csv')

In [4]:
# Look at the first five rows
tadawul_stcks.head()

Unnamed: 0,trading_name,sector,date,open_price,high_price,low_price,close_price,change,perc_Change,volume_traded,value_traded,num_trades,Year,month,day,Change_category
0,SARCO,Energy,2020-03-05,35.55,35.85,34.9,34.9,-0.4,-1.13,436609.0,15399073.5,804.0,2020,March,Thursday,Bad Change
1,SARCO,Energy,2020-03-04,34.7,35.65,34.5,35.3,0.25,0.71,737624.0,25981391.35,1268.0,2020,March,Wednesday,Good Change
2,SARCO,Energy,2020-03-03,34.7,35.15,34.7,35.05,1.05,3.09,489831.0,17116413.4,854.0,2020,March,Tuesday,Good Change
3,SARCO,Energy,2020-03-02,35.2,35.65,34.0,34.0,-0.55,-1.59,736157.0,25858700.6,1242.0,2020,March,Monday,Bad Change
4,SARCO,Energy,2020-03-01,35.35,35.6,34.25,34.55,-2.05,-5.6,738685.0,25747967.55,1625.0,2020,March,Sunday,Bad Change


# ML model

## Select the feature and target

In [5]:
target = 'close_price' # Target variable
# NOTE(review): in the sample rows shown by head(), 'change' equals the row's
# close_price minus the previous trading day's close_price (e.g. 34.9 - 35.3 = -0.4),
# i.e. it is derived from the target. Confirm it is known at prediction time;
# otherwise it leaks close_price into the features.
features = ['open_price','low_price','change']


# Feature matrix and target vector used by every model below
X = tadawul_stcks[features]
y = tadawul_stcks[target]

## Standard Scaler

In [6]:
# Standardize the feature columns.
# NOTE(review): scaled_df is not referenced by any later cell visible here —
# the models below are trained on the unscaled X. The scaler is also fit on the
# full X before the train/test split, so using scaled_df for training would let
# test-set statistics influence the preprocessing.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)

## Split the data

In [7]:
# Split the data into train and test sets (no test_size given, so the
# library default split is used)
# random_state=42 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Baseline model

In [10]:
# Baseline: a DummyRegressor (default settings) fitted on the training split.
# Its R^2 on the test split is the reference every real model has to beat.
baseline_model = DummyRegressor().fit(X_train, y_train)
baseline_model_pred = baseline_model.predict(X_test)

print(f"baseline model score: {r2_score(y_test, baseline_model_pred)}")

baseline model score: -1.33849496373184e-05


# Train Model

### 1st Linear Regression Model

In [11]:
# Create the linear regression model and fit it on the training split
liner = LinearRegression().fit(X_train, y_train)
# Predict the target for the held-out test features
linear_pred = liner.predict(X_test)

### Train and Test score

In [15]:
# Score of the linear regression on the test split vs. the training split
print(f"Test score: {liner.score(X_test, y_test)}")
print(f"Train score: {liner.score(X_train, y_train)}")

Test score: 0.999605900045618
Train score: 0.9993704661079523


# --------------------------------------------------------------------------------------

### 2nd Random Forest regression

In [12]:
# Create the random forest model: 10 trees, each limited to depth 4.
# 'squared_error' is the current name of the MSE split criterion; the old
# 'mse' spelling is deprecated (it triggers a warning) and is rejected by
# newer scikit-learn releases.
# random_state is fixed so the bootstrap samples, and therefore the reported
# scores, are reproducible across runs.
random_reg = RandomForestRegressor(
    n_estimators=10,
    max_depth=4,
    criterion='squared_error',
    random_state=42,
)
# Fit the model using the training split
random_reg.fit(X_train, y_train)
# Predict the target for the held-out test features
random_predict = random_reg.predict(X_test)


  warn(


### Train and Test Score

In [17]:
# Score of the random forest on the test split vs. the training split
print(f"Test score: {random_reg.score(X_test, y_test)}")
print(f"Train score: {random_reg.score(X_train, y_train)}")

Test score: 0.9877978808475945
Train score: 0.9880452299578147


# ---------------------------------------------------------------------------------------

### 3rd KNN  Regression model

In [15]:
# Create the KNN regressor (default settings) and fit it on the training split
knn_reg = KNeighborsRegressor().fit(X_train, y_train)
# Predict the target for the held-out test features
preds_knn_reg = knn_reg.predict(X_test)


### Train and Test Score

In [22]:
# Score of the KNN regressor on the test split vs. the training split
print(f"Test score: {knn_reg.score(X_test, y_test)}")
print(f"Train score: {knn_reg.score(X_train, y_train)}")

Test score: 0.9995150671521293
Train score: 0.9992696578602894


# ----------------------------------------------------------------------------------------

### 4th GBR  Regression model

In [16]:
# Create the gradient boosting regressor with default hyperparameters.
# random_state is fixed so repeated runs of the notebook report the same scores.
gbr_reg = GradientBoostingRegressor(random_state=42)
# Fit the model using the training split
gbr_reg.fit(X_train, y_train)
# Predict the target for the held-out test features
preds_gbr_reg = gbr_reg.predict(X_test)

## Train and test score

In [24]:
# Score of the gradient boosting regressor on the test split vs. the training split
print(f"Test score: {gbr_reg.score(X_test, y_test)}")
print(f"Train score: {gbr_reg.score(X_train, y_train)}")

Test score: 0.9992229481338691
Train score: 0.999618349600064


# -----------------------------------------------------------------------------------------

### 5th XGB Regression Model

In [17]:
# Create the XGBoost regressor.
# "reg:squarederror" is the current name of the squared-error objective;
# "reg:linear" is its deprecated alias.
# subsample=0.75 draws a random 75% of the rows for each tree, so the seed is
# fixed to keep the reported scores reproducible.
xgb_reg = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=75,
    subsample=0.75,
    max_depth=7,
    random_state=42,
)
# Fit the model using the training split
xgb_reg.fit(X_train, y_train)
# Predict the target for the held-out test features
xgb_pred = xgb_reg.predict(X_test)



In [26]:
# Score of the XGBoost regressor on the test split vs. the training split
print(f"Test score: {xgb_reg.score(X_test, y_test)}")
print(f"Train score: {xgb_reg.score(X_train, y_train)}")

Test score: 0.9991932701703619
Train score: 0.9999117619114631


# ----------------------------------------------------------------------------------------

# Model Evaluation



### Create a function for the cost functions

#### This includes MSE, MAE, RMSE, R² and adjusted R²

In [18]:
# Cost function helper: prints all the regression error metrics for a model
def cost_function(pred, y_true=None, n_features=None):
    """Print R², MSE, MAE, RMSE and adjusted R² for a set of predictions.

    Parameters
    ----------
    pred : array-like
        Predicted target values, aligned with ``y_true``.
    y_true : array-like, optional
        True target values. Defaults to the notebook-level ``y_test``.
    n_features : int, optional
        Number of predictors used by the model (needed for adjusted R²).
        Defaults to the number of columns of the notebook-level ``X_test``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    if y_true is None:
        y_true = y_test
    if n_features is None:
        n_features = X_test.shape[1]

    n_samples = len(y_true)
    # Each metric is computed once and reused (R² feeds adjusted R², MSE feeds RMSE)
    r2 = r2_score(y_true, pred)
    mse = mean_squared_error(y_true, pred)
    # Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1)
    adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

    print("R Squared:", r2)
    print("MSE:", mse)
    print("MAE:", mean_absolute_error(y_true, pred))
    print("RMSE:", np.sqrt(mse))
    print("Adjusted R Squared:", adj_r2)
    

###  1st Model  Evaluation - Linear Regression Model

In [29]:
# Print the test-set metrics for the linear regression predictions
cost_function(linear_pred)

R Squared: 0.999605900045618
MSE: 2.844278557176044
MAE: 0.4264164928631083
RMSE: 1.6864989051808021
Adjusted R Squared: 0.9996058918836075


#### The output:
#### The R² of this model is 0.9996, which is far higher than the baseline's R² (≈ 0).
#### The MSE, MAE and RMSE errors are very low.


###  2nd Model  Evaluation - Random Forest Model

In [30]:
# Print the test-set metrics for the random forest predictions
cost_function(random_predict)

R Squared: 0.9877978808475945
MSE: 88.06452645172568
MAE: 5.740867009597165
RMSE: 9.384270160844991
Adjusted R Squared: 0.9877976281355019


#### The output:
#### The R² of the random forest is 0.988, which is better than the baseline model but lower than the linear regression model.
#### Its MSE, MAE and RMSE are higher than those of linear regression, with values of 88.06, 5.74 and 9.38.

###  3rd Model  Evaluation -KNN Model

In [32]:
# Print the test-set metrics for the KNN predictions
cost_function(preds_knn_reg)


R Squared: 0.9995150671521293
MSE: 3.4998331908765845
MAE: 0.30185866158582897
RMSE: 1.8707841112422845
Adjusted R Squared: 0.9995150571089235


#### The output:
#### The R² of the KNN model is 0.9995, which is better than both the baseline model and the random forest. Its MSE, MAE and RMSE errors are also lower than the random forest's.

###  4th Model  Evaluation -GBR Model

In [33]:
# Print the test-set metrics for the gradient boosting predictions
cost_function(preds_gbr_reg)

R Squared: 0.9992229481338691
MSE: 5.608100016443012
MAE: 0.6715781578684098
RMSE: 2.368142735656576
Adjusted R Squared: 0.9992229320407299


#### The output:
#### The R² of the GBR model is 0.9992, which is far better than the baseline and comparable to the KNN and linear regression models.

###  5th Model  Evaluation -XGB Model

In [34]:
# Print the test-set metrics for the XGBoost predictions
cost_function(xgb_pred)


R Squared: 0.9991932701703619
MSE: 5.822290336146822
MAE: 0.3990513375335785
RMSE: 2.412942257109942
Adjusted R Squared: 0.9991932534625768


#### The R² of the XGB model is 0.9992, which is far better than the baseline model and comparable to the other models; its MSE, MAE and RMSE errors are also low.

##  Summary Result

In [31]:
# Summary table: one row of test-set metrics per model
df_perf_metrics = pd.DataFrame(columns=['Model','R2','MSE', 'MAE', 'RMSE'])
# model_name[i] is the label for the predictions stored in model_pred[i]
model_name = ['Baseline model', 'Linear Regression',  'Random Forest','KNN' , 'GBR' , 'XGB']
model_pred = [baseline_model_pred, linear_pred, random_predict, preds_knn_reg ,preds_gbr_reg ,xgb_pred]

def get_perf_metrics(model, n):
    """Write the test-set metrics of the n-th model into ``df_perf_metrics``.

    Parameters
    ----------
    model : str
        Model label. Not used by the function (the label is read from
        ``model_name[n]``); kept so existing calls keep working.
    n : int
        Position of the model in ``model_name`` / ``model_pred``; also the
        row label written in ``df_perf_metrics``.

    Returns
    -------
    None
        ``df_perf_metrics`` is updated in place.
    """
    pred = model_pred[n]
    # MSE is computed once and reused for RMSE
    mse = mean_squared_error(y_test, pred)

    df_perf_metrics.loc[n] = [
        model_name[n],
        r2_score(y_test, pred),
        mse,
        mean_absolute_error(y_test, pred),
        np.sqrt(mse),
    ]


for n, model in enumerate(model_name):
    get_perf_metrics(model, n)

In [32]:
df_perf_metrics

Unnamed: 0,Model,R2,MSE,MAE,RMSE
0,Baseline model,-1.3e-05,7217.246782,28.401061,84.954381
1,Linear Regression,0.999606,2.844279,0.426416,1.686499
2,Random Forest,0.987333,91.422137,5.789177,9.561492
3,KNN,0.999515,3.501239,0.301051,1.87116
4,GBR,0.999223,5.608102,0.671724,2.368143
5,XGB,0.999213,5.676292,0.396874,2.382497


# ---------------------------------------------------------------------------------------

# Model Selection 
#### After creating five regression models (Linear, Random Forest, KNN, XGB, GBR), all of them have a positive R² close to 1, whereas the baseline's R² is ≈ 0. So, every regression model is better than the baseline model.
#### We selected the Linear Regression model because it gives the highest R² and the lowest MSE and RMSE (KNN has a slightly lower MAE).
#### R Squared: 0.999605900045618
#### MSE: 2.8442785571761524
#### MAE: 0.4264164928631798
#### RMSE: 1.6864989051808341

## Model Optimization - Hyperparameter Tuning 

## Linear Regression Model - Best Model

In [22]:
# Hyperparameter grid: the only setting searched is whether to fit an intercept
param_grid_linear = {"fit_intercept": [True, False]}

# Cross-validated grid search over the linear model, scored by R^2,
# running on all available CPU cores
grid_linear_reg = GridSearchCV(
    liner,
    param_grid_linear,
    scoring='r2',
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split
grid_linear_reg.fit(X_train, y_train)
# Predict the held-out test features with the best estimator found
grid = grid_linear_reg.predict(X_test)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


### Train and Test Score

In [197]:
# Score of the tuned linear model on the test split vs. the training split
print(f"Test score: {grid_linear_reg.score(X_test, y_test)}")
print(f"Train score: {grid_linear_reg.score(X_train, y_train)}")

Test score: 0.9995051650270971
Train score: 0.9993856129670836


In [23]:
# Print the test-set metrics for the grid-search (tuned linear model) predictions
cost_function(grid)

R Squared: 0.999605900045618
MSE: 2.8442785571761524
MAE: 0.4264164928631798
RMSE: 1.6864989051808341
Adjusted R Squared: 0.9996058918836075


#### The R² score and the MSE, MAE and RMSE are the same as for the untuned linear regression model; tuning changed nothing.

# ML Pipeline for Best Model - Linear Regression

In [26]:
# Select the numerical feature columns by dtype (no need to compute summary
# statistics with describe() just to obtain the column names)
numeric_features = X_train.select_dtypes(include='number').columns

# Transformer for numerical data: fill missing values, then standardize.
# NOTE(review): 'most_frequent' imputes with the mode, which is close to
# arbitrary for continuous prices — consider 'median' if the data can
# actually contain missing values.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('scaler', StandardScaler()),
    ]
)

# Preprocessor: apply the numeric transformer to the numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ]
)

## Create the pipeline

In [27]:
# Full pipeline: preprocessing step followed by a fresh linear regression model
liner_reg_pp = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', LinearRegression()),
])

## Fit the model

In [28]:
# Fit the preprocessing steps and the regressor on the training split
liner_reg_pp.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['open_price', 'low_price', 'change'], dtype='object'))])),
                ('reg', LinearRegression())])

In [29]:
# Predict the target for the held-out test features with the fitted pipeline
pipline_predict=liner_reg_pp.predict(X_test)

### Train and Test score

In [26]:
# Score of the pipeline on the test split vs. the training split
print(f"Test score: {liner_reg_pp.score(X_test, y_test)}")
print(f"Train score: {liner_reg_pp.score(X_train, y_train)}")

Test score: 0.9995051650270971
Train score: 0.9993856129670836


### Model Evaluation

In [30]:
# Print the test-set metrics for the pipeline predictions
cost_function(pipline_predict)

R Squared: 0.999605900045618
MSE: 2.8442785571761515
MAE: 0.4264164928631796
RMSE: 1.686498905180834
Adjusted R Squared: 0.9996058918836075
