# Importing Modules

In [1]:
import os
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Importing Dataset

In [2]:
# Location of the Excel workbook. Set the EDUCATION_DATA_PATH environment
# variable to point elsewhere; the default is the original author's path, so
# existing runs behave exactly as before.
DATA_PATH = os.environ.get('EDUCATION_DATA_PATH', r'/Users/mac/Downloads/Education.xlsx')

df = pd.read_excel(DATA_PATH)
df.head()
                

Unnamed: 0,Year,Expenditure on tertiary education (% of government expenditure on education),"School enrollment, tertiary (% gross)","Literacy rate, adult total (% of people ages 15 and above)","Tertiary education, academic staff (% female)"
0,1985,14.08417,29.591089,54.810108,30.278391
1,1986,15.09756,29.247841,55.440521,29.95422
2,1987,14.99714,31.64455,56.325809,30.16885
3,1988,15.85419,31.783649,57.064659,29.59181
4,1989,15.80763,33.770649,57.749352,30.018721


In [3]:
# Show the last two rows of the frame.
df.iloc[-2:]

Unnamed: 0,Year,Expenditure on tertiary education (% of government expenditure on education),"School enrollment, tertiary (% gross)","Literacy rate, adult total (% of people ages 15 and above)","Tertiary education, academic staff (% female)"
35,2020,25.66675,87.168533,79.961388,39.95805
36,2021,25.43258,93.924301,80.358002,40.71479


# Data preparation

In [19]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 5 columns):
 #   Column                                                                        Non-Null Count  Dtype  
---  ------                                                                        --------------  -----  
 0   Year                                                                          37 non-null     int64  
 1   Expenditure on tertiary education (% of government expenditure on education)  37 non-null     float64
 2   School enrollment, tertiary (% gross)                                         37 non-null     float64
 3   Literacy rate, adult total (% of people ages 15 and above)                    37 non-null     float64
 4   Tertiary education, academic staff (% female)                                 37 non-null     float64
dtypes: float64(4), int64(1)
memory usage: 1.6 KB


In [16]:
# Number of missing values in each column.
df.isna().sum()

Year                                                                            0
Expenditure on tertiary education (% of government expenditure on education)    0
School enrollment, tertiary (% gross)                                           0
Literacy rate, adult total (% of people ages 15 and above)                      0
Tertiary education, academic staff (% female)                                   0
dtype: int64

In [4]:
# Summary statistics with one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,37.0,2003.0,10.824355,1985.0,1994.0,2003.0,2012.0,2021.0
Expenditure on tertiary education (% of government expenditure on education),37.0,25.847804,5.13447,14.08417,25.66675,26.42908,29.443033,32.36272
"School enrollment, tertiary (% gross)",37.0,65.229347,18.473569,29.247841,54.690849,71.704758,78.239738,93.924301
"Literacy rate, adult total (% of people ages 15 and above)",37.0,68.887666,8.111143,54.810108,61.707691,70.409912,75.882919,80.358002
"Tertiary education, academic staff (% female)",37.0,33.180541,3.570889,28.25527,30.162519,32.22459,35.542068,40.71479


In [4]:
# Column the models are trained to predict; every other column is a feature.
TARGET_COLUMN = 'Literacy rate, adult total (% of people ages 15 and above)'

y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fitting Random Forest Model at different parameter settings

In [5]:
# Helper that trains one random forest and scores it on the held-out split
def fit_and_evaluate_model(params, X_train, X_test, y_train, y_test):
    """Fit a ``RandomForestRegressor`` built from ``params`` and score it.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``RandomForestRegressor``.
    X_train, y_train
        Data the forest is fitted on.
    X_test, y_test
        Data used for prediction and for the R-squared score.

    Returns
    -------
    tuple
        ``(fitted_model, comparison, r_squared)`` where ``comparison`` is a
        DataFrame with 'Actual' and 'Predicted' columns.
    """
    forest = RandomForestRegressor(**params)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    score = r2_score(y_test, predictions)

    comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
    return forest, comparison, score

# Baseline configuration
initial_params = dict(n_estimators=100, random_state=0)

# Alternative hyperparameter settings compared against the baseline
params_list = [
    dict(n_estimators=150, random_state=0),
    dict(n_estimators=100, max_depth=10, random_state=0),
    dict(n_estimators=200, min_samples_split=5, random_state=0),
]

# Display labels, in the same order as [initial_params, *params_list]
param_names = ["Initial Parameters", "150 Estimators", "Max Depth 10", "200 Estimators, Min Samples Split 5"]

# Fit the baseline first, then every alternative; each run is (model, table, r2)
initial_model, initial_results, initial_r2 = fit_and_evaluate_model(
    initial_params, X_train, X_test, y_train, y_test
)
runs = [(initial_model, initial_results, initial_r2)]
runs += [
    fit_and_evaluate_model(candidate, X_train, X_test, y_train, y_test)
    for candidate in params_list
]

# Split the runs back into the three parallel lists
models = [run[0] for run in runs]
results = [run[1] for run in runs]
r2_scores = [run[2] for run in runs]

results_table = pd.DataFrame({'Model': param_names, 'R-Squared': r2_scores})

print("Evaluation Metrics (R-Squared) for each model:")
print(results_table)

# First rows of the actual-vs-predicted table for every setting
for label, table in zip(param_names, results):
    print(f"\nResults for {label}:")
    print(table.head())

Evaluation Metrics (R-Squared) for each model:
                                 Model  R-Squared
0                   Initial Parameters   0.989136
1                       150 Estimators   0.985544
2                         Max Depth 10   0.989136
3  200 Estimators, Min Samples Split 5   0.968317

Results for Initial Parameters:
       Actual  Predicted
22  72.045631  72.008073
20  70.927567  71.246886
16  68.670097  69.375714
10  62.518120  61.291011
31  78.606552  78.729469

Results for 150 Estimators:
       Actual  Predicted
22  72.045631  71.936082
20  70.927567  71.260226
16  68.670097  69.710327
10  62.518120  61.315776
31  78.606552  78.707593

Results for Max Depth 10:
       Actual  Predicted
22  72.045631  72.008073
20  70.927567  71.246886
16  68.670097  69.375714
10  62.518120  61.291011
31  78.606552  78.729469

Results for 200 Estimators, Min Samples Split 5:
       Actual  Predicted
22  72.045631  71.883945
20  70.927567  71.185421
16  68.670097  70.497423
10  62.518120 

# Fitting other models to compare with random forest

In [16]:
# Baseline regressors to benchmark against the random forest.
models = {
    'Linear Regression': LinearRegression(),
    # Seeded (like the random forests above) so the reported score is reproducible.
    'Decision Tree': DecisionTreeRegressor(random_state=0),
}

# Test-set errors of each baseline, keyed by model name.
mse_results = {}
rmse_results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Store the error metrics instead of discarding the predictions.
    mse_results[model_name] = mean_squared_error(y_test, y_pred)
    rmse_results[model_name] = sqrt(mse_results[model_name])

In [18]:
# Score every fitted baseline on the held-out split, keyed by model name
r_squared_results = {
    label: r2_score(y_test, estimator.predict(X_test))
    for label, estimator in models.items()
}

print("R-squared for each model:")
for label, score in r_squared_results.items():
    print(f"{label}: {score}")

R-squared for each model:
Linear Regression: 0.9808966985807995
Decision Tree: 0.976521383788051


# Fitting the default model again and saving it

In [20]:
# Refit the configuration evaluated as "Initial Parameters" (100 trees, seed 0)
# so that the model being saved is the one whose score was reported, and so
# re-running the notebook reproduces the same forest.
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=0)
random_forest_model.fit(X_train, y_train)

In [21]:
from joblib import dump

# File the fitted forest is written to, relative to the working directory.
MODEL_FILENAME = 'random_forest_model.joblib'

dump(random_forest_model, MODEL_FILENAME)

['random_forest_model.joblib']

# Deploying the model through Streamlit

In [56]:
import os

import joblib
import numpy as np
import pandas as pd
import streamlit as st

# NOTE(review): Streamlit apps must be launched with `streamlit run <script>.py`;
# this cell presumably needs to be saved as a standalone script — confirm.

# Path of the saved model. Set LITERACY_MODEL_PATH to point elsewhere; the
# default is the original location.
MODEL_PATH = os.environ.get(
    "LITERACY_MODEL_PATH",
    r"/Users/mac/Desktop/ipynb files/random_forest_model.joblib",
)


@st.cache_resource
def load_model(path):
    """Load the saved regressor once and reuse it across Streamlit reruns.

    Streamlit re-executes the whole script on every widget interaction, so an
    uncached load would read the file from disk each time. joblib files are
    pickles: only load artefacts that come from a trusted source.
    """
    return joblib.load(path)


model = load_model(MODEL_PATH)
st.title("Literacy Rate Prediction")
year = st.number_input("Year", min_value=1900, max_value=2100, value=2020)
expenditure_on_tertiary_education = st.number_input("Expenditure on Tertiary Education (% of government expenditure on education)", min_value=0.0, max_value=100.0, value=10.0)
# NOTE(review): gross enrolment ratios can exceed 100% — confirm the 100.0 cap is intended.
school_enrollment_tertiary = st.number_input("School Enrollment, Tertiary (% gross)", min_value=0.0, max_value=100.0, value=50.0)
academic_staff_female = st.number_input("Tertiary Education, Academic Staff (% female)", min_value=0.0, max_value=100.0, value=50.0)


if st.button("Predict Literacy Rate"):
    # One row of features, in the same order as the training columns.
    feature_values = [[year, expenditure_on_tertiary_education, school_enrollment_tertiary, academic_staff_female]]

    # A model fitted on a DataFrame remembers its column names; passing a frame
    # with those names avoids scikit-learn's "X does not have valid feature
    # names" warning and lets it verify the columns match. Fall back to a plain
    # array when the model carries no names.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None and len(feature_names) == len(feature_values[0]):
        input_data = pd.DataFrame(feature_values, columns=list(feature_names))
    else:
        input_data = np.array(feature_values)

    literacy_rate_prediction = model.predict(input_data)

    st.write(f"Predicted Literacy Rate (adult total % of people ages 15 and above): {literacy_rate_prediction[0]:.2f}%")


2024-06-12 21:57:24.547 
  command:

    streamlit run /opt/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
