# Malaria data set for the last 12 months of 2024 for malaria cases in Uganda from the ministry of Health

In [1]:
## you will need the following libraries, install all required libraries

# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn
# !pip install catboost
# !pip install lightgbm
# !pip install xgboost

In [2]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from pandas import read_excel 
from IPython.display import display
%matplotlib inline

In [3]:
# read excell data set for malaria data in Uganda
data = read_excel('/Users/macbook/Documents/GITHUB/malaria/Malaria_prediction/data_malaria.xls')

In [4]:
data.head()

Unnamed: 0,period name,033B-AP01. OPD New,033B-AP02. Total OPD,033B-AP03. Expected eMTCT Mothers in Appt,033B-AP04. eMTCT Missed Appointments,033B-CD01a. Malaria (diagnosed) - Cases,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD02b. Dysentery - Deaths,033B-CD02d. Dysentery - Cases Positive,...,033B-CD11a. Neonatal tetanus - Cases,033B-CD11b. Neonatal tetanus- Deaths,033B-CD12a. Plague - Cases,033B-CD12b. Plague- Deaths,033B-CD13a. Typhoid Fever - Cases,033B-CD13b. Typhoid Fever- Deaths,033B-CD13c. Typhoid Fever- Cases Tested,033B-CD13d. Typhoid Fever- Cases Positive,033B-CD14a. Hepatitis B - Cases,033B-CD14b. Hepatitis B- Deaths
0,May 2023,2808536.0,3044521.0,41493.0,7618.0,914494.0,306.0,791.0,3.0,1402.0,...,,8.0,,,8288.0,53.0,22.0,3.0,791.0,32.0
1,June 2023,3267076.0,3433374.0,46461.0,8072.0,1169201.0,685.0,879.0,4.0,823.0,...,5.0,,,,9091.0,22.0,,,591.0,13.0
2,July 2023,2626526.0,2852837.0,37744.0,7922.0,899466.0,192.0,760.0,11.0,1257.0,...,1.0,1.0,,,7687.0,9.0,14.0,4.0,573.0,
3,August 2023,3019946.0,3282068.0,46564.0,11898.0,939092.0,227.0,1321.0,17.0,744.0,...,15.0,8.0,34.0,16.0,9173.0,18.0,34.0,14.0,793.0,6.0
4,September 2023,2147468.0,2336231.0,36550.0,6246.0,873292.0,297.0,921.0,1.0,763.0,...,2.0,8.0,,8.0,7260.0,22.0,25.0,7.0,767.0,1.0


In [5]:
# randomly shuffle the data 
data = data.sample(frac=1)

In [6]:
# Converting the period column to proper date format. 
# Assuming df is your DataFrame and 'period name' is a datetime column
data['period name'] = pd.to_datetime(data['period name'])

# Extracting month and year from the period
data['month'] = data['period name'].dt.month
data['year'] = data['period name'].dt.year

# Creating lagged variables (e.g., previous month's malaria cases)
data['lag_1_month_malaria_cases'] = data['033B-CD01a. Malaria (diagnosed)  - Cases'].shift(1)

# Dropping the first row because it will have NaN after shifting
data = data.dropna(axis=1)

# Dropping the original period column as we have extracted useful features from it
data = data.drop(columns=['period name'])

# Now your dataframe includes month, year, and lagged malaria cases which can be used in prediction
print(data.head())


   033B-AP01. OPD New  033B-AP02. Total OPD  \
7           2078021.0             2144334.0   
4           2147468.0             2336231.0   
6           3049091.0             3286098.0   
3           3019946.0             3282068.0   
1           3267076.0             3433374.0   

   033B-AP03. Expected eMTCT Mothers in Appt  \
7                                    27806.0   
4                                    36550.0   
6                                    49076.0   
3                                    46564.0   
1                                    46461.0   

   033B-AP04. eMTCT Missed Appointments  \
7                                5350.0   
4                                6246.0   
6                                8380.0   
3                               11898.0   
1                                8072.0   

   033B-CD01a. Malaria (diagnosed)  - Cases  \
7                                  647512.0   
4                                  873292.0   
6                           

  data['period name'] = pd.to_datetime(data['period name'])


In [7]:
data.head()

Unnamed: 0,033B-AP01. OPD New,033B-AP02. Total OPD,033B-AP03. Expected eMTCT Mothers in Appt,033B-AP04. eMTCT Missed Appointments,033B-CD01a. Malaria (diagnosed) - Cases,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD02d. Dysentery - Cases Positive,033B-CD03a. SARI - Cases,033B-CD04a. Acute Flaccid Paralysis - Cases,033B-CD05a. AEFI - Cases,033B-CD06a. Animal bites(Suspected rabies) - Cases,033B-CD07a. Bacterial Meningitis - Cases,033B-CD10a. Measles - Cases,033B-CD13a. Typhoid Fever - Cases,033B-CD13b. Typhoid Fever- Deaths,033B-CD14a. Hepatitis B - Cases,month,year
7,2078021.0,2144334.0,27806.0,5350.0,647512.0,402.0,1013.0,612.0,1283.0,26.0,13.0,1270.0,52.0,241.0,5120.0,33.0,529.0,12,2023
4,2147468.0,2336231.0,36550.0,6246.0,873292.0,297.0,921.0,763.0,386.0,46.0,19.0,1376.0,29.0,248.0,7260.0,22.0,767.0,9,2023
6,3049091.0,3286098.0,49076.0,8380.0,995554.0,514.0,1583.0,1089.0,1216.0,39.0,32.0,1848.0,39.0,1083.0,8324.0,29.0,711.0,11,2023
3,3019946.0,3282068.0,46564.0,11898.0,939092.0,227.0,1321.0,744.0,801.0,40.0,26.0,1639.0,30.0,587.0,9173.0,18.0,793.0,8,2023
1,3267076.0,3433374.0,46461.0,8072.0,1169201.0,685.0,879.0,823.0,743.0,24.0,33.0,1621.0,37.0,601.0,9091.0,22.0,591.0,6,2023


In [8]:
df_size = int(0.7 * len(data))  # 70% of the dataset
validattion_data_size = len(data) - df_size  # Remaining 30% of the dataset

In [9]:
# Split the dataset into two portions
df = data[:df_size]
val_data = data[validattion_data_size:]

In [10]:
# Check the sizes of the two portions
print("Dataframe size:", len(df))
print("validation size:", len(val_data))

Dataframe size: 8
validation size: 8


In [11]:
# lets view how oue dataframe lookslike
display(df)

Unnamed: 0,033B-AP01. OPD New,033B-AP02. Total OPD,033B-AP03. Expected eMTCT Mothers in Appt,033B-AP04. eMTCT Missed Appointments,033B-CD01a. Malaria (diagnosed) - Cases,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD02d. Dysentery - Cases Positive,033B-CD03a. SARI - Cases,033B-CD04a. Acute Flaccid Paralysis - Cases,033B-CD05a. AEFI - Cases,033B-CD06a. Animal bites(Suspected rabies) - Cases,033B-CD07a. Bacterial Meningitis - Cases,033B-CD10a. Measles - Cases,033B-CD13a. Typhoid Fever - Cases,033B-CD13b. Typhoid Fever- Deaths,033B-CD14a. Hepatitis B - Cases,month,year
7,2078021.0,2144334.0,27806.0,5350.0,647512.0,402.0,1013.0,612.0,1283.0,26.0,13.0,1270.0,52.0,241.0,5120.0,33.0,529.0,12,2023
4,2147468.0,2336231.0,36550.0,6246.0,873292.0,297.0,921.0,763.0,386.0,46.0,19.0,1376.0,29.0,248.0,7260.0,22.0,767.0,9,2023
6,3049091.0,3286098.0,49076.0,8380.0,995554.0,514.0,1583.0,1089.0,1216.0,39.0,32.0,1848.0,39.0,1083.0,8324.0,29.0,711.0,11,2023
3,3019946.0,3282068.0,46564.0,11898.0,939092.0,227.0,1321.0,744.0,801.0,40.0,26.0,1639.0,30.0,587.0,9173.0,18.0,793.0,8,2023
1,3267076.0,3433374.0,46461.0,8072.0,1169201.0,685.0,879.0,823.0,743.0,24.0,33.0,1621.0,37.0,601.0,9091.0,22.0,591.0,6,2023
8,2461891.0,2726887.0,38542.0,7338.0,729655.0,278.0,1063.0,1001.0,1453.0,38.0,32.0,1354.0,62.0,678.0,8137.0,5.0,774.0,1,2024
10,803228968.0,2679901.0,37386.0,7095.0,616163.0,150.0,1481.0,530.0,931.0,33.0,98.0,1123.0,38.0,560.0,6178.0,3.0,721.0,3,2024
11,2458392.0,2459682.0,36386.0,6879.0,479413.0,233.0,1462.0,374.0,669.0,22.0,58.0,1355.0,35.0,395.0,5738.0,7.0,708.0,4,2024


In [12]:
# lets view the shape of our data ie Rows and Columns
df.shape

(8, 19)

In [13]:
# lets view our columns in the dataset
display(df.columns)

Index(['033B-AP01. OPD New', '033B-AP02. Total OPD',
       '033B-AP03. Expected eMTCT Mothers in Appt',
       '033B-AP04. eMTCT Missed Appointments',
       '033B-CD01a. Malaria (diagnosed)  - Cases',
       '033B-CD01b. Malaria (diagnosed) - Deaths',
       '033B-CD02a. Dysentery  - Cases',
       '033B-CD02d. Dysentery - Cases Positive', '033B-CD03a. SARI - Cases',
       '033B-CD04a. Acute Flaccid Paralysis - Cases',
       '033B-CD05a. AEFI - Cases',
       '033B-CD06a. Animal bites(Suspected rabies) - Cases',
       '033B-CD07a. Bacterial Meningitis  - Cases',
       '033B-CD10a. Measles - Cases', '033B-CD13a. Typhoid Fever - Cases',
       '033B-CD13b. Typhoid Fever- Deaths', '033B-CD14a. Hepatitis B - Cases',
       'month', 'year'],
      dtype='object')

In [14]:
# understanding the data type for each column in our dataset
df.dtypes

033B-AP01. OPD New                                    float64
033B-AP02. Total OPD                                  float64
033B-AP03. Expected eMTCT Mothers in Appt             float64
033B-AP04. eMTCT Missed Appointments                  float64
033B-CD01a. Malaria (diagnosed)  - Cases              float64
033B-CD01b. Malaria (diagnosed) - Deaths              float64
033B-CD02a. Dysentery  - Cases                        float64
033B-CD02d. Dysentery - Cases Positive                float64
033B-CD03a. SARI - Cases                              float64
033B-CD04a. Acute Flaccid Paralysis - Cases           float64
033B-CD05a. AEFI - Cases                              float64
033B-CD06a. Animal bites(Suspected rabies) - Cases    float64
033B-CD07a. Bacterial Meningitis  - Cases             float64
033B-CD10a. Measles - Cases                           float64
033B-CD13a. Typhoid Fever - Cases                     float64
033B-CD13b. Typhoid Fever- Deaths                     float64
033B-CD1

In [15]:
# lets see if we have any categorical columns in the dataset
object_cols = df.select_dtypes('object')
object_cols

7
4
6
3
1
8
10
11


In [16]:
# how many unique values do we have in the categorical columns
#object_cols['period name'].nunique()

## Exploratory Data Analysis

In [17]:
display(df.shape)

(8, 19)

In [18]:
# check for null values
df.isna().sum()

033B-AP01. OPD New                                    0
033B-AP02. Total OPD                                  0
033B-AP03. Expected eMTCT Mothers in Appt             0
033B-AP04. eMTCT Missed Appointments                  0
033B-CD01a. Malaria (diagnosed)  - Cases              0
033B-CD01b. Malaria (diagnosed) - Deaths              0
033B-CD02a. Dysentery  - Cases                        0
033B-CD02d. Dysentery - Cases Positive                0
033B-CD03a. SARI - Cases                              0
033B-CD04a. Acute Flaccid Paralysis - Cases           0
033B-CD05a. AEFI - Cases                              0
033B-CD06a. Animal bites(Suspected rabies) - Cases    0
033B-CD07a. Bacterial Meningitis  - Cases             0
033B-CD10a. Measles - Cases                           0
033B-CD13a. Typhoid Fever - Cases                     0
033B-CD13b. Typhoid Fever- Deaths                     0
033B-CD14a. Hepatitis B - Cases                       0
month                                           

In [19]:
# check duplicates
df.duplicated().sum()

0

In [20]:
# drop null values
# df.drop('033B-CD09b. Guinea Worm- Deaths', axis=1, inplace=True)
# df.isna().sum()

In [21]:
df.fillna(df['033B-CD01a. Malaria (diagnosed)  - Cases'].mean(), axis=0, inplace=True)
df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(df['033B-CD01a. Malaria (diagnosed)  - Cases'].mean(), axis=0, inplace=True)


033B-AP01. OPD New                                    0
033B-AP02. Total OPD                                  0
033B-AP03. Expected eMTCT Mothers in Appt             0
033B-AP04. eMTCT Missed Appointments                  0
033B-CD01a. Malaria (diagnosed)  - Cases              0
033B-CD01b. Malaria (diagnosed) - Deaths              0
033B-CD02a. Dysentery  - Cases                        0
033B-CD02d. Dysentery - Cases Positive                0
033B-CD03a. SARI - Cases                              0
033B-CD04a. Acute Flaccid Paralysis - Cases           0
033B-CD05a. AEFI - Cases                              0
033B-CD06a. Animal bites(Suspected rabies) - Cases    0
033B-CD07a. Bacterial Meningitis  - Cases             0
033B-CD10a. Measles - Cases                           0
033B-CD13a. Typhoid Fever - Cases                     0
033B-CD13b. Typhoid Fever- Deaths                     0
033B-CD14a. Hepatitis B - Cases                       0
month                                           

In [22]:
display(df)

Unnamed: 0,033B-AP01. OPD New,033B-AP02. Total OPD,033B-AP03. Expected eMTCT Mothers in Appt,033B-AP04. eMTCT Missed Appointments,033B-CD01a. Malaria (diagnosed) - Cases,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD02d. Dysentery - Cases Positive,033B-CD03a. SARI - Cases,033B-CD04a. Acute Flaccid Paralysis - Cases,033B-CD05a. AEFI - Cases,033B-CD06a. Animal bites(Suspected rabies) - Cases,033B-CD07a. Bacterial Meningitis - Cases,033B-CD10a. Measles - Cases,033B-CD13a. Typhoid Fever - Cases,033B-CD13b. Typhoid Fever- Deaths,033B-CD14a. Hepatitis B - Cases,month,year
7,2078021.0,2144334.0,27806.0,5350.0,647512.0,402.0,1013.0,612.0,1283.0,26.0,13.0,1270.0,52.0,241.0,5120.0,33.0,529.0,12,2023
4,2147468.0,2336231.0,36550.0,6246.0,873292.0,297.0,921.0,763.0,386.0,46.0,19.0,1376.0,29.0,248.0,7260.0,22.0,767.0,9,2023
6,3049091.0,3286098.0,49076.0,8380.0,995554.0,514.0,1583.0,1089.0,1216.0,39.0,32.0,1848.0,39.0,1083.0,8324.0,29.0,711.0,11,2023
3,3019946.0,3282068.0,46564.0,11898.0,939092.0,227.0,1321.0,744.0,801.0,40.0,26.0,1639.0,30.0,587.0,9173.0,18.0,793.0,8,2023
1,3267076.0,3433374.0,46461.0,8072.0,1169201.0,685.0,879.0,823.0,743.0,24.0,33.0,1621.0,37.0,601.0,9091.0,22.0,591.0,6,2023
8,2461891.0,2726887.0,38542.0,7338.0,729655.0,278.0,1063.0,1001.0,1453.0,38.0,32.0,1354.0,62.0,678.0,8137.0,5.0,774.0,1,2024
10,803228968.0,2679901.0,37386.0,7095.0,616163.0,150.0,1481.0,530.0,931.0,33.0,98.0,1123.0,38.0,560.0,6178.0,3.0,721.0,3,2024
11,2458392.0,2459682.0,36386.0,6879.0,479413.0,233.0,1462.0,374.0,669.0,22.0,58.0,1355.0,35.0,395.0,5738.0,7.0,708.0,4,2024


In [23]:
df.describe()

Unnamed: 0,033B-AP01. OPD New,033B-AP02. Total OPD,033B-AP03. Expected eMTCT Mothers in Appt,033B-AP04. eMTCT Missed Appointments,033B-CD01a. Malaria (diagnosed) - Cases,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD02d. Dysentery - Cases Positive,033B-CD03a. SARI - Cases,033B-CD04a. Acute Flaccid Paralysis - Cases,033B-CD05a. AEFI - Cases,033B-CD06a. Animal bites(Suspected rabies) - Cases,033B-CD07a. Bacterial Meningitis - Cases,033B-CD10a. Measles - Cases,033B-CD13a. Typhoid Fever - Cases,033B-CD13b. Typhoid Fever- Deaths,033B-CD14a. Hepatitis B - Cases,month,year
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,102713900.0,2793572.0,39846.375,7657.25,806235.2,348.25,1215.375,742.0,935.25,33.5,38.875,1448.25,40.25,549.125,7377.625,17.375,699.25,6.75,2023.375
std,283051200.0,485515.6,7067.344216,1964.582598,227950.8,176.720724,278.173603,236.571946,357.647372,8.684962,27.310058,234.943002,11.259916,271.305541,1552.393446,11.274972,92.857418,3.918819,0.517549
min,2078021.0,2144334.0,27806.0,5350.0,479413.0,150.0,879.0,374.0,386.0,22.0,13.0,1123.0,29.0,241.0,5120.0,3.0,529.0,1.0,2023.0
25%,2380661.0,2428819.0,36509.0,6720.75,639674.8,231.5,990.0,591.5,724.5,25.5,24.25,1333.0,33.75,358.25,6068.0,6.5,678.75,3.75,2023.0
50%,2740918.0,2703394.0,37964.0,7216.5,801473.5,287.5,1192.0,753.5,866.0,35.5,32.0,1365.5,37.5,573.5,7698.5,20.0,716.0,7.0,2023.0
75%,3103587.0,3283076.0,46486.75,8149.0,953207.5,430.0,1466.75,867.5,1232.75,39.25,39.25,1625.5,42.25,620.25,8515.75,23.75,768.75,9.5,2024.0
max,803229000.0,3433374.0,49076.0,11898.0,1169201.0,685.0,1583.0,1089.0,1453.0,46.0,98.0,1848.0,62.0,1083.0,9173.0,33.0,793.0,12.0,2024.0


In [24]:
numerical_columns = df.select_dtypes(exclude='object')
numerical_columns

Unnamed: 0,033B-AP01. OPD New,033B-AP02. Total OPD,033B-AP03. Expected eMTCT Mothers in Appt,033B-AP04. eMTCT Missed Appointments,033B-CD01a. Malaria (diagnosed) - Cases,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD02d. Dysentery - Cases Positive,033B-CD03a. SARI - Cases,033B-CD04a. Acute Flaccid Paralysis - Cases,033B-CD05a. AEFI - Cases,033B-CD06a. Animal bites(Suspected rabies) - Cases,033B-CD07a. Bacterial Meningitis - Cases,033B-CD10a. Measles - Cases,033B-CD13a. Typhoid Fever - Cases,033B-CD13b. Typhoid Fever- Deaths,033B-CD14a. Hepatitis B - Cases,month,year
7,2078021.0,2144334.0,27806.0,5350.0,647512.0,402.0,1013.0,612.0,1283.0,26.0,13.0,1270.0,52.0,241.0,5120.0,33.0,529.0,12,2023
4,2147468.0,2336231.0,36550.0,6246.0,873292.0,297.0,921.0,763.0,386.0,46.0,19.0,1376.0,29.0,248.0,7260.0,22.0,767.0,9,2023
6,3049091.0,3286098.0,49076.0,8380.0,995554.0,514.0,1583.0,1089.0,1216.0,39.0,32.0,1848.0,39.0,1083.0,8324.0,29.0,711.0,11,2023
3,3019946.0,3282068.0,46564.0,11898.0,939092.0,227.0,1321.0,744.0,801.0,40.0,26.0,1639.0,30.0,587.0,9173.0,18.0,793.0,8,2023
1,3267076.0,3433374.0,46461.0,8072.0,1169201.0,685.0,879.0,823.0,743.0,24.0,33.0,1621.0,37.0,601.0,9091.0,22.0,591.0,6,2023
8,2461891.0,2726887.0,38542.0,7338.0,729655.0,278.0,1063.0,1001.0,1453.0,38.0,32.0,1354.0,62.0,678.0,8137.0,5.0,774.0,1,2024
10,803228968.0,2679901.0,37386.0,7095.0,616163.0,150.0,1481.0,530.0,931.0,33.0,98.0,1123.0,38.0,560.0,6178.0,3.0,721.0,3,2024
11,2458392.0,2459682.0,36386.0,6879.0,479413.0,233.0,1462.0,374.0,669.0,22.0,58.0,1355.0,35.0,395.0,5738.0,7.0,708.0,4,2024


In [25]:
cor = numerical_columns.corr()
cor_feature = cor['033B-CD01b. Malaria (diagnosed) - Deaths'].sort_values(ascending=False).to_frame()
style = cor_feature.style.background_gradient(cmap='inferno')
style

Unnamed: 0,033B-CD01b. Malaria (diagnosed) - Deaths
033B-CD01b. Malaria (diagnosed) - Deaths,1.0
033B-CD01a. Malaria (diagnosed) - Cases,0.731453
033B-CD13b. Typhoid Fever- Deaths,0.639982
033B-CD06a. Animal bites(Suspected rabies) - Cases,0.609956
033B-CD02d. Dysentery - Cases Positive,0.487722
033B-AP02. Total OPD,0.486347
033B-CD13a. Typhoid Fever - Cases,0.40319
month,0.398432
033B-AP03. Expected eMTCT Mothers in Appt,0.393789
033B-CD10a. Measles - Cases,0.318192


In [26]:
cor_cases = df.corr()
correlation_feature = cor_cases['033B-CD01a. Malaria (diagnosed)  - Cases'].sort_values(ascending=False).to_frame()
style = correlation_feature.style.background_gradient(cmap='twilight')
style

Unnamed: 0,033B-CD01a. Malaria (diagnosed) - Cases
033B-CD01a. Malaria (diagnosed) - Cases,1.0
033B-CD13a. Typhoid Fever - Cases,0.846062
033B-AP02. Total OPD,0.772654
033B-CD06a. Animal bites(Suspected rabies) - Cases,0.764728
033B-AP03. Expected eMTCT Mothers in Appt,0.747466
033B-CD01b. Malaria (diagnosed) - Deaths,0.731453
033B-CD02d. Dysentery - Cases Positive,0.68382
033B-CD13b. Typhoid Fever- Deaths,0.492659
033B-AP04. eMTCT Missed Appointments,0.481061
033B-CD10a. Measles - Cases,0.443701


In [27]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Assuming the target variable is '033B-CD01a. Malaria (diagnosed) - Cases'

X = df.drop('033B-CD01a. Malaria (diagnosed)  - Cases', axis='columns')
y = df['033B-CD01a. Malaria (diagnosed)  - Cases']

# Standardizing the data
scaler = MinMaxScaler(feature_range=(0,1))
X_scaled = scaler.fit_transform(X)

# Using SelectKBest to find the top features
selector = SelectKBest(score_func=chi2, k=10)
selector.fit(X_scaled, y)

# Getting the selected features
selected_features = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_features]

print("Selected feature indices:", selected_features)
print("Selected feature names:", selected_feature_names)


Selected feature indices: [ 0  1  4  5  8  9 11 12 14 17]
Selected feature names: Index(['033B-AP01. OPD New', '033B-AP02. Total OPD',
       '033B-CD01b. Malaria (diagnosed) - Deaths',
       '033B-CD02a. Dysentery  - Cases',
       '033B-CD04a. Acute Flaccid Paralysis - Cases',
       '033B-CD05a. AEFI - Cases', '033B-CD07a. Bacterial Meningitis  - Cases',
       '033B-CD10a. Measles - Cases', '033B-CD13b. Typhoid Fever- Deaths',
       'year'],
      dtype='object')


In [28]:
selected_feature_names

Index(['033B-AP01. OPD New', '033B-AP02. Total OPD',
       '033B-CD01b. Malaria (diagnosed) - Deaths',
       '033B-CD02a. Dysentery  - Cases',
       '033B-CD04a. Acute Flaccid Paralysis - Cases',
       '033B-CD05a. AEFI - Cases', '033B-CD07a. Bacterial Meningitis  - Cases',
       '033B-CD10a. Measles - Cases', '033B-CD13b. Typhoid Fever- Deaths',
       'year'],
      dtype='object')

In [29]:
X = df[selected_feature_names] 

In [30]:
y

7      647512.0
4      873292.0
6      995554.0
3      939092.0
1     1169201.0
8      729655.0
10     616163.0
11     479413.0
Name: 033B-CD01a. Malaria (diagnosed)  - Cases, dtype: float64

In [31]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, confusion_matrix

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=0)
# create pipeline
# estimators = []
# estimators.append(('standardize', StandardScaler()))
# estimators.append(('RF', RandomForestRegressor()))
# model = Pipeline(estimators)
kfold = KFold(n_splits=7)

# create the sub models
estimators = []
model1 = RandomForestRegressor()
estimators.append(('Random Forest', model1))

model2 = LinearRegression()
estimators.append(('LinearRegression', model2))

model3 = CatBoostRegressor()
estimators.append(('CatBoostRegressor', model3))

# model4 = LGBMRegressor()
# estimators.append(('LGBMRegressor', model4))

model5 = AdaBoostRegressor()
estimators.append(('AdaBoostRegressor', model5))

# create the ensemble model
ensemble = VotingRegressor(estimators)
results = cross_val_score(ensemble, X, y, cv=kfold, scoring='neg_mean_squared_error')
print(results.mean())


Learning rate set to 0.018244
0:	learn: 233956.4474290	total: 81.4ms	remaining: 1m 21s
1:	learn: 232554.8429390	total: 81.5ms	remaining: 40.7s
2:	learn: 231204.7048532	total: 81.7ms	remaining: 27.1s
3:	learn: 229997.5507385	total: 81.8ms	remaining: 20.4s
4:	learn: 228576.8598527	total: 81.9ms	remaining: 16.3s
5:	learn: 227406.1828790	total: 82ms	remaining: 13.6s
6:	learn: 225915.2025386	total: 82.1ms	remaining: 11.6s
7:	learn: 224638.6735763	total: 82.2ms	remaining: 10.2s
8:	learn: 223481.9612309	total: 82.3ms	remaining: 9.07s
9:	learn: 222527.5961047	total: 82.8ms	remaining: 8.2s
10:	learn: 221381.9901760	total: 83ms	remaining: 7.46s
11:	learn: 219933.5765462	total: 83.1ms	remaining: 6.84s
12:	learn: 218428.0395201	total: 83.2ms	remaining: 6.31s
13:	learn: 217196.3971888	total: 83.3ms	remaining: 5.86s
14:	learn: 216205.7644130	total: 83.4ms	remaining: 5.48s
15:	learn: 215150.2545255	total: 83.5ms	remaining: 5.14s
16:	learn: 214168.9542065	total: 83.6ms	remaining: 4.83s
17:	learn: 2131

In [33]:
print(results.mean())

-3.2588949208884108e+16


In [34]:
pred_data = val_data.drop('033B-CD01a. Malaria (diagnosed)  - Cases', axis='columns')

In [35]:
ensemble.fit(X,y)

Learning rate set to 0.019093
0:	learn: 211997.1917624	total: 612us	remaining: 612ms
1:	learn: 210967.7167363	total: 1.1ms	remaining: 552ms
2:	learn: 209196.6252528	total: 1.25ms	remaining: 417ms
3:	learn: 208170.8915031	total: 1.49ms	remaining: 371ms
4:	learn: 207177.2398266	total: 1.98ms	remaining: 394ms
5:	learn: 205790.4828969	total: 2.22ms	remaining: 368ms
6:	learn: 204689.1750169	total: 2.33ms	remaining: 331ms
7:	learn: 203669.3467437	total: 2.48ms	remaining: 307ms
8:	learn: 202752.2612000	total: 2.79ms	remaining: 307ms
9:	learn: 201624.8528680	total: 3.03ms	remaining: 300ms
10:	learn: 200706.5762065	total: 3.49ms	remaining: 314ms
11:	learn: 199685.3964324	total: 4.17ms	remaining: 343ms
12:	learn: 198576.2200802	total: 4.98ms	remaining: 378ms
13:	learn: 197431.9351815	total: 5.3ms	remaining: 373ms
14:	learn: 196082.0311923	total: 5.58ms	remaining: 367ms
15:	learn: 195146.0826664	total: 6.06ms	remaining: 373ms
16:	learn: 194038.9734591	total: 6.55ms	remaining: 379ms
17:	learn: 192

In [36]:
pred_data.head()

Unnamed: 0,033B-AP01. OPD New,033B-AP02. Total OPD,033B-AP03. Expected eMTCT Mothers in Appt,033B-AP04. eMTCT Missed Appointments,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD02d. Dysentery - Cases Positive,033B-CD03a. SARI - Cases,033B-CD04a. Acute Flaccid Paralysis - Cases,033B-CD05a. AEFI - Cases,033B-CD06a. Animal bites(Suspected rabies) - Cases,033B-CD07a. Bacterial Meningitis - Cases,033B-CD10a. Measles - Cases,033B-CD13a. Typhoid Fever - Cases,033B-CD13b. Typhoid Fever- Deaths,033B-CD14a. Hepatitis B - Cases,month,year
1,3267076.0,3433374.0,46461.0,8072.0,685.0,879.0,823.0,743.0,24.0,33.0,1621.0,37.0,601.0,9091.0,22.0,591.0,6,2023
8,2461891.0,2726887.0,38542.0,7338.0,278.0,1063.0,1001.0,1453.0,38.0,32.0,1354.0,62.0,678.0,8137.0,5.0,774.0,1,2024
10,803228968.0,2679901.0,37386.0,7095.0,150.0,1481.0,530.0,931.0,33.0,98.0,1123.0,38.0,560.0,6178.0,3.0,721.0,3,2024
11,2458392.0,2459682.0,36386.0,6879.0,233.0,1462.0,374.0,669.0,22.0,58.0,1355.0,35.0,395.0,5738.0,7.0,708.0,4,2024
9,3305745.0,3331504.0,48557.0,8833.0,522.0,1521.0,1262.0,1097.0,35.0,24.0,1460.0,49.0,528.0,9578.0,26.0,908.0,2,2024


In [37]:
X.head()

Unnamed: 0,033B-AP01. OPD New,033B-AP02. Total OPD,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD04a. Acute Flaccid Paralysis - Cases,033B-CD05a. AEFI - Cases,033B-CD07a. Bacterial Meningitis - Cases,033B-CD10a. Measles - Cases,033B-CD13b. Typhoid Fever- Deaths,year
7,2078021.0,2144334.0,402.0,1013.0,26.0,13.0,52.0,241.0,33.0,2023
4,2147468.0,2336231.0,297.0,921.0,46.0,19.0,29.0,248.0,22.0,2023
6,3049091.0,3286098.0,514.0,1583.0,39.0,32.0,39.0,1083.0,29.0,2023
3,3019946.0,3282068.0,227.0,1321.0,40.0,26.0,30.0,587.0,18.0,2023
1,3267076.0,3433374.0,685.0,879.0,24.0,33.0,37.0,601.0,22.0,2023


In [38]:
X.shape

(8, 10)

In [39]:
pred_data.shape

(8, 18)

In [40]:
val = pred_data[selected_feature_names]
val.head()

Unnamed: 0,033B-AP01. OPD New,033B-AP02. Total OPD,033B-CD01b. Malaria (diagnosed) - Deaths,033B-CD02a. Dysentery - Cases,033B-CD04a. Acute Flaccid Paralysis - Cases,033B-CD05a. AEFI - Cases,033B-CD07a. Bacterial Meningitis - Cases,033B-CD10a. Measles - Cases,033B-CD13b. Typhoid Fever- Deaths,year
1,3267076.0,3433374.0,685.0,879.0,24.0,33.0,37.0,601.0,22.0,2023
8,2461891.0,2726887.0,278.0,1063.0,38.0,32.0,62.0,678.0,5.0,2024
10,803228968.0,2679901.0,150.0,1481.0,33.0,98.0,38.0,560.0,3.0,2024
11,2458392.0,2459682.0,233.0,1462.0,22.0,58.0,35.0,395.0,7.0,2024
9,3305745.0,3331504.0,522.0,1521.0,35.0,24.0,49.0,528.0,26.0,2024


In [41]:
val.shape

(8, 10)

In [42]:
pred= ensemble.score(val, y)
pred

-1.5562255340427313

In [43]:
y_pred = ensemble.predict(val)

In [44]:
print(y_pred)

[1132614.00088451  733632.22645245  633117.58536583  502920.51503424
  865932.37129355  931537.66005304  946847.52928862  812974.78831762]


In [45]:
print(y)

7      647512.0
4      873292.0
6      995554.0
3      939092.0
1     1169201.0
8      729655.0
10     616163.0
11     479413.0
Name: 033B-CD01a. Malaria (diagnosed)  - Cases, dtype: float64


In [46]:
MAE  = mean_absolute_error(y, y_pred)
print(MAE)

324095.91004971403


In [47]:
err = mean_absolute_percentage_error(y, y_pred)
print(f"{round(err*100,1)}%")

43.8%


In [48]:
import joblib

# Save the model to a file
joblib.dump(ensemble, 'Malaria_prediction_model.pkl')
# Load the model from the file
#loaded_model = joblib.load('Malaria_prediction_model.pkl')

['Malaria_prediction_model.pkl']

In [49]:
#import cv2