In [2]:
import pandas as pd

In [3]:
# Import CSV into a DataFrame
df = pd.read_csv('../Datasets/archive/ebola_2014_2016_clean.csv')
# print(df)

In [4]:
# Get the summary statistics 
statistics = df.describe()
print(statistics)

              Cases       Deaths
count   2477.000000  2485.000000
mean    2553.678644  1028.347686
std     4427.118148  1656.064372
min        0.000000     0.000000
25%        1.000000     0.000000
50%        8.000000     6.000000
75%     3657.000000  2386.000000
max    14122.000000  4806.000000


In [5]:
selected_cols = df.iloc[:, [2,3]]
col_stats = selected_cols.describe()
print(col_stats)

              Cases       Deaths
count   2477.000000  2485.000000
mean    2553.678644  1028.347686
std     4427.118148  1656.064372
min        0.000000     0.000000
25%        1.000000     0.000000
50%        8.000000     6.000000
75%     3657.000000  2386.000000
max    14122.000000  4806.000000


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
df = pd.read_csv('../Datasets/archive/ebola_2014_2016_clean.csv')

# Data preprocessing
df['Date'] = pd.to_datetime(df['Date'])
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
df['dayofweek'] = df['Date'].dt.dayofweek

# Feature engineering
df['cases_7day_avg'] = df['Cases'].rolling(window=7).mean()
df['deaths_7day_avg'] = df['Deaths'].rolling(window=7).mean()

# Drop rows with NaN values created by rolling average
df = df.dropna()

# Define the target variable (example criteria)
def classify_status(row):
    if row['Cases'] < 50 and row['Deaths'] < 5:
        return 'normal conditions'
    elif row['Cases'] >= 50 and row['Cases'] < 200:
        return 'emergence'
    elif row['Cases'] >= 200 and row['Cases'] < 1000:
        return 'epidemic'
    else:
        return 'pandemic'

df['status'] = df.apply(classify_status, axis=1)

# Define features and target variable
features = df[['Cases', 'Deaths', 'cases_7day_avg', 'deaths_7day_avg', 'month', 'day', 'dayofweek']]
target = df['status']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


                   precision    recall  f1-score   support

normal conditions       1.00      1.00      1.00       235
         pandemic       1.00      1.00      1.00       257

         accuracy                           1.00       492
        macro avg       1.00      1.00      1.00       492
     weighted avg       1.00      1.00      1.00       492

[[235   0]
 [  0 257]]


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
df = pd.read_csv('../Datasets/archive/ebola_2014_2016_clean.csv')

# Data preprocessing
df['date'] = pd.to_datetime(df['Date'])
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['dayofweek'] = df['date'].dt.dayofweek

# Feature engineering
df['cases_7day_avg'] = df.groupby('Country')['Cases'].transform(lambda x: x.rolling(window=7).mean())
df['deaths_7day_avg'] = df.groupby('Country')['Deaths'].transform(lambda x: x.rolling(window=7).mean())

# Drop rows with NaN values created by rolling average
df = df.dropna()

# Define the target variable (example criteria)
def classify_status(row):
    if row['Cases'] < 50 and row['Deaths'] < 5:
        return 'normal conditions'
    elif row['Cases'] >= 50 and row['Cases'] < 200:
        return 'emergence'
    elif row['Cases'] >= 200 and row['Cases'] < 1000:
        return 'epidemic'
    else:
        return 'pandemic'

df['status'] = df.apply(classify_status, axis=1)

# Define features and target variable
features = df[['Country', 'Cases', 'Deaths', 'cases_7day_avg', 'deaths_7day_avg', 'month', 'day', 'dayofweek']]
target = df['status']

# One-hot encode the 'Country' feature
features = pd.get_dummies(features, columns=['Country'], drop_first=True)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42, stratify=target)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

                   precision    recall  f1-score   support

normal conditions       1.00      1.00      1.00       235
         pandemic       1.00      1.00      1.00       247

         accuracy                           1.00       482
        macro avg       1.00      1.00      1.00       482
     weighted avg       1.00      1.00      1.00       482

[[235   0]
 [  0 247]]
