In [46]:
#import packages 
import sklearn
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [47]:
#Load dataset

DIRECTORY_WHERE_THIS_FILE_IS = os.path.dirname(os.path.abspath("ml_stacking.md"))
DATA_PATH = os.path.join(DIRECTORY_WHERE_THIS_FILE_IS, "data/prepared_dataset.csv")
df1 = pd.read_csv(DATA_PATH)

In [48]:
print(df1.shape)
print(df1.head())

#drop the index 
df1 = df1.drop("Unnamed: 0", axis = 1)

(791010, 24)
   Unnamed: 0 provincia_iso       fecha  num_casos_x  num_casos_prueba_pcr  \
0           0             A  2020-02-01            0                     0   
1           1             A  2020-02-01            0                     0   
2           2             A  2020-02-01            0                     0   
3           3             A  2020-02-01            0                     0   
4           4             A  2020-02-01            0                     0   

   num_casos_prueba_test_ac  num_casos_prueba_ag  num_casos_prueba_elisa  \
0                         0                    0                       0   
1                         0                    0                       0   
2                         0                    0                       0   
3                         0                    0                       0   
4                         0                    0                       0   

   num_casos_prueba_desconocida  dayyear  ...  num_hosp num_u

In [49]:
#some insights
print("Number of provinces: {}".format(df1['provincia'].nunique()))
print("Approximate number of days by region: {}".format(df1['fecha'].nunique()))
print("Number of distinct age groups: {}".format(df1['grupo_edad'].nunique()))
print('')
print(517*51*10)
print(df1.describe())

Number of provinces: 51
Approximate number of days by region: 517
Number of distinct age groups: 10

263670
         num_casos_x  num_casos_prueba_pcr  num_casos_prueba_test_ac  \
count  791010.000000         791010.000000             791010.000000   
mean      141.604013            108.366974                  0.178367   
std       365.537805            286.652506                  1.234456   
min         0.000000              0.000000                  0.000000   
25%         8.000000              6.000000                  0.000000   
50%        43.000000             32.000000                  0.000000   
75%       124.000000             98.000000                  0.000000   
max      7017.000000           6602.000000                 32.000000   

       num_casos_prueba_ag  num_casos_prueba_elisa  \
count        791010.000000           791010.000000   
mean             30.447302                0.576516   
std             122.692577                5.044326   
min               0.000000 

## Machine learning

### Preprocessing

In [133]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
df1.columns

Index(['provincia_iso', 'fecha', 'num_casos_x', 'num_casos_prueba_pcr',
       'num_casos_prueba_test_ac', 'num_casos_prueba_ag',
       'num_casos_prueba_elisa', 'num_casos_prueba_desconocida', 'dayyear',
       'year', 'sexo', 'grupo_edad', 'num_casos_y', 'num_hosp', 'num_uci',
       'num_def', 'people_fully_vaccinated_per_hundred', 'France_cases_mil',
       'Portugal_cases_mil', 'date', 'provincia', 'poblacion', 'Communidad'],
      dtype='object')

In [92]:
#remove correlated features
def get_correlation(data, threshold):
    corr_col = set()
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if abs(corrmat.iloc[i, j]) > threshold:
                colname = corrmat.columns[i]
                corr_col.add(colname)
    return corr_col

In [149]:
def Prepare_dataset(df = df1,target = "num_casos_x",test = 0.2,dimension_reduction = False,scale = True):
    
    #drop the province name and date
    df = df.drop(labels=['provincia','fecha','date'], axis = 1)
    
    #scale ( not the dummies or target)
    col = ['provincia_iso','Communidad','grupo_edad','sexo','year','dayyear',target]

  
    if scale == True:
        col_s = ['num_casos_x', 'num_casos_prueba_pcr',
       'num_casos_prueba_test_ac', 'num_casos_prueba_ag',
       'num_casos_prueba_elisa', 'num_casos_prueba_desconocida','num_casos_y', 'num_hosp', 'num_uci',
       'num_def', 'people_fully_vaccinated_per_hundred', 'France_cases_mil',
       'Portugal_cases_mil',  'poblacion']
        S = pd.DataFrame(df.loc[:,col_s])
        scaler = StandardScaler().fit(S)
        S = pd.DataFrame(scaler.transform(pd.DataFrame(S)))
        df = pd.concat([df, S], axis=1)
    
    
    #dumify state and regions  
    df = pd.get_dummies(df, columns=['provincia_iso','Communidad','grupo_edad','sexo'], prefix = ['province_','communidad_','age_','gender_'],drop_first=True)
    
    
    #x y split
    y = df[target]
    X = df.loc[:,df.columns != target]
    
    
    if dimension_reduction == False:
        #delete highly correlated features
        corr_features = get_correlation(X, 0.80)
        X = X.drop(labels=corr_features, axis = 1)
    else:
        #PCA dimension reduction
        pca = PCA(n_components= len(X.columns) ) #covariant Matrix
        x_pca = pca.fit_transform(X)
        variance = pca.explained_variance_ratio_ #calculate variance ratios
        var=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)
        x_pca = pd.DataFrame(x_pca)
        for el in range(0,len(x_pca)):
            print(el)
            if x_pca[el] < 65:
                X = x_pca.drop(labels = el, axis = 1)
    #train test
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test, random_state=42)
    
    #prepare weak learner dataset
    weak_leaner = pd.DataFrame()
    weak_leaner['targetTRUE'] = df[target]
    return X_train, X_test, y_train, y_test,weak_leaner

In [150]:
Prepare_dataset(dimension_reduction = True)

MemoryError: Unable to allocate 646. MiB for an array with shape (791010, 107) and data type float64

## Random forest 

In [268]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV





random_grid = {'n_estimators': [200,300,500,800,1300,1500],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [10,30,50,80],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4]}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = 2, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)
rf_random.best_params_

Fitting 2 folds for each of 5 candidates, totalling 10 fits


{'n_estimators': 500,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 80}

'n_estimators': 500,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 80}

In [269]:
# creating a RF classifier
clf = RandomForestClassifier(n_estimators = 500,min_samples_split = 10,
 min_samples_leaf= 1,
 max_features = 'sqrt',
 max_depth = 80)  
  
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
  
# performing predictions on the test dataset
y_pred = clf.predict(X_test)
  
# metrics are used to find accuracy or error
from sklearn import metrics  
print()
  
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL:  0.7538699690402477


In [270]:
weak_leaner['targetrandomF'] = y_pred

In [271]:
df1['jets'] = weak_leaner['targetrandomF']

## Ridge regression

In [272]:
from sklearn.linear_model import LogisticRegression

In [273]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print()
  
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL:  0.6617647058823529


In [274]:
weak_leaner['targetridge'] = y_pred

## SVM 

In [275]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline


param_grid = {'C': [0.1, 1, 10, 100], 
              'gamma': [0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, cv = 3)
  
#### fitting the model for grid search
grid.fit(X_train, y_train)
grid.best_params_

In [276]:
csvm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
csvm.fit(X_train, y_train)


y_pred = csvm.predict(X_test)

print()
  
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))



ACCURACY OF THE MODEL:  0.7360681114551083


In [277]:
weak_leaner['targetsvm'] = y_pred

In [278]:
df1['jets'] = weak_leaner['targetsvm']

## Nearest neighbor

In [279]:
from sklearn.neighbors import NearestCentroid
neigh = NearestCentroid()
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)

print()
  
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL:  0.6110681114551083


In [280]:
weak_leaner['targetneigh'] = y_pred

## PCA LDA 

In [281]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 

In [283]:
pca = PCA(n_components=13) #covariant Matrix
x_pca = pca.fit_transform(X)
df_pca =  pca.fit_transform(df9)
variance = pca.explained_variance_ratio_ #calculate variance ratios
var=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)
print(var)

[ 15.6  28.1  36.6  44.8  52.6  60.3  68.   75.5  83.   90.   94.8  98.6
 100.2]


In [284]:
x_pca = pd.DataFrame(x_pca)
x_pca = x_pca.drop(labels = [0,1,2,3,4,5], axis = 1)


In [285]:
df_pca  = pd.DataFrame(df_pca)
df_pca = df_pca.drop(labels = [0,1,2,3,4,5], axis = 1)

In [286]:
#train test
X_train, X_test, y_train, y_test = train_test_split(
   x_pca, y, test_size=0.20, random_state=42)

In [287]:
model = LinearDiscriminantAnalysis()
model.fit(X_train, y_train)

LinearDiscriminantAnalysis()

In [288]:
y_pred = model.predict(X_test)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
#weak_leaner['targetlda'] = y_pred

ACCURACY OF THE MODEL:  0.6408668730650154


## Strong learners

In [289]:
#data 
weak_leaner.head()

Unnamed: 0,targetrandomF,targetridge,targetsvm,targetneigh
0,bijet,bijet,bijet,tetrajet
1,trijet,trijet,trijet,tetrajet
2,trijet,trijet,trijet,tetrajet
3,bijet,bijet,bijet,bijet
4,bijet,bijet,bijet,trijet


In [290]:
weak_leaner = pd.get_dummies(weak_leaner, columns=['targetrandomF','targetridge',
                                                   'targetneigh','targetsvm'], prefix = ['random_','ridge_','neigh_','svm_']
                             ,drop_first=True)


In [291]:
weak_leaner

Unnamed: 0,random__tetrajet,random__trijet,ridge__tetrajet,ridge__trijet,neigh__tetrajet,neigh__trijet,svm__tetrajet,svm__trijet
0,0,0,0,0,1,0,0,0
1,0,1,0,1,1,0,0,1
2,0,1,0,1,1,0,0,1
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
2579,0,0,0,0,0,0,0,0
2580,0,1,0,0,0,0,0,1
2581,0,1,0,1,1,0,0,1
2582,0,1,0,0,0,1,0,1


In [292]:
y = weak_leaner["targetTRUE"]
X = weak_leaner.loc[:,weak_leaner.columns != "targetTRUE"]

KeyError: 'targetTRUE'

In [None]:
#train test
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.40, random_state=42)

## AdaBoost

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
boost = GradientBoostingClassifier(n_estimators=500, learning_rate=0.5,max_depth=50, random_state=0).fit(X_train, y_train)

In [None]:
y_pred = boost.predict(X_test)

print()
  
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

## Neural network

In [None]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(solver='lbfgs', alpha=1e-5,
                  hidden_layer_sizes=(5, 2), random_state=1, max_iter=150)
nn.fit(X_train, y_train)

y_pred = nn.predict(X_test)

print()
  
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

In [None]:
weak_leaner['targetnn'] = y_pred
df1['jets'] = weak_leaner['targetnn']

# Submit

In [None]:
df1.to_csv('submit7.csv',index=False,header = True)