In [1]:
!pip install autogluon.tabular[all]



In [2]:
# import libraries
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the classification CSV into a DataFrame and preview its first rows
csv_path = '../input/zikavirusdataset/classification_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month,Target
0,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
1,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
2,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,1
3,12625.800781,-34.603684,-58.381559,24,7,0.0,11.6,87,6,16,0,59,0.0,1018,10,189,2016,3,0
4,12625.800781,-34.603684,-58.381559,22,10,0.0,11.5,76,10,27,34,72,0.0,1025,10,99,2016,3,0


In [4]:
# Dimensions of the freshly loaded frame as (rows, columns), before any cleaning
df.shape

(155223, 19)

In [5]:
# Total number of duplicated rows (rows identical in every column to an earlier row)
df.duplicated().sum()

79878

In [6]:
# Drop exact duplicate rows, retaining the first occurrence of each.
# keep='first' and inplace=False are the pandas defaults, so they are left implicit.
df = df.drop_duplicates()

In [7]:
# Dimensions after de-duplication, to confirm how many rows were removed
df.shape

(75345, 19)

# Train Test Split

In [8]:
# 60:40 train/test split, stratified on the label and seeded for repeatability
train_data, test_data = train_test_split(
    df,
    test_size=0.40,
    stratify=df['Target'],
    random_state=0,
)

In [9]:
# Sizes of the two partitions: (train shape, test shape)
train_data.shape, test_data.shape

((45207, 19), (30138, 19))

In [10]:
# Label counts per class in each partition (train first, then test)
tuple(part['Target'].value_counts() for part in (train_data, test_data))

(0    24440
 1    20767
 Name: Target, dtype: int64,
 0    16294
 1    13844
 Name: Target, dtype: int64)

# Build Model

In [11]:
# Column to predict, and the path handed to the predictor for saving its models
label = 'Target'
save_path = 'agModels-predictClass'

# Fit options: the 'best_quality' preset, with num_gpus=1 passed through ag_args_fit
fit_kwargs = {
    'presets': 'best_quality',
    'ag_args_fit': {'num_gpus': 1},
}
predictor = TabularPredictor(label=label, path=save_path).fit(train_data, **fit_kwargs)

In [12]:
# Print the summary of the fit (per-model validation score and timings) and show its plot
results = predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.853784      34.229328  3187.731263                0.120057          30.813571            3       True         26
1     ExtraTreesEntr_BAG_L2   0.850598      28.322572  2141.356989                1.713307           9.604133            2       True         21
2   RandomForestGini_BAG_L2   0.848652      27.994927  2163.408775                1.385662          31.655919            2       True         17
3   RandomForestEntr_BAG_L2   0.848453      27.985330  2185.248328                1.376065          53.495471            2       True         18
4     ExtraTreesGini_BAG_L2   0.848276      28.305440  2140.126373                1.696174           8.373516            2       True         20
5      LightGBMLarge_BAG_L2   0.844029      26.822635  2164.444400  

In [13]:
# Leaderboard scored against the training split itself: the score_test column here is
# measured on the same rows the models were fit on, so it is not a held-out estimate.
predictor.leaderboard(train_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist_BAG_L1,0.784104,0.361382,1.934995,3.22504,0.216061,1.934995,3.22504,0.216061,1,True,2
1,RandomForestGini_BAG_L1,0.784104,0.404672,2.810728,2.922862,37.902973,2.810728,2.922862,37.902973,1,True,5
2,RandomForestEntr_BAG_L1,0.784104,0.403057,3.652112,2.732013,53.390428,3.652112,2.732013,53.390428,1,True,6
3,ExtraTreesGini_BAG_L1,0.784104,0.39328,5.57547,3.36025,16.873792,5.57547,3.36025,16.873792,1,True,8
4,ExtraTreesEntr_BAG_L1,0.784104,0.39255,5.717083,4.083572,20.45164,5.717083,4.083572,20.45164,1,True,9
5,KNeighborsUnif_BAG_L1,0.681443,0.469308,1.826344,3.350831,0.412336,1.826344,3.350831,0.412336,1,True,1
6,LightGBMLarge_BAG_L1,0.635632,0.612272,1.341859,0.456228,69.269711,1.341859,0.456228,69.269711,1,True,13
7,WeightedEnsemble_L2,0.634039,0.612693,9.961996,2.292665,999.621824,0.008739,0.399927,58.50384,2,True,14
8,LightGBM_BAG_L1,0.624107,0.611454,1.407339,0.373376,30.976702,1.407339,0.373376,30.976702,1,True,4
9,NeuralNetMXNet_BAG_L1,0.619152,0.586679,12.820146,4.050263,990.749181,12.820146,4.050263,990.749181,1,True,12


# Make Prediction

In [14]:
# Separate the held-out features from the ground-truth label so that prediction
# is run on data that does not contain the answers
test_data_nolab = test_data.drop(columns=label)
y_test = test_data[label]
test_data_nolab.head()

Unnamed: 0,density_per_km,latitude,longitude,maxtempC,mintempC,totalSnow_cm,sunHour,moon_illumination,DewPointC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,winddirDegree,year,month
85860,49.728394,4.15663,-76.287762,27,16,0.0,12.5,59,14,5,18,68,16.6,1015,9,207,2016,9
50844,59.433132,9.144527,-74.226294,31,23,0.0,11.9,100,24,8,57,85,6.2,1010,7,229,2016,5
138749,209.804016,15.594288,-90.149499,30,17,0.0,13.0,84,17,7,13,74,1.0,1012,8,140,2016,5
107305,147.182251,19.052706,-70.149226,27,18,0.0,6.2,64,19,8,77,82,1.5,1016,7,158,2016,4
154932,835.046448,18.335765,-64.896335,28,27,0.0,8.7,96,23,41,42,77,12.9,1017,9,82,2016,12


In [15]:
# Reload the predictor from save_path. This is not required here (the fitted predictor is
# still in memory); it only demonstrates how a previously trained predictor is loaded.
predictor = TabularPredictor.load(save_path)

# Predict a class label for every row of the label-free test features and print them
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)

Predictions:  
 85860     0
50844     1
138749    1
107305    0
154932    1
         ..
76754     1
50884     1
141791    0
22127     1
60368     0
Name: Target, Length: 30138, dtype: int64


In [16]:
# Leaderboard scored against the held-out test split (score_test), shown next to each
# model's validation score (score_val)
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.838111,0.853784,51.163851,34.229328,3187.731263,0.005069,0.120057,30.813571,3,True,26
1,NeuralNetFastAI_BAG_L2,0.837945,0.841817,41.073607,27.740223,2615.061406,5.808134,1.130958,483.30855,2,True,22
2,RandomForestGini_BAG_L2,0.83768,0.848652,36.493062,27.994927,2163.408775,1.227589,1.385662,31.655919,2,True,17
3,XGBoost_BAG_L2,0.837381,0.8429,36.589302,26.809273,2138.004544,1.323829,0.200008,6.251688,2,True,23
4,RandomForestEntr_BAG_L2,0.837381,0.848453,36.779042,27.98533,2185.248328,1.513569,1.376065,53.495471,2,True,18
5,LightGBMLarge_BAG_L2,0.837215,0.844029,36.425624,26.822635,2164.4444,1.160151,0.21337,32.691544,2,True,25
6,LightGBM_BAG_L2,0.837149,0.841418,36.016913,26.755566,2147.156479,0.75144,0.146301,15.403623,2,True,16
7,CatBoost_BAG_L2,0.83632,0.840998,35.384029,26.744916,2191.50424,0.118556,0.135651,59.751383,2,True,19
8,ExtraTreesGini_BAG_L2,0.835357,0.848276,37.784068,28.30544,2140.126373,2.518595,1.696174,8.373516,2,True,20
9,ExtraTreesEntr_BAG_L2,0.834793,0.850598,37.688566,28.322572,2141.356989,2.423093,1.713307,9.604133,2,True,21


In [17]:
# Per-class probabilities for the held-out rows; preview the first five (head's default)
pred_probs = predictor.predict_proba(test_data_nolab)
pred_probs.head()


Unnamed: 0,0,1
85860,0.604167,0.395833
50844,0.014574,0.985426
138749,0.066298,0.933702
107305,0.987675,0.012325
154932,0.030492,0.969508


In [18]:
# Held-out accuracy of the predictor's default model (no `model` argument given).
# accuracy_score is imported once in the notebook's import cell instead of mid-notebook,
# and predictions are made from the label-free features, matching the earlier
# predict / predict_proba cells.
y_predi = predictor.predict(test_data_nolab)
score = accuracy_score(y_test, y_predi)
print(score)

0.8381113544362598


In [19]:
# Accuracy with NeuralNetFastAI_BAG_L2
# Held-out accuracy of one specific model, selected by name via `model=`.
# accuracy_score is already in scope, so it is not re-imported here; predictions are
# made from the label-free features.
y_predi = predictor.predict(test_data_nolab, model='NeuralNetFastAI_BAG_L2')
score = accuracy_score(y_test, y_predi)
print(score)

0.8379454509257416


In [20]:
# Accuracy with WeightedEnsemble_L3
# The model evaluated here is the one named in `model=` below, i.e. WeightedEnsemble_L3,
# not LightGBM_BAG_L2. To score LightGBM_BAG_L2 instead, pass model='LightGBM_BAG_L2'.
# accuracy_score is already in scope, so it is not re-imported here; predictions are
# made from the label-free features.
y_predi = predictor.predict(test_data_nolab, model='WeightedEnsemble_L3')
score = accuracy_score(y_test, y_predi)
print(score)

0.8381113544362598


In [21]:
# Name of the model the predictor reports as its best one
# NOTE(review): newer AutoGluon releases appear to expose this as the `model_best`
# property instead — confirm against the installed version.
predictor.get_model_best()

'WeightedEnsemble_L3'