# Autogluon Classifier

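Install AutoGluon and Bokeh (2.0.1); Bokeh is used for AutoGluon plots. AutoGluon works with Python 3.8 to 3.11 (check the AutoGluon installation guide for the range supported by your installed version).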

In [1]:
import numpy as np
import pandas as pd
import autogluon
from autogluon.tabular import TabularDataset
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import bokeh

In [2]:
# Load the sample deals and change the column type from object to category
# for Country, Industry and Deal Status (columns 3, 4 and 5).
categorical_columns = ['Country', 'Industry', 'Deal Status']
deals = pd.read_csv('Sample_Data_Deals2.csv').astype(
    {name: 'category' for name in categorical_columns}
)

In [3]:
# Show each column's dtype and non-null count to confirm the category conversion
deals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   OrderID        100 non-null    object  
 1   OrderQuantity  100 non-null    int64   
 2   OrderValue     100 non-null    int64   
 3   Country        100 non-null    category
 4   Industry       100 non-null    category
 5   Deal Status    100 non-null    category
dtypes: category(3), int64(2), object(1)
memory usage: 3.4+ KB


In [4]:
# Drop the OrderID identifier column by name (rather than by position) so the
# cell keeps working if the column order of the CSV changes. `drop` returns a
# new frame, so later column assignments on `deals1` never touch `deals`.
deals1 = deals.drop(columns=['OrderID'])
deals1.head()

Unnamed: 0,OrderQuantity,OrderValue,Country,Industry,Deal Status
0,371,383,Canada,Technology,Won
1,163,121,Canada,Finance,Won
2,191,117,Australia,Manufacturing,Lost
3,150,143,Australia,Manufacturing,Lost
4,165,148,Australia,Manufacturing,Lost


In [5]:
# Encode the categorical columns as integer codes.
# The values are mapped from the untouched `deals` frame (not from `deals1`),
# so re-running this cell always yields the same result instead of mapping the
# already-encoded values to NaN.
Country = {'Australia': 1, 'Canada': 2, 'China': 3, 'France': 4, 'Germany': 5}
Industry = {'Energy': 1, 'Finance': 2, 'Government': 3, 'Healthcare': 4,
            'Manufacturing': 5, 'Retail': 6, 'Technology': 7}
dealstat = {'Won': 1, 'Lost': 0}

encoded_columns = {}
for column, codes in (('Country', Country),
                      ('Industry', Industry),
                      ('Deal Status', dealstat)):
    encoded = deals[column].map(codes)
    # A label that is missing from the lookup would silently become NaN;
    # fail loudly so the lookup can be extended instead.
    unmapped = deals.loc[encoded.isna() & deals[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"No code defined for {column} value(s): {list(unmapped)}")
    encoded_columns[column] = encoded

# `assign` replaces the three columns in a new frame and keeps the column order.
deals1 = deals1.assign(**encoded_columns)

print(deals1.head())

   OrderQuantity  OrderValue Country Industry Deal Status
0            371         383       2        7           1
1            163         121       2        2           1
2            191         117       1        5           0
3            150         143       1        5           0
4            165         148       1        5           0


In [6]:
# Wrap the encoded frame as an AutoGluon TabularDataset, then hold out 25% of
# the rows for testing without shuffling.
# NOTE(review): an unshuffled split assumes row order carries no signal --
# confirm against how the CSV is sorted.
df = TabularDataset(deals1)
df_train, df_test = train_test_split(
    df,
    shuffle=False,
    test_size=0.25,
)
df_train.head()

Unnamed: 0,OrderQuantity,OrderValue,Country,Industry,Deal Status
0,371,383,2,7,1
1,163,121,2,2,1
2,191,117,1,5,0
3,150,143,1,5,0
4,165,148,1,5,0


In [28]:
# Define the target column
label = 'Deal Status'

# Create a TabularPredictor object; `path` is the directory AutoGluon uses for
# the predictor's saved artifacts.
predictor = TabularPredictor(label=label, path='autogluon_model')

# Train the predictor
predictor.fit(train_data=df_train,
              time_limit=120)  # Adjust time_limit (in seconds) as needed

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          12
Memory Avail:       15.23 GB / 31.64 GB (48.1%)
Disk Space Avail:   367.45 GB / 475.50 GB (77.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with very fast inference speed.
	presets='mediu

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x21b8c40c7d0>

In [31]:
# Score the trained models on the held-out test split and print the first three rows
leaderboard = predictor.leaderboard(data=df_test, silent=True)
top_models = leaderboard.head(3)
print(top_models)

             model  score_test  score_val eval_metric  pred_time_test  \
0         CatBoost        0.72   0.600000    accuracy         0.00000   
1   NeuralNetTorch        0.72   0.600000    accuracy         0.00000   
2  NeuralNetFastAI        0.68   0.666667    accuracy         0.01058   

   pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  \
0        0.00000  13.222656                  0.00000                 0.00000   
1        0.01572   0.901883                  0.00000                 0.01572   
2        0.00000   0.730980                  0.01058                 0.00000   

   fit_time_marginal  stack_level  can_infer  fit_order  
0          13.222656            1       True          7  
1           0.901883            1       True         12  
2           0.730980            1       True         10  


In [32]:
# Permutation-based feature importance measured on the held-out test split
predictor.feature_importance(data=df_test)

Computing feature importance via permutation shuffling for 4 features using 25 rows with 5 shuffle sets...
	0.49s	= Expected runtime (0.1s per shuffle set)
	0.21s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
OrderQuantity,0.272,0.081976,0.000881,5,0.440789,0.103211
OrderValue,0.176,0.082946,0.004503,5,0.346786,0.005214
Industry,0.024,0.021909,0.035242,5,0.069111,-0.021111
Country,0.0,0.0,0.5,5,0.0,0.0


In [33]:
# Generate predictions for the held-out test rows, then print them
predictions = predictor.predict(data=df_test)
print(predictions)

75    0
76    0
77    0
78    0
79    0
80    0
81    0
82    1
83    1
84    1
85    1
86    1
87    0
88    0
89    0
90    1
91    0
92    1
93    0
94    1
95    0
96    0
97    1
98    1
99    0
Name: Deal Status, dtype: object
