# Ensure the required Python libraries are installed

In [10]:
# Report the PyCaret release that this kernel imports.
import pycaret

pycaret_version = pycaret.__version__
print(pycaret_version)

3.3.2


In [13]:
# Report the pandas release that this kernel imports.
import pandas

pandas_version = pandas.__version__
print(pandas_version)

1.4.4


# Import data

In [17]:
import pandas as pd

# The CSV is addressed by a relative path, so it is resolved against the
# notebook's current working directory.
train_csv_path = 'Comment_Sample_With_Stance_Similarity.csv'
trainSet = pd.read_csv(train_csv_path)

# Print the leading rows of the DataFrame as a quick sanity check of the load.
train_preview = trainSet.head()
print(train_preview)

           Document ID        Posted Date        Submitter  \
0  DEA-2024-0059-32951  2024-07-18T04:00Z    Nathan Fisher   
1   DEA-2024-0059-9342  2024-05-31T04:00Z        Anonymous   
2  DEA-2024-0059-20636  2024-06-18T04:00Z       Lyn Shoots   
3  DEA-2024-0059-22701  2024-06-19T04:00Z           Nate C   
4  DEA-2024-0059-23753  2024-06-21T04:00Z  Carol McFarland   

                                             Comment  \
0  Respectfully, end the Drug War!\n\nTo Whom it ...   
1  I work at a small cannabis business in MI\n\nW...   
2  Official Comment Drug Enforcement Agency ,\n\n...   
3  As a person who has been living with epilepsy ...   
4  Hello. 70 year old retired public schools prin...   

                       Stance  Comment Similarity Score  
0  Supports Beyond Schedule 3                  0.993366  
1  Supports Beyond Schedule 3                  0.131234  
2                    Supports                  0.614912  
3  Supports Beyond Schedule 3                  0.182440  


# Create Class Imbalance Methods List

In [18]:
import re
from pycaret.classification import setup, compare_models, pull

# Reset on every run so a value left over from an earlier execution of this
# cell can never be picked up by the cells below.
imbalance_methods = []
probe_error = None

try:
    # Intentionally pass an invalid fix_imbalance_method: setup() is expected to
    # reject it with a ValueError whose message lists the accepted names after
    # "Choose from:".
    setup(
        data=trainSet,
        target='Stance',
        session_id=123,
        text_features=['Comment'],
        fix_imbalance=True,
        fix_imbalance_method='invalid_value'
    )
except ValueError as e:
    probe_error = e
    # Extract the valid options from the error message
    match = re.search(r"Choose from: (.*)\.", str(e))
    if match:
        imbalance_methods = [
            option.strip()
            for option in match.group(1).split(',')
            if option.strip()
        ]

# Fail here, with a clear message, instead of letting a later cell hit a
# NameError or iterate over nothing when the probe did not behave as expected
# (setup() accepted the value, or the message format changed).
if not imbalance_methods:
    raise RuntimeError(
        "Could not read the fix_imbalance_method options from setup(): "
        f"{probe_error!r}"
    ) from probe_error

print("Available fix_imbalance_method options:")
print(imbalance_methods)


Available fix_imbalance_method options:
['condensednearestneighbour', 'editednearestneighborus', 'repeatededitednearestneighbours', 'allknn', 'instancehardnessthreshold', 'nearmiss', 'neighbourhoodcleaningrule', 'onesidedselection', 'randomundersampler', 'tomeklinks', 'randomoversampler', 'smote', 'smotenc', 'smoten', 'adasyn', 'borderlinesmote', 'kmeanssmote', 'svmsmote', 'smoteenn', 'smotetomek']


# Run compare_models for each imbalance method

In [19]:
# Collect one score grid per imbalance method; they are combined into a single
# DataFrame once, after the loop, instead of re-copying `results` on every pass.
method_results = []

# Iterate over imbalance methods
for method in imbalance_methods:
    try:
        # Initialize PyCaret setup for classification
        setup(data=trainSet,
              target='Stance',
              session_id=123,
              text_features=['Comment'],
              ignore_features=['Document ID','Posted Date','Submitter','Comment Similarity Score'],
              fix_imbalance=True,
              fix_imbalance_method=method,
              preprocess=True)

        # Compare different models; the returned model object is not used here
        compare_models()

        # Fetch the score grid that compare_models just produced
        model_results = pull()

    except Exception as e:
        # Report and move on so one incompatible sampler does not abort the
        # whole sweep. For a TypeError this prints the same text as before.
        print(f"{type(e).__name__} encountered for method {method}: {e}")
        continue

    # A method can finish without producing any score rows; keep such empty
    # grids out of the combined table.
    if model_results.empty:
        print(f"No model results returned for method {method}")
        continue

    # Add the method as a new column
    model_results['method'] = method
    method_results.append(model_results)

# Combine all score grids; fall back to an empty DataFrame if every method failed
if method_results:
    results = pd.concat(method_results, ignore_index=True)
else:
    results = pd.DataFrame()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(475, 7124)"
6,Transformed train set shape,"(175, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7129,0.0,0.7129,0.7268,0.6789,0.4737,0.5113,2.141
ridge,Ridge Classifier,0.7071,0.0,0.7071,0.7296,0.6984,0.498,0.511,1.706
et,Extra Trees Classifier,0.6529,0.7647,0.6529,0.6497,0.639,0.407,0.415,1.725
svm,SVM - Linear Kernel,0.6329,0.0,0.6329,0.6556,0.6321,0.3956,0.4033,1.677
rf,Random Forest Classifier,0.6257,0.7635,0.6257,0.6456,0.6219,0.3837,0.3926,1.695
nb,Naive Bayes,0.6029,0.6734,0.6029,0.5998,0.5926,0.3316,0.3369,1.906
lda,Linear Discriminant Analysis,0.6,0.0,0.6,0.6101,0.5839,0.327,0.3452,1.737
lightgbm,Light Gradient Boosting Machine,0.5771,0.7468,0.5771,0.638,0.5821,0.3311,0.3463,1.806
gbc,Gradient Boosting Classifier,0.5471,0.0,0.5471,0.6166,0.5541,0.2981,0.3158,2.252
dummy,Dummy Classifier,0.5357,0.5,0.5357,0.287,0.3738,0.0,0.0,1.681


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(676, 7124)"
6,Transformed train set shape,"(376, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7271,0.0,0.7271,0.7154,0.6748,0.4707,0.549,1.567
lightgbm,Light Gradient Boosting Machine,0.7214,0.2583,0.7214,0.7061,0.6699,0.4626,0.5357,0.577
ridge,Ridge Classifier,0.7143,0.0,0.7143,0.7115,0.658,0.4417,0.5293,0.301
svm,SVM - Linear Kernel,0.7129,0.0,0.7129,0.6974,0.6606,0.4451,0.5215,0.31
knn,K Neighbors Classifier,0.7114,0.2209,0.7114,0.6926,0.6614,0.4575,0.5167,0.295
rf,Random Forest Classifier,0.71,0.2553,0.71,0.7096,0.6527,0.4324,0.5222,0.316
et,Extra Trees Classifier,0.71,0.2505,0.71,0.7096,0.6527,0.4324,0.5222,0.299
dt,Decision Tree Classifier,0.7071,0.2128,0.7071,0.6745,0.6583,0.4476,0.4993,0.292
lr,Logistic Regression,0.7029,0.0,0.7029,0.7063,0.645,0.4155,0.5082,0.396
nb,Naive Bayes,0.7029,0.2119,0.7029,0.6941,0.646,0.4336,0.5054,0.313


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(676, 7124)"
6,Transformed train set shape,"(376, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7271,0.0,0.7271,0.7154,0.6748,0.4707,0.549,1.576
lightgbm,Light Gradient Boosting Machine,0.7214,0.2583,0.7214,0.7061,0.6699,0.4626,0.5357,0.603
ridge,Ridge Classifier,0.7143,0.0,0.7143,0.7115,0.658,0.4417,0.5293,0.311
svm,SVM - Linear Kernel,0.7129,0.0,0.7129,0.6974,0.6606,0.4451,0.5215,0.316
knn,K Neighbors Classifier,0.7114,0.2209,0.7114,0.6926,0.6614,0.4575,0.5167,0.308
rf,Random Forest Classifier,0.71,0.2553,0.71,0.7096,0.6527,0.4324,0.5222,0.311
et,Extra Trees Classifier,0.71,0.2505,0.71,0.7096,0.6527,0.4324,0.5222,0.306
dt,Decision Tree Classifier,0.7071,0.2128,0.7071,0.6745,0.6583,0.4476,0.4993,0.295
lr,Logistic Regression,0.7029,0.0,0.7029,0.7063,0.645,0.4155,0.5082,0.424
nb,Naive Bayes,0.7029,0.2119,0.7029,0.6941,0.646,0.4336,0.5054,0.329


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(807, 7124)"
6,Transformed train set shape,"(507, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7386,0.8824,0.7386,0.7278,0.7016,0.5114,0.5483,0.921
svm,SVM - Linear Kernel,0.7343,0.0,0.7343,0.7288,0.6909,0.4961,0.5497,0.358
ridge,Ridge Classifier,0.7257,0.0,0.7257,0.7142,0.6725,0.4672,0.5466,0.312
gbc,Gradient Boosting Classifier,0.7214,0.0,0.7214,0.7151,0.6778,0.471,0.5247,2.174
knn,K Neighbors Classifier,0.72,0.7646,0.72,0.7024,0.6717,0.4756,0.5302,0.314
dt,Decision Tree Classifier,0.7171,0.7275,0.7171,0.6897,0.6848,0.4867,0.5092,0.306
rf,Random Forest Classifier,0.7129,0.8585,0.7129,0.7017,0.6578,0.441,0.5208,0.333
lr,Logistic Regression,0.71,0.0,0.71,0.7094,0.6536,0.4312,0.5203,0.401
nb,Naive Bayes,0.7086,0.7023,0.7086,0.7027,0.6588,0.4483,0.5089,0.347
et,Extra Trees Classifier,0.7043,0.8488,0.7043,0.6944,0.6487,0.4244,0.5047,0.341


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(498, 7124)"
6,Transformed train set shape,"(198, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.6443,0.7479,0.6443,0.7425,0.6382,0.4083,0.4346,0.661
lr,Logistic Regression,0.5271,0.0,0.5271,0.8463,0.5868,0.3698,0.4517,0.694
ridge,Ridge Classifier,0.5257,0.0,0.5257,0.8574,0.587,0.3692,0.4546,0.654
svm,SVM - Linear Kernel,0.5186,0.0,0.5186,0.8351,0.5717,0.3638,0.4332,0.664
gbc,Gradient Boosting Classifier,0.4943,0.0,0.4943,0.808,0.5501,0.3395,0.3962,1.208
lda,Linear Discriminant Analysis,0.4914,0.0,0.4914,0.7072,0.5261,0.3081,0.3432,0.701
lightgbm,Light Gradient Boosting Machine,0.4914,0.8023,0.4914,0.804,0.5456,0.3366,0.392,0.762
dt,Decision Tree Classifier,0.4843,0.6983,0.4843,0.7864,0.5423,0.3229,0.3716,0.655
nb,Naive Bayes,0.4814,0.6931,0.4814,0.7886,0.5531,0.3134,0.3592,0.679
rf,Random Forest Classifier,0.4757,0.8009,0.4757,0.873,0.5403,0.3276,0.4319,0.684


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(428, 7124)"
6,Transformed train set shape,"(128, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.5686,0.7057,0.5686,0.5763,0.5577,0.2867,0.2909,0.242
svm,SVM - Linear Kernel,0.4214,0.0,0.4214,0.713,0.4115,0.2565,0.3155,0.234
lr,Logistic Regression,0.3986,0.0,0.3986,0.733,0.4025,0.2337,0.3026,0.269
ridge,Ridge Classifier,0.3986,0.0,0.3986,0.7348,0.4025,0.2356,0.3067,0.247
dt,Decision Tree Classifier,0.3971,0.6269,0.3971,0.6881,0.3842,0.223,0.2732,0.245
nb,Naive Bayes,0.3929,0.5883,0.3929,0.5415,0.3694,0.1724,0.2031,0.265
lightgbm,Light Gradient Boosting Machine,0.3929,0.6872,0.3929,0.7113,0.3868,0.232,0.2878,0.318
gbc,Gradient Boosting Classifier,0.3914,0.0,0.3914,0.6852,0.3863,0.2223,0.275,0.723
et,Extra Trees Classifier,0.3743,0.6608,0.3743,0.7205,0.3745,0.2128,0.2724,0.269
rf,Random Forest Classifier,0.3729,0.6914,0.3729,0.7247,0.3735,0.2113,0.2759,0.269


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(802, 7124)"
6,Transformed train set shape,"(502, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7286,0.8633,0.7286,0.7168,0.6843,0.4853,0.5413,0.944
svm,SVM - Linear Kernel,0.7257,0.0,0.7257,0.7043,0.6735,0.4698,0.5422,0.393
ridge,Ridge Classifier,0.7214,0.0,0.7214,0.7124,0.6668,0.4576,0.5395,0.361
rf,Random Forest Classifier,0.7157,0.8432,0.7157,0.7121,0.6596,0.4448,0.5317,0.371
gbc,Gradient Boosting Classifier,0.7157,0.0,0.7157,0.7039,0.6681,0.4573,0.5165,2.169
knn,K Neighbors Classifier,0.7143,0.7491,0.7143,0.6921,0.6655,0.465,0.5202,0.337
et,Extra Trees Classifier,0.7129,0.8376,0.7129,0.7084,0.6565,0.4392,0.5251,0.375
lr,Logistic Regression,0.7086,0.0,0.7086,0.7088,0.6519,0.4281,0.5179,0.426
nb,Naive Bayes,0.7071,0.699,0.7071,0.7106,0.6531,0.4422,0.5125,0.373
dt,Decision Tree Classifier,0.7014,0.6997,0.7014,0.6766,0.6578,0.4398,0.4834,0.342


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(822, 7124)"
6,Transformed train set shape,"(522, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.6971,0.0,0.6971,0.7141,0.6818,0.4906,0.5083,0.428
ridge,Ridge Classifier,0.6871,0.0,0.6871,0.6987,0.6673,0.4606,0.4796,0.329
gbc,Gradient Boosting Classifier,0.6871,0.0,0.6871,0.7027,0.6758,0.4704,0.4836,2.869
lightgbm,Light Gradient Boosting Machine,0.6871,0.8436,0.6871,0.7073,0.6777,0.4748,0.4897,1.002
rf,Random Forest Classifier,0.6757,0.8326,0.6757,0.6576,0.6407,0.4156,0.4503,0.362
lr,Logistic Regression,0.66,0.0,0.66,0.653,0.6267,0.3954,0.4239,0.473
et,Extra Trees Classifier,0.66,0.8174,0.66,0.6479,0.6256,0.3904,0.4276,0.354
dt,Decision Tree Classifier,0.6457,0.721,0.6457,0.6735,0.6442,0.4363,0.4462,0.294
knn,K Neighbors Classifier,0.6371,0.7734,0.6371,0.6076,0.5985,0.3663,0.3945,0.313
nb,Naive Bayes,0.6171,0.668,0.6171,0.6034,0.6022,0.3445,0.348,0.349


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(428, 7124)"
6,Transformed train set shape,"(128, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.6757,0.0,0.6757,0.7888,0.7042,0.5138,0.5388,0.197
svm,SVM - Linear Kernel,0.6571,0.0,0.6571,0.7276,0.6711,0.4818,0.4956,0.205
lr,Logistic Regression,0.6543,0.0,0.6543,0.7872,0.6859,0.4835,0.5117,0.234
gbc,Gradient Boosting Classifier,0.65,0.0,0.65,0.7241,0.6683,0.4645,0.4784,0.569
lightgbm,Light Gradient Boosting Machine,0.6357,0.8365,0.6357,0.7075,0.6526,0.4482,0.4614,0.252
nb,Naive Bayes,0.6286,0.7395,0.6286,0.7157,0.6497,0.4323,0.4451,0.218
rf,Random Forest Classifier,0.6271,0.8559,0.6271,0.7836,0.6659,0.4567,0.4929,0.226
knn,K Neighbors Classifier,0.6186,0.7805,0.6186,0.7175,0.6271,0.3933,0.4159,0.207
lda,Linear Discriminant Analysis,0.6057,0.0,0.6057,0.7076,0.6324,0.3963,0.4139,0.218
et,Extra Trees Classifier,0.6057,0.8634,0.6057,0.7765,0.6474,0.4333,0.4729,0.218


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(975, 7124)"
6,Transformed train set shape,"(675, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7814,0.9114,0.7814,0.7766,0.7656,0.6118,0.6233,1.343
svm,SVM - Linear Kernel,0.7686,0.0,0.7686,0.7599,0.7435,0.5846,0.6085,0.465
ridge,Ridge Classifier,0.7657,0.0,0.7657,0.7755,0.7353,0.5685,0.5998,0.326
gbc,Gradient Boosting Classifier,0.7557,0.0,0.7557,0.7496,0.7351,0.5636,0.5776,2.952
lr,Logistic Regression,0.73,0.0,0.73,0.7054,0.6827,0.4848,0.5387,0.448
rf,Random Forest Classifier,0.7286,0.8932,0.7286,0.7119,0.6818,0.4808,0.5415,0.361
dt,Decision Tree Classifier,0.7229,0.7689,0.7229,0.7266,0.7215,0.5369,0.5389,0.317
knn,K Neighbors Classifier,0.7157,0.8096,0.7157,0.6845,0.6691,0.4721,0.5161,0.3
et,Extra Trees Classifier,0.7157,0.8917,0.7157,0.6892,0.6643,0.4528,0.5168,0.374
ada,Ada Boost Classifier,0.6743,0.0,0.6743,0.6638,0.6462,0.4158,0.4417,1.097


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1800, 7124)"
6,Transformed train set shape,"(1500, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7886,0.9073,0.7886,0.7845,0.7781,0.6303,0.6368,2.866
svm,SVM - Linear Kernel,0.78,0.0,0.78,0.7756,0.7673,0.6196,0.6293,0.748
ridge,Ridge Classifier,0.7686,0.0,0.7686,0.7749,0.764,0.6052,0.6119,0.477
gbc,Gradient Boosting Classifier,0.7686,0.0,0.7686,0.7804,0.7702,0.6157,0.6182,5.723
lr,Logistic Regression,0.7671,0.0,0.7671,0.7954,0.7704,0.6137,0.6211,0.749
rf,Random Forest Classifier,0.7514,0.8942,0.7514,0.7547,0.7339,0.5542,0.5748,0.498
dt,Decision Tree Classifier,0.7229,0.7747,0.7229,0.7334,0.7238,0.5408,0.544,0.37
et,Extra Trees Classifier,0.72,0.8924,0.72,0.7043,0.6727,0.465,0.5237,0.563
knn,K Neighbors Classifier,0.6886,0.7929,0.6886,0.6888,0.6784,0.473,0.4784,0.381
nb,Naive Bayes,0.6086,0.6538,0.6086,0.5824,0.5833,0.3107,0.3166,0.451


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1800, 7124)"
6,Transformed train set shape,"(1500, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.7814,0.0,0.7814,0.7761,0.767,0.6201,0.6288,0.784
gbc,Gradient Boosting Classifier,0.7743,0.0,0.7743,0.7811,0.7725,0.6177,0.6221,7.369
lr,Logistic Regression,0.7671,0.0,0.7671,0.7987,0.77,0.6126,0.6221,0.814
ridge,Ridge Classifier,0.7657,0.0,0.7657,0.7776,0.7625,0.6022,0.6101,0.558
lightgbm,Light Gradient Boosting Machine,0.7643,0.9097,0.7643,0.7583,0.7513,0.5866,0.5945,5.271
rf,Random Forest Classifier,0.7486,0.8841,0.7486,0.7531,0.7226,0.5402,0.5705,0.549
et,Extra Trees Classifier,0.7443,0.8829,0.7443,0.7387,0.7069,0.521,0.5644,0.629
dt,Decision Tree Classifier,0.6829,0.7401,0.6829,0.6915,0.6827,0.4736,0.4772,0.45
nb,Naive Bayes,0.6114,0.6577,0.6114,0.5915,0.5873,0.3153,0.3212,0.531
ada,Ada Boost Classifier,0.5971,0.0,0.5971,0.635,0.5951,0.3399,0.3508,2.203


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

TypeError encountered for method smotenc: __init__() missing 1 required positional argument: 'categorical_features'


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1800, 7124)"
6,Transformed train set shape,"(1500, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7786,0.0,0.7786,0.788,0.7669,0.6114,0.6279,13.425
svm,SVM - Linear Kernel,0.7714,0.0,0.7714,0.7907,0.7691,0.6143,0.6234,13.416
gbc,Gradient Boosting Classifier,0.7686,0.0,0.7686,0.7605,0.7588,0.6043,0.608,18.699
lightgbm,Light Gradient Boosting Machine,0.7686,0.9071,0.7686,0.7723,0.7613,0.605,0.61,14.412
lr,Logistic Regression,0.7671,0.0,0.7671,0.7765,0.7591,0.5951,0.6073,13.362
et,Extra Trees Classifier,0.7229,0.8784,0.7229,0.7242,0.6832,0.4779,0.522,13.158
rf,Random Forest Classifier,0.71,0.8833,0.71,0.6978,0.6752,0.4642,0.4919,13.409
dt,Decision Tree Classifier,0.6957,0.74,0.6957,0.6864,0.6864,0.4812,0.4861,13.013
ada,Ada Boost Classifier,0.6229,0.0,0.6229,0.6218,0.6151,0.3655,0.372,15.022
nb,Naive Bayes,0.6143,0.6551,0.6143,0.5901,0.5893,0.3166,0.3228,13.51


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1813, 7124)"
6,Transformed train set shape,"(1513, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7757,0.9115,0.7757,0.7644,0.7624,0.6071,0.6143,5.724
lr,Logistic Regression,0.7743,0.0,0.7743,0.8012,0.7797,0.63,0.6361,0.922
svm,SVM - Linear Kernel,0.7657,0.0,0.7657,0.7703,0.7557,0.594,0.6026,0.886
ridge,Ridge Classifier,0.7629,0.0,0.7629,0.7707,0.7601,0.6019,0.6074,0.659
gbc,Gradient Boosting Classifier,0.76,0.0,0.76,0.7743,0.7611,0.5988,0.6039,7.776
et,Extra Trees Classifier,0.7471,0.8783,0.7471,0.7443,0.7124,0.5303,0.564,0.722
rf,Random Forest Classifier,0.7371,0.8807,0.7371,0.7318,0.7116,0.5247,0.5425,0.633
dt,Decision Tree Classifier,0.69,0.75,0.69,0.7064,0.6941,0.4917,0.4946,0.513
nb,Naive Bayes,0.61,0.6562,0.61,0.5874,0.586,0.3149,0.3206,0.704
knn,K Neighbors Classifier,0.5629,0.7684,0.5629,0.7339,0.5739,0.3902,0.4292,0.507


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1800, 7124)"
6,Transformed train set shape,"(1500, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7857,0.9076,0.7857,0.7846,0.7751,0.6266,0.634,4.909
lr,Logistic Regression,0.7829,0.0,0.7829,0.8002,0.7851,0.6397,0.6445,0.915
svm,SVM - Linear Kernel,0.7814,0.0,0.7814,0.7861,0.7666,0.6154,0.6285,0.897
ridge,Ridge Classifier,0.7714,0.0,0.7714,0.7761,0.7666,0.6116,0.6175,0.627
gbc,Gradient Boosting Classifier,0.77,0.0,0.77,0.779,0.7714,0.6181,0.62,7.37
rf,Random Forest Classifier,0.7414,0.8801,0.7414,0.7299,0.7208,0.5381,0.551,0.626
et,Extra Trees Classifier,0.7286,0.8707,0.7286,0.7198,0.6952,0.4974,0.5281,0.727
dt,Decision Tree Classifier,0.7029,0.754,0.7029,0.7135,0.7028,0.5048,0.5087,0.528
nb,Naive Bayes,0.6143,0.6584,0.6143,0.5923,0.5903,0.3206,0.3265,0.6
knn,K Neighbors Classifier,0.5543,0.7559,0.5543,0.7205,0.5609,0.3671,0.4093,0.529


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1801, 7124)"
6,Transformed train set shape,"(1501, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1593, 7124)"
6,Transformed train set shape,"(1293, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7786,0.0,0.7786,0.782,0.7682,0.6139,0.6243,6.912
lr,Logistic Regression,0.7743,0.0,0.7743,0.7838,0.76,0.5991,0.6148,6.335
gbc,Gradient Boosting Classifier,0.7686,0.0,0.7686,0.7647,0.7607,0.6011,0.6056,12.27
lightgbm,Light Gradient Boosting Machine,0.7671,0.9108,0.7671,0.7684,0.7531,0.5922,0.6004,10.211
svm,SVM - Linear Kernel,0.7643,0.0,0.7643,0.7717,0.7559,0.595,0.6034,6.164
et,Extra Trees Classifier,0.7329,0.8843,0.7329,0.6999,0.6895,0.4953,0.5412,5.989
rf,Random Forest Classifier,0.7314,0.89,0.7314,0.7158,0.6941,0.4995,0.5341,6.49
dt,Decision Tree Classifier,0.7043,0.7572,0.7043,0.7034,0.6987,0.5057,0.5089,6.495
nb,Naive Bayes,0.6114,0.6565,0.6114,0.586,0.5858,0.3152,0.3212,6.189
ada,Ada Boost Classifier,0.5914,0.0,0.5914,0.6129,0.5684,0.3125,0.3362,8.12


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1458, 7124)"
6,Transformed train set shape,"(1158, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.5957,0.0,0.5957,0.7603,0.6059,0.4214,0.4665,0.846
lightgbm,Light Gradient Boosting Machine,0.5886,0.8594,0.5886,0.7557,0.6008,0.4173,0.4618,3.556
gbc,Gradient Boosting Classifier,0.5829,0.0,0.5829,0.7595,0.5942,0.4135,0.4623,4.922
ridge,Ridge Classifier,0.5743,0.0,0.5743,0.7738,0.6014,0.4077,0.4561,0.725
dt,Decision Tree Classifier,0.5643,0.7111,0.5643,0.7174,0.5769,0.3814,0.4193,0.656
knn,K Neighbors Classifier,0.5443,0.742,0.5443,0.731,0.5589,0.3551,0.398,0.666
rf,Random Forest Classifier,0.5429,0.8365,0.5429,0.7513,0.5587,0.3714,0.4238,0.726
lr,Logistic Regression,0.5357,0.0,0.5357,0.7785,0.5701,0.3711,0.4294,0.942
et,Extra Trees Classifier,0.5314,0.8405,0.5314,0.7529,0.5541,0.3617,0.416,0.79
nb,Naive Bayes,0.5271,0.6524,0.5271,0.6077,0.5316,0.2815,0.3025,0.724


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Stance
2,Target type,Multiclass
3,Target mapping,"Opposed: 0, Supports: 1, Supports Beyond Schedule 3: 2, Unclear: 3"
4,Original data shape,"(1000, 6)"
5,Transformed data shape,"(1796, 7124)"
6,Transformed train set shape,"(1496, 7124)"
7,Transformed test set shape,"(300, 7124)"
8,Ignore features,4
9,Text features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7757,0.9097,0.7757,0.7695,0.7641,0.6081,0.6149,5.554
ridge,Ridge Classifier,0.77,0.0,0.77,0.7836,0.7672,0.609,0.6173,0.721
lr,Logistic Regression,0.7671,0.0,0.7671,0.8004,0.7709,0.6133,0.6229,1.021
svm,SVM - Linear Kernel,0.7671,0.0,0.7671,0.772,0.7578,0.5996,0.6076,0.975
gbc,Gradient Boosting Classifier,0.7643,0.0,0.7643,0.7788,0.7646,0.6037,0.6096,7.622
rf,Random Forest Classifier,0.7414,0.8804,0.7414,0.7417,0.7151,0.5267,0.5563,0.725
et,Extra Trees Classifier,0.7386,0.8818,0.7386,0.74,0.7008,0.5101,0.5552,0.82
dt,Decision Tree Classifier,0.6829,0.7406,0.6829,0.6922,0.684,0.4753,0.4776,0.633
nb,Naive Bayes,0.6114,0.6571,0.6114,0.5915,0.5873,0.3153,0.3212,0.705
ada,Ada Boost Classifier,0.5971,0.0,0.5971,0.635,0.5951,0.3399,0.3508,2.394


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

# Identify the top 10 accuracy scores

In [20]:
# Keep the ten rows of the combined score table with the highest Accuracy.
top_10_accuracy = results.nlargest(n=10, columns='Accuracy')

print(top_10_accuracy)

                               Model  Accuracy     AUC  Recall   Prec.  \
140  Light Gradient Boosting Machine    0.7886  0.9073  0.7886  0.7845   
196  Light Gradient Boosting Machine    0.7857  0.9076  0.7857  0.7846   
197              Logistic Regression    0.7829  0.0000  0.7829  0.8002   
126  Light Gradient Boosting Machine    0.7814  0.9114  0.7814  0.7766   
154              SVM - Linear Kernel    0.7814  0.0000  0.7814  0.7761   
198              SVM - Linear Kernel    0.7814  0.0000  0.7814  0.7861   
141              SVM - Linear Kernel    0.7800  0.0000  0.7800  0.7756   
168                 Ridge Classifier    0.7786  0.0000  0.7786  0.7880   
210                 Ridge Classifier    0.7786  0.0000  0.7786  0.7820   
182  Light Gradient Boosting Machine    0.7757  0.9115  0.7757  0.7644   

         F1   Kappa     MCC  TT (Sec)             method  
140  0.7781  0.6303  0.6368     2.866  randomoversampler  
196  0.7751  0.6266  0.6340     4.909    borderlinesmote  
197  0.7

# Identify the top 10 recall scores 

In [21]:
# Keep the ten rows of the combined score table with the highest Recall.
# NOTE(review): in every score row shown in this notebook the Recall value
# equals the Accuracy value, so this ranking matches the accuracy ranking.
top_10_recall = results.nlargest(n=10, columns='Recall')

print(top_10_recall)

                               Model  Accuracy     AUC  Recall   Prec.  \
140  Light Gradient Boosting Machine    0.7886  0.9073  0.7886  0.7845   
196  Light Gradient Boosting Machine    0.7857  0.9076  0.7857  0.7846   
197              Logistic Regression    0.7829  0.0000  0.7829  0.8002   
126  Light Gradient Boosting Machine    0.7814  0.9114  0.7814  0.7766   
154              SVM - Linear Kernel    0.7814  0.0000  0.7814  0.7761   
198              SVM - Linear Kernel    0.7814  0.0000  0.7814  0.7861   
141              SVM - Linear Kernel    0.7800  0.0000  0.7800  0.7756   
168                 Ridge Classifier    0.7786  0.0000  0.7786  0.7880   
210                 Ridge Classifier    0.7786  0.0000  0.7786  0.7820   
182  Light Gradient Boosting Machine    0.7757  0.9115  0.7757  0.7644   

         F1   Kappa     MCC  TT (Sec)             method  
140  0.7781  0.6303  0.6368     2.866  randomoversampler  
196  0.7751  0.6266  0.6340     4.909    borderlinesmote  
197  0.7

# Identify the top 10 Precision scores 

In [22]:
# Keep the ten rows of the combined score table with the highest Prec. (precision).
top_10_prec = results.nlargest(n=10, columns='Prec.')

print(top_10_prec)

                               Model  Accuracy     AUC  Recall   Prec.  \
65          Random Forest Classifier    0.4757  0.8009  0.4757  0.8730   
66            Extra Trees Classifier    0.4714  0.7901  0.4714  0.8675   
58                  Ridge Classifier    0.5257  0.0000  0.5257  0.8574   
57               Logistic Regression    0.5271  0.0000  0.5271  0.8463   
59               SVM - Linear Kernel    0.5186  0.0000  0.5186  0.8351   
60      Gradient Boosting Classifier    0.4943  0.0000  0.4943  0.8080   
62   Light Gradient Boosting Machine    0.4914  0.8023  0.4914  0.8040   
183              Logistic Regression    0.7743  0.0000  0.7743  0.8012   
240              Logistic Regression    0.7671  0.0000  0.7671  0.8004   
197              Logistic Regression    0.7829  0.0000  0.7829  0.8002   

         F1   Kappa     MCC  TT (Sec)                     method  
65   0.5403  0.3276  0.4319     0.684  instancehardnessthreshold  
66   0.5339  0.3240  0.4273     0.669  instancehard

# Identify the top 10 F1 scores

In [23]:
# Keep the ten rows of the combined score table with the highest F1.
top_10_f1 = results.nlargest(n=10, columns='F1')

print(top_10_f1)

                               Model  Accuracy     AUC  Recall   Prec.  \
197              Logistic Regression    0.7829  0.0000  0.7829  0.8002   
183              Logistic Regression    0.7743  0.0000  0.7743  0.8012   
140  Light Gradient Boosting Machine    0.7886  0.9073  0.7886  0.7845   
196  Light Gradient Boosting Machine    0.7857  0.9076  0.7857  0.7846   
155     Gradient Boosting Classifier    0.7743  0.0000  0.7743  0.7811   
200     Gradient Boosting Classifier    0.7700  0.0000  0.7700  0.7790   
240              Logistic Regression    0.7671  0.0000  0.7671  0.8004   
144              Logistic Regression    0.7671  0.0000  0.7671  0.7954   
143     Gradient Boosting Classifier    0.7686  0.0000  0.7686  0.7804   
156              Logistic Regression    0.7671  0.0000  0.7671  0.7987   

         F1   Kappa     MCC  TT (Sec)             method  
197  0.7851  0.6397  0.6445     0.915    borderlinesmote  
183  0.7797  0.6300  0.6361     0.922             adasyn  
140  0.7