In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.utils import class_weight
import pickle
import cv2
import joblib
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, auc, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [2]:
train = pd.read_json("./train.jsonl",lines=True)
test = pd.read_json("./test.jsonl",lines=True)
valid = pd.read_json("./dev.jsonl",lines=True)

In [3]:
y_train = np.array(train['label'])
y_train = np.delete(y_train, [9, 327, 1753, 6471], axis=0)
x_train = np.load('train_textfeat_glove.npy')
print(x_train.shape, y_train.shape)

(8496, 300) (8496,)


In [4]:
x_test = np.load('valid_textfeat_glove.npy')
y_test = np.array(valid['label'])
y_test = np.delete(y_test, [198], axis=0)
print(x_test.shape, y_test.shape)

(499, 300) (499,)


In [5]:
def LR(xtrain,xtest,ytrain,ytest):
    log_model = LogisticRegression()
    log_model.fit(xtrain, ytrain)
    y_pred_log = log_model.predict(xtest)
    print(confusion_matrix(ytest, y_pred_log))
    print(classification_report(ytest,y_pred_log))
    #print(roc_auc_score(ytest,y_pred_log))
    fpr, tpr, thresholds = roc_curve(ytest,y_pred_log)
    print(auc(fpr, tpr))
def MLP(xtrain,xtest,ytrain,ytest):
    log_model = xgb.XGBClassifier()
    log_model.fit(xtrain, ytrain)
    y_pred_log = log_model.predict(xtest)
    print(confusion_matrix(ytest, y_pred_log))
    print(classification_report(ytest,y_pred_log))
def LGBM(xtrain,xtest,ytrain,ytest):
    lgbm_model = lgb.LGBMClassifier(n_estimators=1000,max_depth=5)
    lgbm_model.fit(xtrain, ytrain)
    y_pred_lgbm = lgbm_model.predict(xtest)
    print(confusion_matrix(ytest, y_pred_lgbm))
    print(classification_report(ytest,y_pred_lgbm))
    #print(roc_auc_score(ytest,y_pred_lgbm))
    fpr, tpr, thresholds = roc_curve(ytest,y_pred_lgbm)
    print(auc(fpr, tpr))
    #print(auc(ytest,y_pred_lgbm))

In [6]:
print('Logistic Regression')
LR(x_train,x_test,y_train,y_test)
print('LightGBM')
LGBM(x_train,x_test,y_train,y_test)

Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[218  32]
 [206  43]]
              precision    recall  f1-score   support

           0       0.51      0.87      0.65       250
           1       0.57      0.17      0.27       249

    accuracy                           0.52       499
   macro avg       0.54      0.52      0.46       499
weighted avg       0.54      0.52      0.46       499

0.5223453815261044
LightGBM
[[230  20]
 [211  38]]
              precision    recall  f1-score   support

           0       0.52      0.92      0.67       250
           1       0.66      0.15      0.25       249

    accuracy                           0.54       499
   macro avg       0.59      0.54      0.46       499
weighted avg       0.59      0.54      0.46       499

0.5363052208835342


# Over-Sampling

In [7]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(x_train, y_train)
print('Logistic Regression')
LR(X_ros,x_test,y_ros,y_test)
print('LightGBM')
LGBM(X_ros,x_test,y_ros,y_test)

Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[160  90]
 [133 116]]
              precision    recall  f1-score   support

           0       0.55      0.64      0.59       250
           1       0.56      0.47      0.51       249

    accuracy                           0.55       499
   macro avg       0.55      0.55      0.55       499
weighted avg       0.55      0.55      0.55       499

0.5529317269076306
LightGBM
[[219  31]
 [198  51]]
              precision    recall  f1-score   support

           0       0.53      0.88      0.66       250
           1       0.62      0.20      0.31       249

    accuracy                           0.54       499
   macro avg       0.57      0.54      0.48       499
weighted avg       0.57      0.54      0.48       499

0.5404096385542169


# Under Sampling

In [8]:
print('Random Over-Sampling')
ros = RandomUnderSampler()
X_ros, y_ros = ros.fit_resample(x_train, y_train)
print('Logistic Regression')
LR(X_ros,x_test,y_ros,y_test)
print('LightGBM')
LGBM(X_ros,x_test,y_ros,y_test)

Random Over-Sampling
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[163  87]
 [135 114]]
              precision    recall  f1-score   support

           0       0.55      0.65      0.59       250
           1       0.57      0.46      0.51       249

    accuracy                           0.56       499
   macro avg       0.56      0.55      0.55       499
weighted avg       0.56      0.56      0.55       499

0.5549156626506024
LightGBM
[[182  68]
 [154  95]]
              precision    recall  f1-score   support

           0       0.54      0.73      0.62       250
           1       0.58      0.38      0.46       249

    accuracy                           0.56       499
   macro avg       0.56      0.55      0.54       499
weighted avg       0.56      0.56      0.54       499

0.5547630522088354


# Text Features Results

In [9]:
x_train = np.load('train_lxmert_language_avg.npy')
x_test = np.load('dev_lxmert_language_avg.npy')
print(x_train.shape, x_test.shape)

(8496, 768) (499, 768)


# No Sampling

In [10]:
print('Logistic Regression')
LR(x_train,x_test,y_train,y_test)
print('LightGBM')
LGBM(x_train,x_test,y_train,y_test)

Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[208  42]
 [180  69]]
              precision    recall  f1-score   support

           0       0.54      0.83      0.65       250
           1       0.62      0.28      0.38       249

    accuracy                           0.56       499
   macro avg       0.58      0.55      0.52       499
weighted avg       0.58      0.56      0.52       499

0.5545542168674699
LightGBM
[[232  18]
 [204  45]]
              precision    recall  f1-score   support

           0       0.53      0.93      0.68       250
           1       0.71      0.18      0.29       249

    accuracy                           0.56       499
   macro avg       0.62      0.55      0.48       499
weighted avg       0.62      0.56      0.48       499

0.5543614457831326


# Over Sampling

In [11]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(x_train, y_train)
print('Logistic Regression')
LR(X_ros,x_test,y_ros,y_test)
print('LightGBM')
LGBM(X_ros,x_test,y_ros,y_test)

Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[157  93]
 [147 102]]
              precision    recall  f1-score   support

           0       0.52      0.63      0.57       250
           1       0.52      0.41      0.46       249

    accuracy                           0.52       499
   macro avg       0.52      0.52      0.51       499
weighted avg       0.52      0.52      0.51       499

0.5188192771084337
LightGBM
[[219  31]
 [203  46]]
              precision    recall  f1-score   support

           0       0.52      0.88      0.65       250
           1       0.60      0.18      0.28       249

    accuracy                           0.53       499
   macro avg       0.56      0.53      0.47       499
weighted avg       0.56      0.53      0.47       499

0.5303694779116466


# Under Sampling

In [12]:
print('Random Over-Sampling')
ros = RandomUnderSampler()
X_ros, y_ros = ros.fit_resample(x_train, y_train)
print('Logistic Regression')
LR(X_ros,x_test,y_ros,y_test)
print('LightGBM')
LGBM(X_ros,x_test,y_ros,y_test)

Random Over-Sampling
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[141 109]
 [133 116]]
              precision    recall  f1-score   support

           0       0.51      0.56      0.54       250
           1       0.52      0.47      0.49       249

    accuracy                           0.52       499
   macro avg       0.52      0.51      0.51       499
weighted avg       0.52      0.52      0.51       499

0.5149317269076306
LightGBM
[[155  95]
 [140 109]]
              precision    recall  f1-score   support

           0       0.53      0.62      0.57       250
           1       0.53      0.44      0.48       249

    accuracy                           0.53       499
   macro avg       0.53      0.53      0.53       499
weighted avg       0.53      0.53      0.53       499

0.5288755020080321


# Combined Image + Text dataset

In [44]:
x1 = np.load('train_img_feat_clip.npy')
x2 = np.load('train_text_feat_clip.npy')
x_train = np.concatenate([x1, x2], axis=1)
print(x_train.shape)

(8496, 1024)


In [45]:
x1 = np.load('valid_img_feat_clip.npy')
x2 = np.load('valid_text_feat_clip.npy')
x_test = np.concatenate([x1, x2], axis=1)
print(x_test.shape)

(500, 1024)


In [13]:
x_train = np.load('train_lxmert_pool.npy')
x_test = np.load('dev_lxmert_pool.npy')
print(x_train.shape, x_test.shape)

(8496, 768) (499, 768)


# No Sampling

In [14]:
print('Logistic Regression')
LR(x_train,x_test,y_train,y_test)
print('LightGBM')
LGBM(x_train,x_test,y_train,y_test)

Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[202  48]
 [188  61]]
              precision    recall  f1-score   support

           0       0.52      0.81      0.63       250
           1       0.56      0.24      0.34       249

    accuracy                           0.53       499
   macro avg       0.54      0.53      0.49       499
weighted avg       0.54      0.53      0.49       499

0.5264899598393574
LightGBM
[[215  35]
 [205  44]]
              precision    recall  f1-score   support

           0       0.51      0.86      0.64       250
           1       0.56      0.18      0.27       249

    accuracy                           0.52       499
   macro avg       0.53      0.52      0.46       499
weighted avg       0.53      0.52      0.46       499

0.5183534136546184


# Over-Sampling

In [15]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(x_train, y_train)
print('Logistic Regression')
LR(X_ros,x_test,y_ros,y_test)
print('LightGBM')
LGBM(X_ros,x_test,y_ros,y_test)

Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[157  93]
 [132 117]]
              precision    recall  f1-score   support

           0       0.54      0.63      0.58       250
           1       0.56      0.47      0.51       249

    accuracy                           0.55       499
   macro avg       0.55      0.55      0.55       499
weighted avg       0.55      0.55      0.55       499

0.5489397590361447
LightGBM
[[214  36]
 [192  57]]
              precision    recall  f1-score   support

           0       0.53      0.86      0.65       250
           1       0.61      0.23      0.33       249

    accuracy                           0.54       499
   macro avg       0.57      0.54      0.49       499
weighted avg       0.57      0.54      0.49       499

0.5424578313253011


# Under-Sampling

In [16]:
print('Random Over-Sampling')
ros = RandomUnderSampler()
X_ros, y_ros = ros.fit_resample(x_train, y_train)
print('Logistic Regression')
LR(X_ros,x_test,y_ros,y_test)
print('LightGBM')
LGBM(X_ros,x_test,y_ros,y_test)

Random Over-Sampling
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[146 104]
 [126 123]]
              precision    recall  f1-score   support

           0       0.54      0.58      0.56       250
           1       0.54      0.49      0.52       249

    accuracy                           0.54       499
   macro avg       0.54      0.54      0.54       499
weighted avg       0.54      0.54      0.54       499

0.5389879518072289
LightGBM
[[139 111]
 [118 131]]
              precision    recall  f1-score   support

           0       0.54      0.56      0.55       250
           1       0.54      0.53      0.53       249

    accuracy                           0.54       499
   macro avg       0.54      0.54      0.54       499
weighted avg       0.54      0.54      0.54       499

0.5410522088353413


In [17]:
print(np.sum(y_train==1), np.sum(y_train==0))

3050 5446
