# LightGBM Model

## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

## Load and Read the Data

In [2]:
# Load and shuffle the datasets
# Don't shuffle kaggle data
data_d1 = pd.read_json('../data/domain1_train.json', lines=True).sample(frac=1, ignore_index=True)
data_d2 = pd.read_json('../data/domain2_train.json', lines=True).sample(frac=1, ignore_index=True)
data_kaggle = pd.read_json('../data/test_set.json', lines=True)

In [3]:
data_d1

Unnamed: 0,text,label
0,"[5, 249, 6, 332, 9, 308, 0, 0, 2, 98, 3, 153, ...",1
1,"[22, 5, 624, 3, 2, 48, 6, 213, 278, 12, 2, 104...",0
2,"[382, 0, 0, 0, 3, 49, 1357, 2, 1036, 7, 157, 5...",1
3,"[2330, 2, 189, 48, 78, 28, 3688, 217, 30, 3031...",0
4,"[5, 1448, 6, 0, 666, 43, 325, 202, 2, 1152, 6,...",1
...,...,...
19495,"[29, 2, 224, 6, 2, 48, 6, 213, 17, 1163, 0, 22...",0
19496,"[285, 1, 12, 5, 167, 789, 19, 5, 847, 3, 49, 1...",0
19497,"[0, 1498, 548, 1, 0, 897, 2, 488, 16, 407, 1, ...",0
19498,"[2, 1439, 32, 44, 9, 2, 2700, 4460, 3, 0, 2613...",1


In [4]:
data_d1['label'].value_counts()

label
1    9750
0    9750
Name: count, dtype: int64

In [5]:
data_d2

Unnamed: 0,text,label,model
0,"[20, 38, 5, 0, 540, 9, 5, 339, 4968, 1, 20, 26...",0,6.0
1,"[466, 0, 4897, 35, 5, 2446, 930, 1028, 1, 27, ...",0,6.0
2,"[194, 66, 20, 173, 157, 4584, 38, 2782, 74, 4,...",0,3.0
3,"[2, 98, 2096, 21, 258, 4, 1013, 1, 21, 258, 78...",1,
4,"[307, 42, 26, 120, 188, 0, 1, 11, 0, 0, 7, 0, ...",0,5.0
...,...,...,...
14895,"[1446, 4, 2966, 2, 98, 49, 547, 453, 198, 98, ...",0,1.0
14896,"[1122, 5, 250, 61, 0, 0, 1, 14, 162, 14, 158, ...",0,3.0
14897,"[180, 69, 45, 6, 79, 4772, 4456, 1928, 3, 5, 2...",0,2.0
14898,"[30, 227, 3, 1398, 324, 64, 0, 18, 21, 84, 123...",0,3.0


In [6]:
data_d2['label'].value_counts()

label
0    12750
1     2150
Name: count, dtype: int64

It can be seen that data from domain 2 is heavily imbalanced and needs to be balanced to build a good model.

In [7]:
data_kaggle

Unnamed: 0,id,text
0,0,"[59, 2, 3434, 1013, 823, 2, 887, 6, 2375, 0, 3..."
1,1,"[2, 785, 6, 5, 0, 317, 17, 5, 2970, 3654, 858,..."
2,2,"[3306, 4, 2, 2288, 6, 963, 94]"
3,3,"[0, 2, 1021, 38, 126, 1, 907, 1, 761, 1, 3357,..."
4,4,"[30, 488, 197, 18, 4, 31, 420, 18, 2, 3577, 6,..."
...,...,...
995,995,"[45, 88, 79, 503, 501, 134, 35, 147, 301, 0, 1..."
996,996,"[0, 0, 8, 15, 0, 810, 3, 2337, 9, 4653, 3, 41,..."
997,997,"[2, 495, 752, 17, 128, 9, 2, 3361, 6, 458, 131..."
998,998,"[2, 4433, 299, 307, 12, 100, 0, 446, 32, 44, 0..."


## Balance the Dataset

We have seen that domain 2's data is heavily imbalanced and needs to be balanced. For this I will be using custom random under-sampling.

In [8]:
# Separate into human and machine data
d2_humans, d2_machines = data_d2[data_d2['label'] == 1], data_d2[data_d2['label'] == 0]

In [9]:
d2_humans

Unnamed: 0,text,label,model
3,"[2, 98, 2096, 21, 258, 4, 1013, 1, 21, 258, 78...",1,
10,"[72, 79, 216, 3, 20, 1410, 9, 5, 302, 0, 228, ...",1,
12,"[2, 48, 678, 10, 272, 16, 585, 30, 431, 1, 11,...",1,
19,"[0, 47, 253, 234, 511, 486, 557, 12, 572, 3451...",1,
20,"[10, 3666, 36, 172, 1, 27, 10, 2397, 2, 0, 0, ...",1,
...,...,...,...
14874,"[10, 18, 36, 117, 678, 35, 36, 0, 3228, 1, 59,...",1,
14880,"[133, 4, 2, 1525, 1839, 1, 59, 27, 0, 27, 26, ...",1,
14881,"[4273, 145, 21, 2020, 9, 21, 0, 70, 0, 1, 14, ...",1,
14892,"[20, 38, 0, 0, 19, 5, 1577, 0, 7, 0, 1300, 380...",1,


In [10]:
d2_machines

Unnamed: 0,text,label,model
0,"[20, 38, 5, 0, 540, 9, 5, 339, 4968, 1, 20, 26...",0,6.0
1,"[466, 0, 4897, 35, 5, 2446, 930, 1028, 1, 27, ...",0,6.0
2,"[194, 66, 20, 173, 157, 4584, 38, 2782, 74, 4,...",0,3.0
4,"[307, 42, 26, 120, 188, 0, 1, 11, 0, 0, 7, 0, ...",0,5.0
5,"[20, 8, 154, 300, 26, 383, 1, 20, 428, 322, 38...",0,6.0
...,...,...,...
14895,"[1446, 4, 2966, 2, 98, 49, 547, 453, 198, 98, ...",0,1.0
14896,"[1122, 5, 250, 61, 0, 0, 1, 14, 162, 14, 158, ...",0,3.0
14897,"[180, 69, 45, 6, 79, 4772, 4456, 1928, 3, 5, 2...",0,2.0
14898,"[30, 227, 3, 1398, 324, 64, 0, 18, 21, 84, 123...",0,3.0


In [11]:
d2_machines['model'].value_counts()

model
0.0    2364
3.0    2358
1.0    2357
2.0    2339
6.0    1763
4.0     789
5.0     780
Name: count, dtype: int64

In [12]:
# Perform a stratified sampling on the machine data wrt to LLM model
d2_machine_sample = d2_machines[['model']].sample(n=2150, random_state=42)
d2_machine_sample

Unnamed: 0,model
4648,0.0
5391,0.0
10030,3.0
6310,4.0
8504,1.0
...,...
2905,0.0
14072,6.0
1111,0.0
4474,1.0


In [13]:
d2_machine_sample['model'].value_counts()

model
1.0    429
0.0    424
3.0    393
2.0    379
6.0    258
5.0    135
4.0    132
Name: count, dtype: int64

In [14]:
# Extract the indices of the sampled machine data
d2_machine_sample_index = d2_machine_sample.index

# Extract the rows with the indices
d2_machines = data_d2.iloc[d2_machine_sample_index, :]

In [15]:
d2_machines

Unnamed: 0,text,label,model
4648,"[25, 1406, 3, 11, 111, 83, 79, 3578, 3, 8, 8, ...",0,0.0
5391,"[243, 163, 10, 536, 19, 21, 231, 1, 14, 214, 2...",0,0.0
10030,"[0, 0, 1418, 8, 4733, 3612, 4, 2, 0, 1393, 351...",0,3.0
6310,"[594, 466, 5, 3455, 6, 2943, 4, 116, 141, 120,...",0,4.0
8504,"[623, 86, 5, 70, 1098, 1, 10, 46, 8, 37, 239, ...",0,1.0
...,...,...,...
2905,"[594, 466, 5, 3455, 6, 2943, 4, 116, 141, 120,...",0,0.0
14072,"[5, 1091, 272, 6, 3232, 32, 2, 1012, 4, 3240, ...",0,6.0
1111,"[18, 2, 138, 4, 193, 3, 208, 5, 512, 689, 9, 2...",0,0.0
4474,"[2, 3539, 42, 32, 3, 29, 3, 28, 2, 1109, 12, 5...",0,1.0


In [16]:
# Combine the human and machine data to form the new data_d2
data_d2 = pd.concat([d2_humans, d2_machines], ignore_index=True)
data_d2

Unnamed: 0,text,label,model
0,"[2, 98, 2096, 21, 258, 4, 1013, 1, 21, 258, 78...",1,
1,"[72, 79, 216, 3, 20, 1410, 9, 5, 302, 0, 228, ...",1,
2,"[2, 48, 678, 10, 272, 16, 585, 30, 431, 1, 11,...",1,
3,"[0, 47, 253, 234, 511, 486, 557, 12, 572, 3451...",1,
4,"[10, 3666, 36, 172, 1, 27, 10, 2397, 2, 0, 0, ...",1,
...,...,...,...
4295,"[594, 466, 5, 3455, 6, 2943, 4, 116, 141, 120,...",0,0.0
4296,"[5, 1091, 272, 6, 3232, 32, 2, 1012, 4, 3240, ...",0,6.0
4297,"[18, 2, 138, 4, 193, 3, 208, 5, 512, 689, 9, 2...",0,0.0
4298,"[2, 3539, 42, 32, 3, 29, 3, 28, 2, 1109, 12, 5...",0,1.0


In [17]:
data_d2['label'].value_counts()

label
1    2150
0    2150
Name: count, dtype: int64

The dataset is now balanced with 2150 human and machine texts each.

## Separate Features and Labels

In [18]:
d1_features = data_d1['text']
d1_labels = data_d1['label']

d2_features = data_d2['text']
d2_labels = data_d2['label']

kaggle_features = data_kaggle['text']

## Create Bag of Words

In [19]:
def vector_to_bow(text_data):
    # Create an empty array to store the word counts
    sample_size = text_data.shape[0]
    bow = np.zeros(shape=(sample_size, 5000), dtype=int)

    # Iterate through every text
    for i, txt in enumerate(text_data):
        # For each word in the text, increase the word count by 1
        for word in txt:
            bow[i, word] += 1

    # Return bag of words
    return bow

In [20]:
# Transform the feature vectors to bow
d1_bow = vector_to_bow(d1_features)
d2_bow = vector_to_bow(d2_features)
kaggle_bow = vector_to_bow(kaggle_features)

In [21]:
d1_bow

array([[4, 1, 1, ..., 0, 0, 0],
       [0, 1, 2, ..., 0, 0, 0],
       [6, 1, 1, ..., 0, 0, 0],
       ...,
       [6, 7, 2, ..., 0, 0, 0],
       [5, 1, 2, ..., 0, 0, 0],
       [9, 4, 8, ..., 0, 0, 0]])

In [22]:
d2_bow

array([[ 1,  5,  5, ...,  0,  0,  0],
       [22, 20,  7, ...,  0,  0,  0],
       [ 0,  5,  4, ...,  0,  0,  0],
       ...,
       [ 1,  5,  6, ...,  0,  0,  0],
       [ 6,  9,  3, ...,  0,  0,  0],
       [29, 14, 10, ...,  0,  0,  0]])

In [23]:
kaggle_bow

array([[32, 14, 14, ...,  0,  0,  0],
       [ 2, 11,  4, ...,  0,  0,  0],
       [ 0,  0,  1, ...,  0,  0,  0],
       ...,
       [ 0,  0,  3, ...,  0,  0,  0],
       [ 7,  1,  3, ...,  0,  0,  0],
       [27, 18, 15, ...,  0,  0,  0]])

In [24]:
# Create a combined bow
combined_bow = np.vstack([d1_bow, d2_bow])
combined_labels = pd.concat([d1_labels, d2_labels], ignore_index=True)

## LGBM Classifier with Bag of Words Model

In [25]:
# Build an intial LGBM model
lgbm1 = LGBMClassifier(objective= 'binary', metric= 'binary_logloss', boosting= 'gbdt', num_boost_round= 200, learning_rate= 0.1, num_leaves= 50, n_jobs= -1, reg_lambda= 1)

In [26]:
lgbm1.fit(d1_bow, d1_labels)





In [27]:
# Build the next LGBM model
lgbm2 = LGBMClassifier(objective= 'binary', metric= 'binary_logloss', boosting= 'dart', num_boost_round= 200, learning_rate= 0.01, num_leaves= 50, n_jobs= -1, reg_lambda= 1)

In [28]:
# Fine tune the model using lgbm1's weights on domain 2 data
lgbm2.fit(d2_bow, d2_labels, init_model=lgbm1)



In [29]:
# Fine tune the final model on the combined data
lgbm2.fit(combined_bow, combined_labels)



In [30]:
# Check final train predictions
comb_preds = lgbm2.predict(combined_bow)
accuracy_score(combined_labels, comb_preds)

0.8713865546218488

In [31]:
# Predict on kaggle data and save the outputs
test_preds = lgbm2.predict(kaggle_bow)
output = pd.DataFrame({'id': range(len(test_preds)), 'class': test_preds})
output.to_csv('Outputs/lgbm_output_1.csv', index=False)

In [32]:
# Check the balance in the predictions
output['class'].value_counts()

class
1    507
0    493
Name: count, dtype: int64

### Best classifier with public leaderboard pred: 0.80

In [36]:
# Get the predict probs on kaggle data
probs = lgbm2.predict_proba(kaggle_bow)[:, 1]
prob_output = pd.DataFrame({'id': range(len(probs)), 'probs': probs})
prob_output.to_csv('Outputs/lgbm_probs.csv', index = False)