# SVM Model

## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

## Load and Read the Data

In [2]:
# Load and shuffle the datasets
# The shuffles are seeded so that a Restart & Run All reproduces the same row
# order (the later seeded under-sampling depends on this order)
# Don't shuffle kaggle data
data_d1 = pd.read_json('../data/domain1_train.json', lines=True).sample(frac=1, random_state=42, ignore_index=True)
data_d2 = pd.read_json('../data/domain2_train.json', lines=True).sample(frac=1, random_state=42, ignore_index=True)
data_kaggle = pd.read_json('../data/test_set.json', lines=True)

In [3]:
# Preview the shuffled domain 1 training frame
data_d1

Unnamed: 0,text,label
0,"[40, 716, 17, 141, 138, 9, 0, 72, 40, 3246, 29...",1
1,"[0, 0, 804, 6, 0, 18, 0, 41, 100, 70, 0, 18, 1...",1
2,"[80, 56, 43, 55, 142, 4767, 9, 2, 195, 3, 13, ...",0
3,"[2, 837, 6, 30, 593, 3, 22, 0, 9, 82, 2261, 52...",0
4,"[532, 2, 204, 976, 2538, 336, 2, 0, 38, 5, 645...",1
...,...,...
19495,"[0, 0, 26, 2096, 0, 4, 0, 4, 1048, 18, 150, 0,...",1
19496,"[58, 14, 220, 10, 5, 3252, 799, 0, 18, 21, 215...",1
19497,"[0, 7, 0, 0, 63, 2091, 152, 62, 7, 2, 0, 0, 27...",1
19498,"[159, 3, 42, 8, 115, 28, 56, 445, 196, 0, 3, 2...",1


In [4]:
# Count the rows per label in domain 1 to check the class balance
data_d1['label'].value_counts()

label
1    9750
0    9750
Name: count, dtype: int64

In [5]:
# Preview the domain 2 frame
data_d2

Unnamed: 0,text,label,model
0,"[120, 25, 519, 8, 8, 17, 75, 5, 2706, 1, 180, ...",0,2.0
1,"[325, 36, 1153, 0, 9, 2426, 496, 1, 27, 10, 68...",0,6.0
2,"[0, 0, 899, 433, 0, 9, 0, 0, 8, 15, 0, 0, 3, 2...",0,0.0
3,"[0, 1457, 41, 5, 70, 0, 9, 2076, 3, 153, 4, 5,...",0,0.0
4,"[20, 300, 669, 54, 19, 19, 79, 662, 0, 249, 24...",0,3.0
...,...,...,...
14895,"[2, 98, 420, 5, 0, 1, 14, 582, 4, 93, 58, 10, ...",0,2.0
14896,"[42, 133, 4, 2, 633, 4, 116, 143, 2, 3636, 283...",0,0.0
14897,"[10, 24, 2, 2361, 1, 27, 2639, 5, 3479, 24, 5,...",0,2.0
14898,"[9, 0, 486, 47, 557, 3, 342, 5, 253, 35, 2, 36...",0,2.0


In [6]:
# Count the rows per label in domain 2 to check the class balance
data_d2['label'].value_counts()

label
0    12750
1     2150
Name: count, dtype: int64

It can be seen that the data from domain 2 is heavily imbalanced (12,750 samples with label 0 versus 2,150 with label 1) and needs to be balanced to build a good model.

In [7]:
# Preview the (unshuffled) kaggle test frame
data_kaggle

Unnamed: 0,id,text
0,0,"[59, 2, 3434, 1013, 823, 2, 887, 6, 2375, 0, 3..."
1,1,"[2, 785, 6, 5, 0, 317, 17, 5, 2970, 3654, 858,..."
2,2,"[3306, 4, 2, 2288, 6, 963, 94]"
3,3,"[0, 2, 1021, 38, 126, 1, 907, 1, 761, 1, 3357,..."
4,4,"[30, 488, 197, 18, 4, 31, 420, 18, 2, 3577, 6,..."
...,...,...
995,995,"[45, 88, 79, 503, 501, 134, 35, 147, 301, 0, 1..."
996,996,"[0, 0, 8, 15, 0, 810, 3, 2337, 9, 4653, 3, 41,..."
997,997,"[2, 495, 752, 17, 128, 9, 2, 3361, 6, 458, 131..."
998,998,"[2, 4433, 299, 307, 12, 100, 0, 446, 32, 44, 0..."


## Balance the Dataset

We have seen that domain 2's data is heavily imbalanced and needs to be balanced. For this I will use random under-sampling: the majority (machine) class is randomly reduced to the size of the minority (human) class.

In [8]:
# Split domain 2 by label: rows with label 1 are the human texts,
# rows with label 0 are the machine texts
d2_humans = data_d2[data_d2['label'] == 1]
d2_machines = data_d2[data_d2['label'] == 0]

In [9]:
# Preview the label-1 (human) rows of domain 2
d2_humans

Unnamed: 0,text,label,model
5,"[2, 357, 4, 120, 70, 134, 171, 5, 204, 251, 1,...",1,
9,"[465, 5, 70, 2483, 977, 1, 14, 133, 4, 21, 381...",1,
15,"[10, 2096, 33, 36, 1840, 4, 21, 172, 1, 27, 12...",1,
20,"[56, 239, 4, 31, 0, 3830, 53, 90, 2, 1745, 1, ...",1,
21,"[0, 106, 1, 2, 146, 3, 431, 3, 7, 88, 26, 203,...",1,
...,...,...,...
14870,"[5, 1747, 1762, 195, 60, 2661, 1752, 0, 4, 273...",1,
14875,"[79, 37, 1, 211, 1, 872, 1687, 18, 33, 973, 0,...",1,
14877,"[1422, 9, 0, 434, 32, 5, 828, 4, 0, 9, 18, 5, ...",1,
14879,"[20, 8, 115, 40, 261, 3051, 98, 49, 26, 44, 12...",1,


In [10]:
# Preview the label-0 (machine) rows of domain 2
d2_machines

Unnamed: 0,text,label,model
0,"[120, 25, 519, 8, 8, 17, 75, 5, 2706, 1, 180, ...",0,2.0
1,"[325, 36, 1153, 0, 9, 2426, 496, 1, 27, 10, 68...",0,6.0
2,"[0, 0, 899, 433, 0, 9, 0, 0, 8, 15, 0, 0, 3, 2...",0,0.0
3,"[0, 1457, 41, 5, 70, 0, 9, 2076, 3, 153, 4, 5,...",0,0.0
4,"[20, 300, 669, 54, 19, 19, 79, 662, 0, 249, 24...",0,3.0
...,...,...,...
14895,"[2, 98, 420, 5, 0, 1, 14, 582, 4, 93, 58, 10, ...",0,2.0
14896,"[42, 133, 4, 2, 633, 4, 116, 143, 2, 3636, 283...",0,0.0
14897,"[10, 24, 2, 2361, 1, 27, 2639, 5, 3479, 24, 5,...",0,2.0
14898,"[9, 0, 486, 47, 557, 3, 342, 5, 253, 35, 2, 36...",0,2.0


In [11]:
# Count the machine rows per value of the 'model' column
d2_machines['model'].value_counts()

model
0.0    2364
3.0    2358
1.0    2357
2.0    2339
6.0    1763
4.0     789
5.0     780
Name: count, dtype: int64

In [12]:
# Draw a seeded random sample of 2150 machine rows (only the 'model' column is kept here;
# the full rows are recovered from the sampled index afterwards).
# NOTE(review): this is a simple random sample, not a stratified one - `sample` draws
# uniformly over rows, so the LLM model mix is only approximately preserved. Use a
# per-model groupby sample if strict stratification is intended.
d2_machine_sample = d2_machines[['model']].sample(n=2150, random_state=42)
d2_machine_sample

Unnamed: 0,model
4682,2.0
5420,2.0
10045,3.0
6339,6.0
8551,1.0
...,...
2978,2.0
14077,1.0
1128,3.0
4511,4.0


In [13]:
# Count the sampled rows per value of the 'model' column
d2_machine_sample['model'].value_counts()

model
2.0    411
0.0    396
3.0    387
1.0    385
6.0    301
4.0    148
5.0    122
Name: count, dtype: int64

In [14]:
# Extract the index labels of the sampled machine data
d2_machine_sample_index = d2_machine_sample.index

# Extract the rows with those labels.
# The sample's index holds labels, not positions, so select with the label-based
# .loc rather than .iloc. Selecting from d2_machines (the frame the sample was
# drawn from) also keeps this cell correct when it is re-run after data_d2 has
# been rebuilt further down.
d2_machines = d2_machines.loc[d2_machine_sample_index, :]

In [15]:
# Preview the machine rows selected by the sampled index
d2_machines

Unnamed: 0,text,label,model
4682,"[1439, 38, 4566, 33, 0, 3, 28, 0, 1, 25, 1849,...",0,2.0
5420,"[158, 4, 1140, 109, 4, 2317, 1, 27, 128, 5, 26...",0,2.0
10045,"[20, 32, 5, 0, 448, 1, 30, 707, 20, 77, 457, 5...",0,3.0
6339,"[10, 125, 4, 957, 36, 188, 52, 6, 575, 1, 27, ...",0,6.0
8551,"[17, 664, 129, 1224, 21, 1847, 24, 193, 1, 14,...",0,1.0
...,...,...,...
2978,"[25, 11, 8, 83, 367, 3, 29, 11, 32, 4, 101, 16...",0,2.0
14077,"[158, 5, 70, 0, 1, 10, 301, 662, 16, 5, 70, 0,...",0,1.0
1128,"[10, 125, 4, 2, 1525, 1, 14, 10, 127, 629, 61,...",0,3.0
4511,"[16, 39, 0, 922, 3, 39, 241, 162, 4, 1483, 57,...",0,4.0


In [16]:
# Stack the human rows on top of the sampled machine rows and renumber the index.
# This rebinds data_d2, so the original unbalanced frame is no longer available
# under that name after this cell.
data_d2 = pd.concat(objs=[d2_humans, d2_machines], axis=0, ignore_index=True)
data_d2

Unnamed: 0,text,label,model
0,"[2, 357, 4, 120, 70, 134, 171, 5, 204, 251, 1,...",1,
1,"[465, 5, 70, 2483, 977, 1, 14, 133, 4, 21, 381...",1,
2,"[10, 2096, 33, 36, 1840, 4, 21, 172, 1, 27, 12...",1,
3,"[56, 239, 4, 31, 0, 3830, 53, 90, 2, 1745, 1, ...",1,
4,"[0, 106, 1, 2, 146, 3, 431, 3, 7, 88, 26, 203,...",1,
...,...,...,...
4295,"[25, 11, 8, 83, 367, 3, 29, 11, 32, 4, 101, 16...",0,2.0
4296,"[158, 5, 70, 0, 1, 10, 301, 662, 16, 5, 70, 0,...",0,1.0
4297,"[10, 125, 4, 2, 1525, 1, 14, 10, 127, 629, 61,...",0,3.0
4298,"[16, 39, 0, 922, 3, 39, 241, 162, 4, 1483, 57,...",0,4.0


In [17]:
# Re-check the rows per label after combining the two parts
data_d2['label'].value_counts()

label
1    2150
0    2150
Name: count, dtype: int64

The dataset is now balanced, with 2150 human texts and 2150 machine texts.

## Separate Features and Labels

In [18]:
# Split each training frame into its 'text' column (features) and 'label' column (targets)
d1_features, d1_labels = data_d1['text'], data_d1['label']
d2_features, d2_labels = data_d2['text'], data_d2['label']

# For the kaggle frame only the 'text' column is taken
kaggle_features = data_kaggle['text']

## Create Bag of Words

In [19]:
def vector_to_bow(text_data, vocab_size=5000):
    """Convert sequences of token ids into a bag-of-words count matrix.

    Parameters
    ----------
    text_data : sequence of sequences of int
        One entry per text; each entry holds the token ids of that text.
        Anything supporting ``len()`` and iteration works (e.g. a pandas
        Series of lists, or a plain list of lists).
    vocab_size : int, default 5000
        Number of columns of the result; valid token ids are
        ``0 .. vocab_size - 1``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(text_data), vocab_size)`` where entry
        ``[i, w]`` is the number of times token ``w`` occurs in text ``i``.

    Raises
    ------
    IndexError
        If a token id is negative or not smaller than ``vocab_size``.
    """
    # Create an empty array to store the word counts
    sample_size = len(text_data)
    bow = np.zeros(shape=(sample_size, vocab_size), dtype=int)

    # Iterate through every text
    for i, txt in enumerate(text_data):
        # An empty text contributes no counts; its row stays all zeros
        if len(txt) == 0:
            continue

        tokens = np.asarray(txt, dtype=int)

        # Reject out-of-range ids explicitly: a negative id would otherwise be
        # counted in a column at the end of the row via negative indexing
        if tokens.min() < 0 or tokens.max() >= vocab_size:
            raise IndexError(
                f"token ids in text {i} must lie in [0, {vocab_size - 1}]"
            )

        # Count every token id of this text in one vectorised call
        bow[i] = np.bincount(tokens, minlength=vocab_size)

    # Return bag of words
    return bow

In [20]:
# Turn each set of token sequences into its bag-of-words count matrix
d1_bow, d2_bow, kaggle_bow = (
    vector_to_bow(token_lists)
    for token_lists in (d1_features, d2_features, kaggle_features)
)

In [21]:
# Inspect the domain 1 bag-of-words matrix
d1_bow

array([[ 2,  1,  1, ...,  0,  0,  0],
       [ 9,  1,  2, ...,  0,  0,  0],
       [ 1,  1,  3, ...,  0,  0,  0],
       ...,
       [ 6,  0,  2, ...,  0,  0,  0],
       [ 1,  1,  0, ...,  0,  0,  0],
       [15,  0,  0, ...,  0,  0,  0]])

In [22]:
# Inspect the domain 2 bag-of-words matrix
d2_bow

array([[1, 5, 4, ..., 0, 0, 0],
       [6, 5, 3, ..., 0, 0, 0],
       [1, 5, 0, ..., 0, 0, 0],
       ...,
       [1, 5, 4, ..., 0, 0, 0],
       [2, 4, 2, ..., 0, 0, 0],
       [8, 1, 0, ..., 0, 0, 0]])

In [23]:
# Inspect the kaggle bag-of-words matrix
kaggle_bow

array([[32, 14, 14, ...,  0,  0,  0],
       [ 2, 11,  4, ...,  0,  0,  0],
       [ 0,  0,  1, ...,  0,  0,  0],
       ...,
       [ 0,  0,  3, ...,  0,  0,  0],
       [ 7,  1,  3, ...,  0,  0,  0],
       [27, 18, 15, ...,  0,  0,  0]])

In [24]:
# Stack the domain 1 rows on top of the domain 2 rows, and join the labels
# in the same order so that row i of combined_bow matches combined_labels[i]
combined_bow = np.concatenate((d1_bow, d2_bow), axis=0)
combined_labels = pd.concat(objs=[d1_labels, d2_labels], axis=0, ignore_index=True)

In [25]:
# Inspect the stacked (domain 1 + domain 2) bag-of-words matrix
combined_bow

array([[2, 1, 1, ..., 0, 0, 0],
       [9, 1, 2, ..., 0, 0, 0],
       [1, 1, 3, ..., 0, 0, 0],
       ...,
       [1, 5, 4, ..., 0, 0, 0],
       [2, 4, 2, ..., 0, 0, 0],
       [8, 1, 0, ..., 0, 0, 0]])

## PCA Dimensionality Reduction

In [26]:
# Initialise a PCA instance
# random_state pins the result: depending on the input size scikit-learn may
# select a randomized SVD solver, whose output otherwise varies between runs
pca = PCA(n_components=400, random_state=42)

# Fit to combined data (the kaggle data is only transformed, never fitted on)
pca.fit(combined_bow)

# Transform each dataset
# combined_bow stacks the d1_bow rows first, so split at d1_bow's row count
# instead of a hard-coded 19500
combined_pca = pca.transform(combined_bow)
d1_pca = combined_pca[:len(d1_bow), :]
d2_pca = combined_pca[len(d1_bow):, :]
kaggle_pca = pca.transform(kaggle_bow)

In [27]:
# Show the shapes of the two reduced matrices, one per line
print(d1_pca.shape, d2_pca.shape, sep='\n')

(19500, 400)
(4300, 400)


## SVM Classifier

In [28]:
# Configure an RBF-kernel SVC; probability=True is needed for the
# predict_proba call made later in the notebook
svc = SVC(
    kernel='rbf',
    C=10,
    gamma=0.001,
    probability=True,
    random_state=42,
)

In [29]:
# Fit on domain 1 data
# NOTE: SVC.fit always trains from scratch, so this fit is replaced by any later svc.fit call
svc.fit(d1_pca, d1_labels)

In [30]:
# Refit on domain 2 data only
# NOTE: this is not fine-tuning - SVC.fit discards the previously fitted model and retrains from scratch
svc.fit(d2_pca, d2_labels)

In [31]:
# Fit the final model on the combined (domain 1 + domain 2) data
# NOTE: SVC.fit retrains from scratch, so only this fit determines the model used below
svc.fit(combined_pca, combined_labels)

In [32]:
# Accuracy on the same combined data the model was fitted on
# (a training score, not a held-out estimate)
comb_preds = svc.predict(combined_pca)
accuracy_score(y_true=combined_labels, y_pred=comb_preds)

0.900672268907563

In [33]:
# Predict on kaggle data
test_preds = svc.predict(kaggle_pca)

# Pair each prediction with the id shipped in the test set instead of assuming
# the ids are 0..n-1 in row order (kaggle_pca keeps data_kaggle's row order)
output = pd.DataFrame({'id': data_kaggle['id'].to_numpy(), 'class': test_preds})
output.to_csv('Outputs/svc_output.csv', index=False)

In [34]:
# Check the balance in the predictions (number of test rows assigned to each class)
output['class'].value_counts()

class
0    511
1    489
Name: count, dtype: int64

### Second-best with pred: 0.796

In [35]:
# Get the predict probs on kaggle data
# Column 1 of predict_proba belongs to svc.classes_[1], i.e. label 1 for 0/1 labels
probs = svc.predict_proba(kaggle_pca)[:, 1]

# Pair each probability with the id shipped in the test set instead of assuming
# the ids are 0..n-1 in row order (kaggle_pca keeps data_kaggle's row order)
prob_output = pd.DataFrame({'id': data_kaggle['id'].to_numpy(), 'probs': probs})
prob_output.to_csv('Outputs/svc_probs.csv', index=False)