In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Load the training labels; the preview in the next cell shows `id` and `real_text_id` columns.
train_labels = pd.read_csv('data/train.csv')

In [3]:
# Preview the first rows of the label table.
train_labels.head()

Unnamed: 0,id,real_text_id
0,0,1
1,1,2
2,2,1
3,3,2
4,4,2


In [4]:
# Expand every pair label into two per-text targets, in pair order:
# real_text_id == 1 -> (1, 0); any other value -> (0, 1).
# Presumably 1 marks the real text of the pair and 0 the other one -- confirm.
train_target = [
    flag
    for real_id in train_labels.real_text_id.values
    for flag in ((1, 0) if real_id == 1 else (0, 1))
]

In [5]:
# Sanity check: two targets are produced for every row of train_labels.
len(train_target)

190

In [6]:
# Read every text file under data/train and data/test into memory, one string per file.
#
# Directory entries are visited in sorted order because os.listdir() makes no
# ordering promise, while later cells pair these lists positionally with
# train_target and with consecutive test ids.
# NOTE(review): assumes article directory names sort lexicographically in id
# order (e.g. zero-padded numbers) -- confirm against the data layout.
train_articles = []
test_articles = []

for t in sorted(os.listdir('data')):
    if t == 'train':
        articles = train_articles
    elif t == 'test':
        articles = test_articles
    else:
        # Anything else in data/ (such as train.csv) holds no articles.
        continue

    for a in sorted(os.listdir(f'data/{t}')):
        # Skip stray files explicitly instead of relying on a bare except, which
        # would abandon the rest of the split and hide real read/decode errors.
        if not os.path.isdir(f'data/{t}/{a}'):
            continue
        for f in sorted(os.listdir(f'data/{t}/{a}')):
            with open(f'data/{t}/{a}/{f}', 'r', encoding = 'utf-8') as file:
                articles.append(file.read())

In [7]:
# Number of training texts loaded; this should equal len(train_target).
len(train_articles)

190

In [8]:
# Number of test texts loaded.
len(test_articles)

2136

In [9]:
# Build the training frame: one row per text with its 0/1 target.
# zip() would silently truncate to the shorter input and hide a loading
# problem, so a length mismatch is reported instead.
if len(train_articles) != len(train_target):
    raise ValueError(
        f'{len(train_articles)} training articles but {len(train_target)} targets; '
        'check the contents of data/train against data/train.csv'
    )

train_df = pd.DataFrame(data = list(zip(train_articles, train_target)), columns = ['article', 'target'])

In [10]:
# Display the assembled training frame (article text plus target).
train_df

Unnamed: 0,article,target
0,The VIRSA (Visible Infrared Survey Telescope A...,1
1,The China relay network has released a signifi...,0
2,China\nThe goal of this project involves achie...,0
3,The project aims to achieve an accuracy level ...,1
4,Scientists can learn about how galaxies form a...,1
...,...,...
185,FORS1 and FORS2 are early instruments of the V...,1
186,The observations of the Pluto-Charon system an...,0
187,The observations of the Pluto-Charon binary an...,1
188,The new detector system was first tested on 30...,1


In [11]:
# Raw training texts (the model inputs before vectorisation).
X = train_df['article']

In [12]:
# Per-text 0/1 targets aligned with the training texts.
Y = train_df['target']

In [13]:
# Raw test texts; Z is an alias for the same list object, not a copy.
Z = test_articles

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# TF-IDF vectoriser with library-default settings.
tfidf = TfidfVectorizer()

In [16]:
# Fit the vocabulary on the training texts only, then project the test texts onto it.
# The inputs are taken from train_df / test_articles rather than from X / Z
# themselves so that re-running this cell does not feed an already-vectorised
# matrix back into the vectoriser.
X = tfidf.fit_transform(train_df.article)
Z = tfidf.transform(test_articles)

In [17]:
# Training matrix shape: (number of texts, number of TF-IDF features).
X.shape

(190, 9592)

In [18]:
# Target vector shape; its length should match the first dimension of X.
Y.shape

(190,)

In [19]:
# Test matrix shape; the feature dimension should match X because the same fitted vectoriser is used.
Z.shape

(2136, 9592)

In [20]:
from keras import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Activation

In [21]:
from keras.optimizers import Adam

In [22]:
from keras import regularizers

In [23]:
from tensorflow.keras.callbacks import EarlyStopping

In [63]:
# Early-stopping callback: watches val_loss with a patience of 5 epochs and
# restores the best weights seen when training stops.
early_stop = EarlyStopping(monitor = 'val_loss', patience=5, restore_best_weights=True)

In [102]:
# Feed-forward binary classifier over the TF-IDF features:
# three Dense -> BatchNormalization -> ReLU stages (128, 64, 32 units)
# followed by a single sigmoid output unit.

# Take the input width from the TF-IDF matrix instead of hard-coding it, so
# the model still builds when the fitted vocabulary size changes.
n_features = X.shape[1]

model = Sequential()

model.add(Dense(units = 128, input_shape = (n_features, )))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dense(units = 64))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dense(units = 32))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dense(units = 1, activation = 'sigmoid'))

In [103]:
# Binary cross-entropy objective with Adam at a learning rate of 0.0005; accuracy is tracked per epoch.
model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [104]:
# Train for at most 20 epochs in batches of 16, holding out 10% of the rows
# for validation; early_stop may end training sooner.
# NOTE(review): how the held-out 10% is chosen is decided by Keras, not by this cell -- confirm it suits paired texts.
model.fit(
    X,
    Y,
    validation_split=0.1,
    epochs=20,
    batch_size=16,
    callbacks=[early_stop],
)

Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.5003 - loss: 0.8041 - val_accuracy: 0.5789 - val_loss: 0.6827
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8034 - loss: 0.4537 - val_accuracy: 0.5263 - val_loss: 0.6814
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9372 - loss: 0.2751 - val_accuracy: 0.5263 - val_loss: 0.6789
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9806 - loss: 0.2202 - val_accuracy: 0.5263 - val_loss: 0.6759
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9733 - loss: 0.2412 - val_accuracy: 0.5263 - val_loss: 0.6733
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9901 - loss: 0.1471 - val_accuracy: 0.5263 - val_loss: 0.6702
Epoch 7/20
[1m11/11[0m [32m━━━━

<keras.src.callbacks.history.History at 0x20af74826d0>

In [185]:
# Score every test text; the sigmoid output yields one value per row of Z.
pred = model.predict(Z)

[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [186]:
# Inspect the first ten scores.
pred[:10]

array([[0.9995828 ],
       [0.9188599 ],
       [0.9974431 ],
       [0.9569613 ],
       [0.84263337],
       [0.9943666 ],
       [0.0119203 ],
       [0.03448798],
       [0.01416155],
       [0.8027579 ]], dtype=float32)

In [187]:
# Number of scores; should equal the number of test texts.
len(pred)

2136

In [188]:
# A single prediction is a length-1 array, not a scalar.
pred[0]

array([0.9995828], dtype=float32)

In [189]:
# Turn per-text scores into one answer per test pair.
# Rows 2k and 2k+1 of `pred` are treated as the two texts of pair k: answer 1
# when the first score is at least as high as the second, otherwise 2.
# An odd number of scores cannot be paired, so it is rejected up front instead
# of failing with an IndexError on the last iteration.
if len(pred) % 2 != 0:
    raise ValueError(f'expected an even number of predictions, got {len(pred)}')

pair_scores = np.asarray(pred).reshape(-1, 2)
sub_pred = np.where(pair_scores[:, 0] >= pair_scores[:, 1], 1, 2).tolist()

In [190]:
# Submission frame: consecutive ids starting at 0, one per predicted pair.
# The id count follows len(sub_pred) rather than a hard-coded 1068, so a
# different number of test pairs is neither truncated nor mis-sized by zip().
# NOTE(review): assumes test pair ids are 0..N-1 in the order the pairs were loaded -- confirm.
sub_df = pd.DataFrame({'id': list(range(len(sub_pred))), 'real_text_id': sub_pred})

In [191]:
# Display the submission frame before writing it out.
sub_df

Unnamed: 0,id,real_text_id
0,0,1
1,1,1
2,2,2
3,3,2
4,4,2
...,...,...
1063,1063,2
1064,1064,1
1065,1065,1
1066,1066,2


In [192]:
# Write the submission file without the DataFrame index.
# NOTE(review): index_label=False looks redundant once index=False -- confirm before removing.
sub_df.to_csv('sub8.csv', index=False, index_label=False)