# Import libraries and read the test file

In [1]:
import pandas as pd
from tensorflow.keras.models import load_model

from google.colab import drive
# Mount Google Drive so the test CSV and the saved model become readable.
drive.mount('/content/drive')

# Only the label column ("type") and the text column ("content") are read.
test_csv_path = "/content/drive/My Drive/995,000_rows_preprocessed_test.csv"
data_test = pd.read_csv(test_csv_path, usecols=["type", "content"])

# Restore the previously saved Keras model from Drive.
model_path = "/content/drive/My Drive/my_model_v2.keras"
model = load_model(model_path)

# Quick look at the text column to confirm the file loaded.
print(data_test['content'])

Mounted at /content/drive
0        ['``', 'subobject', "''", 'predefin', 'propert...
1        ['decis', 'import', 'liquefi', 'natur', 'ga', ...
2        ['cnn', 'get', 'faa', 'waiver', 'fli', 'drone'...
3        ['headlin', ':', 'work', 'american', 'stand', ...
4        ['tom', 'clanci', '’', 'divis', 'best', 'day',...
                               ...                        
51022    ['planet', 'x', 'visibl', ',', 'may', '<num>',...
51023    ['<num>', ',', 'time', 'life', 'jade', ',', 'w...
51024    ['president-elect', 'donald', 'trump', '’', 'n...
51025    ['search', 'properti', 'list', 'page', 'proper...
51026    ['maureen', 'dowd', 'screen', '“', 'fair', 'ga...
Name: content, Length: 51027, dtype: object


# Classify our types into binary labels. We use 1 for reliable news (`reliable`, `political` and `clickbait`) and 0 for fake news (every other type). Missing or unrecognised types are set to 0.0.

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load your dataset (assuming df is your DataFrame)
#df = data.dropna(subset=['type', 'content']).copy()

def Classificationtype(df):
  """Convert the ``type`` column of ``df`` to binary labels, in place.

  ``reliable``, ``political`` and ``clickbait`` become 1.0; every other
  label, including missing values and labels not listed in the mapping,
  becomes 0.0. Only the ``type`` column is modified; all other columns
  are left untouched. A summary of the frame is printed via ``df.info()``.

  The function is safe to call more than once on the same frame: values
  that are already 0.0 or 1.0 are kept as they are instead of being pushed
  through the string mapping again (which would turn them all into 0.0).

  Args:
    df: DataFrame with a ``type`` column holding the original string
      labels (or labels already encoded as 0.0 / 1.0).

  Returns:
    None. ``df`` is mutated in place.
  """
  label_to_binary = {
      'unreliable': 0.0,
      'fake': 0.0,
      'clickbait': 1.0,
      'conspiracy': 0.0,
      'bias': 0.0,
      'hate': 0.0,
      'junksci': 0.0,
      'political': 1.0,
      'unknown': 0.0,
      'reliable': 1.0
  }

  raw_labels = df['type']

  # Rows that already carry a binary label (e.g. the cell was re-run).
  already_binary = raw_labels.isin([0.0, 1.0])

  # Labels missing from the mapping come back as NaN here.
  binary_labels = raw_labels.map(label_to_binary)

  # Keep previously encoded values rather than letting them turn into NaN.
  binary_labels = binary_labels.mask(
      already_binary, pd.to_numeric(raw_labels, errors='coerce'))

  # Missing / unrecognised labels count as 0.0. Only 'type' is filled, so
  # gaps in other columns are not silently replaced by the float 0.0.
  df['type'] = binary_labels.fillna(0.0)

  # Verify the changes
  df.info()


# Encode the test labels in place (1.0 = reliable, 0.0 = fake).
Classificationtype(data_test)

# The labels are 0/1, so their sum is the number of rows labelled 1.0.
print(data_test['type'].sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51027 entries, 0 to 51026
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   type     51027 non-null  float64
 1   content  51027 non-null  object 
dtypes: float64(1), object(1)
memory usage: 797.4+ KB
15085.0


# Load the tokenizer and apply it to the test data


In [9]:
import pickle

# Restore the tokenizer object stored in the pickle file on Google Drive.
# Adjust the path below if the file lives elsewhere in your Drive.
# NOTE(review): pickle.load can execute arbitrary code from the file, so
# only load pickles that come from a trusted source.
tokenizer_path = '/content/drive/My Drive/tokenizer.pkl'
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

print("Tokenizer loaded successfully!")

Tokenizer loaded successfully!


In [10]:
# Ground-truth binary labels as a NumPy array.
y_test = data_test['type'].to_numpy()

# Turn every document into a list of integer token ids with the loaded tokenizer.
X_test_seq = tokenizer.texts_to_sequences(data_test['content'])

# Pad or truncate each sequence to exactly 1700 ids.
# NOTE(review): 1700 presumably matches the length used at training time — confirm.
X_test = X_padded_test = pad_sequences(X_test_seq, maxlen=1700)

# Calculate the accuracy, F1 score and confusion matrix


In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

# Loss and accuracy as reported by Keras on the full test set.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Model outputs above 0.5 are counted as class 1, everything else as class 0.
y_prob = model.predict(X_test)
y_pred = (y_prob > 0.5).astype("int32")

# Confusion matrix and F1 score for the thresholded predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:\n", conf_matrix)


[1m1595/1595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 22ms/step - accuracy: 0.8953 - loss: 0.2577
Test Accuracy: 89.54%
[1m1595/1595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 20ms/step
F1 Score: 0.82
Confusion Matrix:
 [[33516  2426]
 [ 2911 12174]]
