In [1]:
#all imports
import pandas as pd
import re
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import svm
from scipy import sparse, hstack
import string
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Lower-case column names used by the two training CSVs, mapped to the capitalised names
# that the external test CSV uses.
_COLUMN_RENAMES = {'headline': 'Headline', 'content': 'Content', 'label': 'Label'}
_RAW_COLUMNS = list(_COLUMN_RENAMES)

fake_df = (
    pd.read_csv("/kaggle/input/banfakenews/Fake-1K.csv")[_RAW_COLUMNS]
    .rename(columns=_COLUMN_RENAMES)
)
authentic_df = (
    pd.read_csv("/kaggle/input/banfakenews/Authentic-48K.csv")[_RAW_COLUMNS]
    .rename(columns=_COLUMN_RENAMES)
)

# Each class is split 80/20 on its own (same seed), so the train and validation sets
# keep the same fake/authentic proportion.
fake_train, fake_test = train_test_split(fake_df, test_size=0.2, random_state=42)
authentic_train, authentic_test = train_test_split(authentic_df, test_size=0.2, random_state=42)

# Training set = 80% parts of both classes; validation set = the remaining 20% parts.
df_train = pd.concat([fake_train, authentic_train])
df_val = pd.concat([fake_test, authentic_test])

# Separate, externally collected test set (already uses the capitalised column names).
df_test = pd.read_csv("/kaggle/input/newnewssep2023/new_fake_real_merged.csv")[['Headline', 'Content', 'Label']]

In [3]:
# One character class covering everything the tokenizer strips from a word:
# the Bengali danda, curly quotes, all ASCII punctuation, and the Bengali digits U+09E6-U+09EF.
# Compiled once because the tokenizer is called for every document.
_TOKEN_STRIP_RE = re.compile("[" + re.escape("।”“’" + string.punctuation) + "\u09E6-\u09EF]")


def tokenizer(doc):
    """Split ``doc`` on single spaces and strip punctuation and Bengali digits from each piece.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        The cleaned pieces in order; pieces that become empty after cleaning are dropped.

    Notes
    -----
    * Splitting is on the literal space character only, so tabs/newlines stay inside tokens.
    * Only Bengali digits are removed; ASCII digits are kept.
    * Every matching character is removed. (Passing ``re.DEBUG`` as the fourth positional
      argument of ``re.sub`` would be interpreted as ``count`` and cap the removals per word.)
    """
    tokens = []
    for word in doc.split(" "):
        cleaned = _TOKEN_STRIP_RE.sub("", word)
        if cleaned != "":
            tokens.append(cleaned)
    return tokens

In [4]:
def tfidf_charF(X, X_ret, a, b, save_model=False):
    """Fit a character n-gram TF-IDF vectorizer on ``X`` and return ``X_ret`` transformed by it.

    Parameters
    ----------
    X : pandas object exposing ``.values``
        Texts the vectorizer is fitted on.
    X_ret : pandas object exposing ``.values``
        Texts to transform with the fitted vectorizer.
    a, b : int
        Lower and upper bound of the character n-gram range.
    save_model : bool, optional
        Accepted but not used by this function.

    Returns
    -------
    The matrix produced by ``TfidfVectorizer.transform`` for ``X_ret``.
    """
    vectorizer_options = dict(
        analyzer='char',
        ngram_range=(a, b),
        min_df=5,
        sublinear_tf=True,
        norm='l2',
        encoding='utf-8',
        decode_error='replace',
    )
    char_vectorizer = TfidfVectorizer(**vectorizer_options)

    # astype('U') turns every entry (including missing values) into a unicode string.
    fit_corpus = X.values.astype('U')
    char_vectorizer.fit(fit_corpus)

    target_corpus = X_ret.values.astype('U')
    return char_vectorizer.transform(target_corpus)

In [5]:
def tfidf_wordF(X, X_ret, a, b):
    """Fit a word n-gram TF-IDF vectorizer on ``X`` and return ``X_ret`` transformed by it.

    Words are produced by the notebook's ``tokenizer`` function.

    Parameters
    ----------
    X : pandas object exposing ``.values``
        Texts the vectorizer is fitted on.
    X_ret : pandas object exposing ``.values``
        Texts to transform with the fitted vectorizer.
    a, b : int
        Lower and upper bound of the word n-gram range.

    Returns
    -------
    The matrix produced by ``TfidfVectorizer.transform`` for ``X_ret``.
    """
    vectorizer_options = dict(
        analyzer='word',
        tokenizer=tokenizer,
        ngram_range=(a, b),
        min_df=5,
        sublinear_tf=True,
        norm='l2',
        encoding='utf-8',
        decode_error='replace',
    )
    word_vectorizer = TfidfVectorizer(**vectorizer_options)

    # astype('U') turns every entry (including missing values) into a unicode string.
    fit_corpus = X.values.astype('U')
    word_vectorizer.fit(fit_corpus)

    target_corpus = X_ret.values.astype('U')
    return word_vectorizer.transform(target_corpus)

In [6]:
#load fasttext
# Download the pre-trained fastText cc.bn.300 vectors (about 3.7 GB compressed) into the
# current working directory.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz
# !pip install fasttext
# Decompress in place to cc.bn.300.bin. The absolute path assumes the download above
# landed in /kaggle/working — TODO confirm the working directory when running elsewhere.
!gunzip /kaggle/working/cc.bn.300.bin.gz

--2024-06-02 16:19:31--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.7.38, 13.35.7.128, 13.35.7.50, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.7.38|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3934298272 (3.7G) [application/octet-stream]
Saving to: 'cc.bn.300.bin.gz'


2024-06-02 16:19:59 (136 MB/s) - 'cc.bn.300.bin.gz' saved [3934298272/3934298272]



In [7]:
import fasttext
import fasttext.util
# Load the decompressed fastText binary. `ft` is the global model object that
# get_sentence_vectors reads word vectors and the embedding dimension from.
ft = fasttext.load_model('/kaggle/working/cc.bn.300.bin')



In [8]:
from scipy import sparse
def get_sentence_vectors(texts):
    """Embed each text as the mean of the fastText vectors of its whitespace-separated words.

    Uses the global fastText model ``ft``.

    Parameters
    ----------
    texts : iterable
        Documents to embed. Entries that are not ``str`` (for example the NaN that pandas
        yields when a headline or content cell is missing) are treated as empty text
        instead of raising ``AttributeError`` on ``.split()``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_texts, ft.get_dimension())``. Rows for texts without
        any words are all zeros. The result is 2-D even when ``texts`` is empty.
    """
    dim = ft.get_dimension()
    vectors = []
    for text in texts:
        words = text.split() if isinstance(text, str) else []
        if words:
            # Average the per-word vectors into one fixed-size document vector.
            sentence_vector = np.mean([ft.get_word_vector(word) for word in words], axis=0)
        else:
            # No words: fall back to a zero vector of the embedding size.
            sentence_vector = np.zeros(dim, dtype=np.float32)
        vectors.append(sentence_vector)
    # reshape(-1, dim) keeps the (n, dim) layout for an empty input as well.
    return np.asarray(vectors, dtype=np.float32).reshape(-1, dim)

In [9]:
# Character n-gram (3-5) TF-IDF features for every split; the vectorizer is always
# fitted on the training text only.

# Separator placed between headline and content: space, two backslashes, space.
# Written as a raw string because "\ " is not a valid escape sequence in a normal literal.
TEXT_SEP = r" \\ "
train_text = df_train['Headline'] + TEXT_SEP + df_train['Content']
val_text = df_val['Headline'] + TEXT_SEP + df_val['Content']
test_text = df_test['Headline'] + TEXT_SEP + df_test['Content']

X_train_char = tfidf_charF(train_text, train_text, 3, 5)

X_val_char = tfidf_charF(train_text, val_text, 3, 5)

X_test_char = tfidf_charF(train_text, test_text, 3, 5)

In [10]:
# Word n-gram (1-3) TF-IDF features for every split; the vectorizer is always
# fitted on the training text only.

# Separator placed between headline and content: space, two backslashes, space.
# Written as a raw string because "\ " is not a valid escape sequence in a normal literal.
TEXT_SEP = r" \\ "
train_text = df_train['Headline'] + TEXT_SEP + df_train['Content']
val_text = df_val['Headline'] + TEXT_SEP + df_val['Content']
test_text = df_test['Headline'] + TEXT_SEP + df_test['Content']

X_train_word = tfidf_wordF(train_text, train_text, 1, 3)

X_val_word = tfidf_wordF(train_text, val_text, 1, 3)

X_test_word = tfidf_wordF(train_text, test_text, 1, 3)



In [11]:
# Mean fastText embedding of "headline <sep> content" for every split.

# Separator placed between headline and content: space, two backslashes, space.
# Written as a raw string because "\ " is not a valid escape sequence in a normal literal.
TEXT_SEP = r" \\ "

X_train_emb = get_sentence_vectors(df_train['Headline'] + TEXT_SEP + df_train['Content'])

X_val_emb = get_sentence_vectors(df_val['Headline'] + TEXT_SEP + df_val['Content'])

X_test_emb = get_sentence_vectors(df_test['Headline'] + TEXT_SEP + df_test['Content'])


In [12]:
# NOTE(review): `tf` is not used in this cell; the import is kept (once) in case
# other code relies on it — confirm and drop if unused.
import tensorflow as tf

# Sanity report: shape and container type of every feature matrix, grouped by split.
# Each entry is (heading printed before the group, or None, and the (label, matrix) pairs).
_feature_report = (
    (None, (
        ("X_train_word", X_train_word),
        ("X_train_char", X_train_char),
        ("X_train_emb", X_train_emb),
    )),
    ("Validation Data:", (
        ("X_val_word", X_val_word),
        ("X_val_char", X_val_char),
        ("X_val_emb", X_val_emb),
    )),
    ("\nTest Data:", (
        ("X_test_word", X_test_word),
        ("X_test_char", X_test_char),
        ("X_test_emb", X_test_emb),
    )),
)

for heading, matrices in _feature_report:
    if heading is not None:
        print(heading)
    for label, matrix in matrices:
        print(f"Shape of {label}:", matrix.shape)
        print(f"Type of {label}:", type(matrix))



2024-06-02 16:48:17.338297: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-02 16:48:17.338508: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-02 16:48:17.493530: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Shape of X_train_word: (39981, 633837)
Type of X_train_word: <class 'scipy.sparse._csr.csr_matrix'>
Shape of X_train_char: (39981, 746098)
Type of X_train_char: <class 'scipy.sparse._csr.csr_matrix'>
Shape of X_train_emb: (39981, 300)
Type of X_train_emb: <class 'numpy.ndarray'>
Validation Data:
Shape of X_val_word: (9996, 633837)
Type of X_val_word: <class 'scipy.sparse._csr.csr_matrix'>
Shape of X_val_char: (9996, 746098)
Type of X_val_char: <class 'scipy.sparse._csr.csr_matrix'>
Shape of X_val_emb: (9996, 300)
Type of X_val_emb: <class 'numpy.ndarray'>

Test Data:
Shape of X_test_word: (920, 633837)
Type of X_test_word: <class 'scipy.sparse._csr.csr_matrix'>
Shape of X_test_char: (920, 746098)
Type of X_test_char: <class 'scipy.sparse._csr.csr_matrix'>
Shape of X_test_emb: (920, 300)
Type of X_test_emb: <class 'numpy.ndarray'>


In [13]:
#final_train_set

# Stack the char TF-IDF, word TF-IDF and embedding columns side by side for each split.
# format="csr" pins the result to CSR (row-oriented) instead of leaving the output
# format to hstack's default choice.
X_train = sparse.hstack([X_train_char, X_train_word, X_train_emb], format="csr")
X_val = sparse.hstack([X_val_char, X_val_word, X_val_emb], format="csr")
X_test = sparse.hstack([X_test_char, X_test_word, X_test_emb], format="csr")

# Every split must expose the same feature columns or the model cannot score it.
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], "feature width differs between splits"

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(39981, 1380235)
(9996, 1380235)
(920, 1380235)


In [14]:
#y_set

# Target vectors: the 'Label' column of each split, in train / validation / test order.
y_train, y_val, y_test = (split['Label'] for split in (df_train, df_val, df_test))

In [15]:
#define param_grid

# Candidate hyper-parameter values; each key maps to the list of values to try.
param_grid = dict(C=[1], degree=[3])

# Best-so-far trackers updated by the model search: score, its parameters, and the fitted model.
best_f1_score, best_params, best_model = 0, None, None

In [16]:
#Find best model
from sklearn.svm import SVC

# Try every (C, degree) combination from param_grid and keep the model with the highest
# macro-F1 on the validation split.
# NOTE(review): scikit-learn documents `degree` as used only by the 'poly' kernel, so with
# kernel='linear' this grid dimension presumably has no effect — confirm.
# NOTE(review): judging by the input file names the training data is roughly 1K fake vs
# 48K authentic articles; consider class_weight='balanced' — confirm against the data.
for C in param_grid['C']:
    for degree in param_grid['degree']:
        # Initialize SVM model
        svm_model = SVC(kernel='linear', C=C, cache_size=7000, degree=degree)

        # Train SVM model
        svm_model.fit(X_train, y_train)

        # Predictions on validation set
        val_predictions = svm_model.predict(X_val)

        # Calculate F1 score
        f1 = f1_score(y_val, val_predictions, average='macro')

        # Always accept the first candidate so best_model is never left as None
        # (a strict `>` against the initial score of 0 would reject a model with F1 == 0).
        if best_model is None or f1 > best_f1_score:
            best_f1_score = f1
            best_params = {'C': C, 'degree': degree}
            best_model = svm_model

print("Best Parameters:", best_params)

Best Parameters: {'C': 1, 'degree': 3}


In [17]:
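#train model
# No refit needed: best_model is the estimator that was already fitted on
# (X_train, y_train) during the search above.
# best_model.fit(X_train, y_train)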

In [18]:
#test model
# Predict labels for the external test features with the model selected on the validation split.
test_predictions = best_model.predict(X_test)

In [19]:
#get results
# Per-class precision / recall / F1 on the external test set, printed with 5 decimal places.
# NOTE(review): in the recorded output below, recall is 0.05435 for class 0 and 1.00000 for
# class 1, i.e. almost every test row is predicted as class 1 — worth investigating.
print("Test Set:")
print(classification_report(y_test, test_predictions, digits=5))

Test Set:
              precision    recall  f1-score   support

           0    1.00000   0.05435   0.10309       460
           1    0.51397   1.00000   0.67897       460

    accuracy                        0.52717       920
   macro avg    0.75698   0.52717   0.39103       920
weighted avg    0.75698   0.52717   0.39103       920

