In [1]:
import pandas as pd
from TextHandler import *

In [2]:
sources = pd.read_json('datasets/sources.json', orient='index')

df_credible_1 = pd.read_json('datasets/source_3/scraped_articles.json')
df_credible_2 = pd.read_json('datasets/source_6/scraped_articles.json')

df_not_credible_1 = pd.read_json('datasets/source_7/scraped_articles.json')
df_not_credible_2 = pd.read_json('datasets/source_14/scraped_articles.json')

print("data_Set_1_credible:",df_credible_1.count())
print("data_Set_2_credible:",df_credible_2.count())
print("data_Set_3_not-credible:",df_not_credible_1.count())
print("data_Set_4_not-credible:",df_not_credible_2.count())

data_Set_1_credible: articles    4394
dtype: int64
data_Set_2_credible: articles    1211
dtype: int64
data_Set_3_not-credible: articles    2704
dtype: int64
data_Set_4_not-credible: articles    3381
dtype: int64


In [3]:
df_credible_3 = pd.read_json('datasets/source_1/scraped_articles.json')

df_not_credible_3 = pd.read_json('datasets/source_8/scraped_articles.json')

print("data_Set_3_credible:",df_credible_3.count())
print("data_Set_3_not-credible:",df_not_credible_3.count())


data_Set_3_credible: articles    2288
dtype: int64
data_Set_3_not-credible: articles    3654
dtype: int64


In [4]:
df_credible_1['credibility'] = 1
df_credible_2['credibility'] = 1
df_not_credible_1['credibility'] = 0
df_not_credible_2['credibility'] = 0
df_credible_3['credibility'] = 1
df_not_credible_3['credibility'] = 0

merged_df = pd.concat([df_credible_1, df_credible_2, df_not_credible_1, df_not_credible_2,df_credible_3, df_not_credible_3])

# merged_df['credibility'] = 1
# merged_df.loc[merged_df.index.isin(df_not_credible_1.index) | merged_df.index.isin(df_not_credible_2.index), 'credibility'] = 0


texts = []
for article in merged_df['articles']:
    text = article['text']
    texts.append(text)

merged_df['text'] = texts

new_df = merged_df[['text', 'credibility']]
print("New DataFrame:")
print(new_df.head())

New DataFrame:
                                                text  credibility
0  تمكنت عناصر جهاز خفر السواحل الليبي، فجر اليوم...            1
1  أكد المبعوث الأميركي الخاص إلى السودان وجنوب ا...            1
2  ‬اطلع رئيس لجنة الإدارة المكلف بشركة الخليج ال...            1
3  ‬نشرت الشركة الليبية للموانئ بيانات حديثة أظهر...            1
4  طالب حراك “الاستفتاء أولا” بعرض مسودة الدستور ...            1


In [5]:
data=new_df['text']
preprocessed_data=preprocess_text(data)
print(preprocessed_data[0])

تمك عنصر جهز خفر سحل ليب فجر اليوم قاذ هجر طرق الا شواطاء الا ورب علا متن زورق ططي نقل وكل ليب نطق رئس ارك قوت بحر ان زورق زوي تحر تلق ندء غاث عمل علا قاذ هجر زال قعد طرابلس بحر نقل الا جهز كفح هجر قنن تمم جرء رحل بلد وكان خفر سحل ليب اعد ايم قلل هجر الا ليب خلل عمل قاذ نفذ طلع اعل تحدث بسم ظمه هجر دول


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, new_df['credibility'], test_size=0.2, random_state=42)

ngram_ranges = [(1, 1), (2, 2), (3, 3), (4, 4)]

classifiers = [
    LogisticRegression(),
    MultinomialNB(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    SVC()
]

metrics_dict = {}

for ngram_range in ngram_ranges:
    ngram_metrics = {}
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    X_train_normalized = preprocessing.normalize(X_train_tfidf, norm='l2')
    X_test_normalized = preprocessing.normalize(X_test_tfidf, norm='l2')

    for classifier in classifiers:
        classifier_name = classifier.__class__.__name__
        classifier_metrics = []

        classifier.fit(X_train_normalized, y_train)
        y_pred = classifier.predict(X_test_normalized)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        classifier_metrics.extend([accuracy, precision, recall, f1])
        ngram_metrics[classifier_name] = classifier_metrics

    metrics_dict[ngram_range] = ngram_metrics

for ngram_range, ngram_metrics in metrics_dict.items():
    print(f"n-gram range {ngram_range}:")
    for classifier_name, classifier_metrics in ngram_metrics.items():
        print(f"{classifier_name}: {classifier_metrics}")


n-gram range (1, 1):
LogisticRegression: [0.8585199886589169, 0.8703448275862069, 0.8022886204704387, 0.8349321865696329]
MultinomialNB: [0.7969946129855401, 0.8870822041553749, 0.6242848061029879, 0.7328358208955223]
RandomForestClassifier: [0.8434930535866175, 0.8835462058602555, 0.7476160203432931, 0.8099173553719008]
KNeighborsClassifier: [0.8196767791324071, 0.8358422939068101, 0.7412587412587412, 0.7857142857142858]
SVC: [0.8692940175786787, 0.8871866295264624, 0.8099173553719008, 0.8467929544699236]
n-gram range (2, 2):
LogisticRegression: [0.8610717323504394, 0.9044062733383121, 0.7698664971392244, 0.8317307692307692]
MultinomialNB: [0.830450808052169, 0.9452054794520548, 0.6579783852511125, 0.7758620689655172]
RandomForestClassifier: [0.8395236745109158, 0.8834729626808835, 0.7374443738080102, 0.8038808038808041]
KNeighborsClassifier: [0.8364048766657216, 0.8502109704641351, 0.768595041322314, 0.8073455759599332]
SVC: [0.8588035157357528, 0.8943506969919296, 0.7749523204068659

In [16]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, new_df['credibility'], test_size=0.2, random_state=42)

ngram_ranges = [(1, 1), (2, 2), (3, 3), (4, 4)]

classifiers = [
    LogisticRegression(),
    MultinomialNB(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    SVC()
]

metrics_dict = {}

for ngram_range in ngram_ranges:
    ngram_metrics = {}
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    Cv_train = vectorizer.fit_transform(X_train)
    Cv_test = vectorizer.transform(X_test)

    X_train_normalized = preprocessing.normalize(Cv_train, norm='l2')
    X_test_normalized = preprocessing.normalize(Cv_test, norm='l2')

    for classifier in classifiers:
        classifier_name = classifier.__class__.__name__
        classifier_metrics = []

        classifier.fit(X_train_normalized, y_train)
        y_pred = classifier.predict(X_test_normalized)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        classifier_metrics.extend([accuracy, precision, recall, f1])
        ngram_metrics[classifier_name] = classifier_metrics

    metrics_dict[ngram_range] = ngram_metrics

for ngram_range, ngram_metrics in metrics_dict.items():
    print(f"n-gram range {ngram_range}:")
    for classifier_name, classifier_metrics in ngram_metrics.items():
        print(f"{classifier_name}: {classifier_metrics}")


n-gram range (1, 1):
LogisticRegression: [0.8463283243549758, 0.8552722260509993, 0.7889383343928799, 0.8207671957671957]
MultinomialNB: [0.7734618656081655, 0.8948979591836734, 0.5575333757151939, 0.6870348609479044]
RandomForestClassifier: [0.8483130138928268, 0.8844444444444445, 0.7590591226954864, 0.816968867601779]
KNeighborsClassifier: [0.8219449957470939, 0.854463615903976, 0.7240940877304514, 0.7838953888506539]
SVC: [0.8704281258860221, 0.8896648044692738, 0.8099173553719008, 0.8479201331114808]
n-gram range (2, 2):
LogisticRegression: [0.856818826197902, 0.8744740532959326, 0.7927527018436109, 0.831610536845615]
MultinomialNB: [0.8250637935922881, 0.9458955223880597, 0.6446280991735537, 0.7667296786389415]
RandomForestClassifier: [0.8457612702013042, 0.8883018867924528, 0.7482517482517482, 0.8122843340234646]
KNeighborsClassifier: [0.8287496455911539, 0.843860894251242, 0.7558804831532104, 0.7974513749161637]
SVC: [0.856818826197902, 0.8734265734265734, 0.7940241576605213, 0.

In [10]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn import preprocessing

X = preprocessed_data
y = np.array(new_df['credibility']) 

ngram_ranges = [(1, 1), (2, 2), (3, 3), (4, 4)]

classifiers = [
    LogisticRegression(max_iter=5000), 
    MultinomialNB(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    SVC()
]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

metrics_dict = {}

for classifier in classifiers:
    classifier_name = classifier.__class__.__name__
    classifier_metrics = []

    for ngram_range in ngram_ranges:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        X_tfidf = vectorizer.fit_transform(X)

        accuracies = []
        for train_index, test_index in kf.split(X_tfidf):
            X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
            y_train, y_test = y[train_index], y[test_index]

            scaler = preprocessing.StandardScaler(with_mean=False)
            X_train_normalized = scaler.fit_transform(X_train)
            X_test_normalized = scaler.transform(X_test)

            classifier.fit(X_train_normalized, y_train)
            y_pred = classifier.predict(X_test_normalized)
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)
        
        mean_accuracy = np.mean(accuracies)
        classifier_metrics.append(mean_accuracy)

    metrics_dict[classifier_name] = classifier_metrics

for classifier_name, classifier_metrics in metrics_dict.items():
    print(f"Classifier: {classifier_name}")
    for i, ngram_range in enumerate(ngram_ranges):
        print(f"Accuracy for n-gram range {ngram_range}: {classifier_metrics[i]}")


n-gram range (1, 1):
LogisticRegression: [0.803325166688704, 0.7520025355189345, 0.8368455895611294, 0.8756787881276606]
MultinomialNB: [0.7687329538280876, 0.7887838066335847, 0.824332238111989, 0.7671637231900994]
RandomForestClassifier: [0.8746298240672118, 0.7808857137916984, 0.8192294217202187, 0.8742143708142566]
KNeighborsClassifier: [0.8729179739226941, 0.8474508749586477, 0.8660763867840361, 0.8539593821905289]
SVC: [0.8701277130766711, 0.8552331707224525, 0.7669288683293718, 0.7782000385746827]
n-gram range (2, 2):
LogisticRegression: [0.7518919036670363, 0.8765400315842968, 0.8070822320866221, 0.8341910638746831]
MultinomialNB: [0.8312877118304857, 0.7756724449362499, 0.8579672114784866, 0.8650632321490627]
RandomForestClassifier: [0.8169743215835891, 0.7674282210602178, 0.868596377571866, 0.8687948066453639]
KNeighborsClassifier: [0.7705345974551594, 0.8178320379037324, 0.8528818327937819, 0.8601063561432825]
SVC: [0.8740822992980735, 0.8070828864106551, 0.8286887977908205,

In [12]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn import preprocessing

X = preprocessed_data
y = np.array(new_df['credibility']) 

ngram_ranges = [(1, 1), (2, 2), (3, 3), (4, 4)]

classifiers = [
    LogisticRegression(max_iter=5000), 
    MultinomialNB(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    SVC()
]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

metrics_dict = {}

for classifier in classifiers:
    classifier_name = classifier.__class__.__name__
    classifier_metrics = []

    for ngram_range in ngram_ranges:
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        X_cv = vectorizer.fit_transform(X)

        accuracies = []
        for train_index, test_index in kf.split(X_cv):
            X_train, X_test = X_cv[train_index], X_cv[test_index]
            y_train, y_test = y[train_index], y[test_index]

            scaler = preprocessing.StandardScaler(with_mean=False)
            X_train_normalized = scaler.fit_transform(X_train)
            X_test_normalized = scaler.transform(X_test)

            classifier.fit(X_train_normalized, y_train)
            y_pred = classifier.predict(X_test_normalized)
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)
        
        mean_accuracy = np.mean(accuracies)
        classifier_metrics.append(mean_accuracy)

    metrics_dict[classifier_name] = classifier_metrics

for classifier_name, classifier_metrics in metrics_dict.items():
    print(f"Classifier: {classifier_name}")
    for i, ngram_range in enumerate(ngram_ranges):
        print(f"Accuracy for n-gram range {ngram_range}: {classifier_metrics[i]}")

n-gram range (1, 1):
LogisticRegression: [0.8347197124640949, 0.8110071582672342, 0.8461463447478034, 0.7762595889094368]
MultinomialNB: [0.7611388711666636, 0.7691235627347063, 0.8478678781032152, 0.8734483158885844]
RandomForestClassifier: [0.8631520167943452, 0.8010024295656184, 0.8527426114606755, 0.8502861779440326]
KNeighborsClassifier: [0.7985466009138847, 0.839158616013457, 0.7513681956737674, 0.8285738052646322]
SVC: [0.8727320696574463, 0.7958477405268602, 0.8695736521638028, 0.8235046270990873]
n-gram range (2, 2):
LogisticRegression: [0.7778611637531285, 0.789073915871661, 0.8290788598365846, 0.8029454266458239]
MultinomialNB: [0.7633648878901645, 0.8771314836405651, 0.7883191974576838, 0.8781967352417549]
RandomForestClassifier: [0.8119366364414952, 0.8425971391634842, 0.8187779250234412, 0.8536632551164716]
KNeighborsClassifier: [0.75878944439993, 0.7590601403238866, 0.8789596546957419, 0.8768530224912646]
SVC: [0.8625358572483076, 0.7571154792896387, 0.8000368025594276, 

In [17]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

device = tf.device("cuda:0" if tf.test.is_gpu_available() else "cpu")
if tf.test.is_gpu_available():
    print("GPU")
else:
    print("CPU")
    
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, new_df['credibility'], test_size=0.2, random_state=42)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

max_sequence_length = max(len(seq) for seq in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

with device:
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train_padded, y_train, epochs=10, batch_size=32)

    y_pred = model.predict_classes(X_test_padded)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")





CPU
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.9014


In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

device = tf.device("cuda:0" if tf.test.is_gpu_available() else "cpu")
if tf.test.is_gpu_available():
    print("GPU")
else:
    print("CPU")
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, new_df['credibility'], test_size=0.2, random_state=42)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

max_sequence_length = max(len(seq) for seq in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

with device:
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
        tf.keras.layers.GRU(64),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train_padded, y_train, epochs=10, batch_size=32)

    y_pred = model.predict_classes(X_test_padded)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")


CPU
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.8913


In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Check if a GPU is available and set the device accordingly
device = tf.device("cuda:0" if tf.test.is_gpu_available() else "cpu")
if tf.test.is_gpu_available():
    print("GPU")
else:
    print("CPU")
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, new_df['credibility'], test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of indices
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure equal length
max_sequence_length = max(len(seq) for seq in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Create the RNN model
with device:
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
        tf.keras.layers.SimpleRNN(64),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model
    model.fit(X_train_padded, y_train, epochs=10, batch_size=32)

    # Evaluate the model
    y_pred = model.predict_classes(X_test_padded)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")



CPU
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.8679
