In [1]:
import pandas as pd
from TextHandler import *

In [2]:
sources = pd.read_json('datasets/sources.json', orient='index')

df_credible_1 = pd.read_json('datasets/source_3/scraped_articles.json')
df_credible_2 = pd.read_json('datasets/source_6/scraped_articles.json')

df_not_credible_1 = pd.read_json('datasets/source_7/scraped_articles.json')
df_not_credible_2 = pd.read_json('datasets/source_14/scraped_articles.json')

print("data_Set_1_credible:",df_credible_1.count())
print("data_Set_2_credible:",df_credible_2.count())
print("data_Set_3_not-credible:",df_not_credible_1.count())
print("data_Set_4_not-credible:",df_not_credible_2.count())

data_Set_1_credible: articles    4394
dtype: int64
data_Set_2_credible: articles    1211
dtype: int64
data_Set_3_not-credible: articles    2704
dtype: int64
data_Set_4_not-credible: articles    3381
dtype: int64


In [3]:
merged_df = pd.concat([df_credible_1, df_credible_2, df_not_credible_1, df_not_credible_2])

merged_df['credibility'] = 1
merged_df.loc[merged_df.index.isin(df_not_credible_1.index) | merged_df.index.isin(df_not_credible_2.index), 'credibility'] = 0


texts = []
for article in merged_df['articles']:
    text = article['text']
    texts.append(text)

merged_df['text'] = texts

new_df = merged_df[['text', 'credibility']]
print("New DataFrame:")
print(new_df.head())

New DataFrame:
                                                text  credibility
0  تمكنت عناصر جهاز خفر السواحل الليبي، فجر اليوم...            0
1  أكد المبعوث الأميركي الخاص إلى السودان وجنوب ا...            0
2  ‬اطلع رئيس لجنة الإدارة المكلف بشركة الخليج ال...            0
3  ‬نشرت الشركة الليبية للموانئ بيانات حديثة أظهر...            0
4  طالب حراك “الاستفتاء أولا” بعرض مسودة الدستور ...            0


In [4]:
data=new_df['text']
preprocessed_data=preprocess_text(data)
print(preprocessed_data[0])

تمك عنصر جهز خفر سحل ليب فجر اليوم قاذ هجر طرق الا شواطاء الا ورب علا متن زورق ططي نقل وكل ليب نطق رئس ارك قوت بحر ان زورق زوي تحر تلق ندء غاث عمل علا قاذ هجر زال قعد طرابلس بحر نقل الا جهز كفح هجر قنن تمم جرء رحل بلد وكان خفر سحل ليب اعد ايم قلل هجر الا ليب خلل عمل قاذ نفذ طلع اعل تحدث بسم ظمه هجر دول


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, new_df['credibility'], test_size=0.2, random_state=42)

ngram_ranges = [(1, 1), (2, 2), (3, 3), (4, 4)]

for ngram_range in ngram_ranges:
    print(f"----- n-gram range: {ngram_range} -----")
    
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    
    models = {
        'LR': LogisticRegression(),
        'RF': RandomForestClassifier(),
        'KNN': KNeighborsClassifier(),
        'NB': MultinomialNB(),
        'SVM': SVC()
    }
    
    for model_name, model in models.items():
        model.fit(X_train_vectorized, y_train)
        y_pred = model.predict(X_test_vectorized)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{model_name}: Accuracy = {accuracy}')
        print(f"Accuracy for n-gram range {ngram_range}: {accuracy}")

    print("\n")


----- n-gram range: (1, 1) -----
LR: Accuracy = 0.9084687767322498
Accuracy for n-gram range (1, 1): 0.9084687767322498
RF: Accuracy = 0.9106073567151411
Accuracy for n-gram range (1, 1): 0.9106073567151411
KNN: Accuracy = 0.9020530367835757
Accuracy for n-gram range (1, 1): 0.9020530367835757
NB: Accuracy = 0.9084687767322498
Accuracy for n-gram range (1, 1): 0.9084687767322498
SVM: Accuracy = 0.9088964927288281
Accuracy for n-gram range (1, 1): 0.9088964927288281


----- n-gram range: (2, 2) -----
LR: Accuracy = 0.9084687767322498
Accuracy for n-gram range (2, 2): 0.9084687767322498
RF: Accuracy = 0.9131736526946108
Accuracy for n-gram range (2, 2): 0.9131736526946108
KNN: Accuracy = 0.9084687767322498
Accuracy for n-gram range (2, 2): 0.9084687767322498
NB: Accuracy = 0.9084687767322498
Accuracy for n-gram range (2, 2): 0.9084687767322498
SVM: Accuracy = 0.9101796407185628
Accuracy for n-gram range (2, 2): 0.9101796407185628


----- n-gram range: (3, 3) -----
LR: Accuracy = 0.908468

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(preprocess_text, new_df['credibility'], test_size=0.2, random_state=42)

ngram_ranges = [(1, 1), (2, 2), (3, 3), (4, 4)]

classifiers = [
    LogisticRegression(max_iter=1000),
    MultinomialNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    SVC()
]

for classifier in classifiers:
    classifier_name = classifier.__class__.__name__
    print(f"Classifier: {classifier_name}")
    for ngram_range in ngram_ranges:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)

        classifier.fit(X_train_tfidf, y_train)

        y_pred = classifier.predict(X_test_tfidf)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy for n-gram range {ngram_range}: {accuracy}")