# Progetto "Filtro per Fake News"

In [None]:
# Fetch and unpack the dataset; both commands are safe to re-run.
# -nc: skip the download when fake_news.zip already exists (otherwise wget saves fake_news.zip.1).
# -o : overwrite already-extracted files without the interactive prompt that would stall the cell.
!wget -nc https://proai-datasets.s3.eu-west-3.amazonaws.com/fake_news.zip
!unzip -o fake_news.zip

--2024-05-16 15:08:23--  https://proai-datasets.s3.eu-west-3.amazonaws.com/fake_news.zip
Resolving proai-datasets.s3.eu-west-3.amazonaws.com (proai-datasets.s3.eu-west-3.amazonaws.com)... 3.5.226.172, 52.95.155.82
Connecting to proai-datasets.s3.eu-west-3.amazonaws.com (proai-datasets.s3.eu-west-3.amazonaws.com)|3.5.226.172|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42975911 (41M) [application/zip]
Saving to: ‘fake_news.zip’


2024-05-16 15:08:26 (13.8 MB/s) - ‘fake_news.zip’ saved [42975911/42975911]

Archive:  fake_news.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [None]:
import pandas as pd

In [None]:
# Load the articles from True.csv and preview the first five rows.
df_true = pd.read_csv(filepath_or_buffer="True.csv")
df_true.head(n=5)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
# Load the articles from Fake.csv and preview the first five rows.
df_fake = pd.read_csv(filepath_or_buffer="Fake.csv")
df_fake.head(n=5)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
# Tag each source frame with its class before merging: 0 = True.csv rows, 1 = Fake.csv rows.
df_true['label'], df_fake['label'] = 0, 1

# Stack the two frames row-wise and renumber the rows from zero.
df = pd.concat([df_true, df_fake], axis=0, ignore_index=True)

In [None]:
import re
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus (this call runs every time the cell is executed).
nltk.download('stopwords')

# Keep the English stop words in a set; clean_text tests each word for membership in it.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one document for bag-of-words style vectorization.

    Steps, in order: lowercase, strip every character that is neither a word
    character nor whitespace, strip digit runs, then drop every remaining
    word found in the module-level ``stop_words`` set.

    Args:
        text: The raw document. Non-string values (for example the float NaN
            or None that pandas uses for empty CSV cells) are treated as an
            empty document instead of raising ``AttributeError``.

    Returns:
        The cleaned text as a single space-separated string; ``''`` when the
        input is not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as NaN/None and have no .lower(); map them to an empty document.
        return ''
    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so "u.s." becomes "us".
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # split() with no argument also collapses the whitespace left behind by the removals above.
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

import spacy

# Load the small English pipeline with the 'parser' and 'ner' components disabled;
# spacy_cleaner below only reads each token's lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def spacy_cleaner(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is processed by the module-level ``nlp`` pipeline and the
    lemmas of the resulting tokens are joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Run clean_text over every article body and store the result in a new column.
df['text_cleaned'] = df['text'].map(clean_text)

In [None]:
# Lemmatize every article body. The texts are streamed through nlp.pipe in batches
# rather than calling the pipeline once per row, which is much faster on a corpus
# of this size while producing the same space-joined lemmas as spacy_cleaner.
df['text_lemmatized'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(df['text'], batch_size=256)
]



1.   Le fake news sono più frequenti in una determinata categoria?

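Dai risultati possiamo dedurre che le fake news sono più frequenti in determinate categorie. In particolare, le categorie Government News, Middle-east, News, US_News, left-news e politics contengono addirittura solo notizie false, il che suggerisce una maggiore tendenza a pubblicare notizie false su temi politici/geopolitici. Va però notato che anche le categorie composte solo da notizie vere (politicsNews e worldnews) trattano temi politici: le etichette di categoria non si sovrappongono tra i due file di origine (True.csv e Fake.csv), quindi la categoria riflette soprattutto la provenienza del dato e questo risultato va interpretato con cautela.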



In [None]:
# Count articles per (subject, label) pair and pivot the labels into columns 0 and 1.
category_counts = (
    df.groupby(['subject', 'label'])
      .size()
      .unstack(fill_value=0)
)

# Row total first, then the share of label-1 (fake) articles as a percentage.
category_counts['total'] = category_counts.sum(axis=1)
category_counts['fake_news_percentage'] = (
    category_counts[1].div(category_counts['total']).mul(100)
)

print(category_counts.sort_values(by='fake_news_percentage', ascending=False))

label                0     1  total  fake_news_percentage
subject                                                  
Government News      0  1570   1570                 100.0
Middle-east          0   778    778                 100.0
News                 0  9050   9050                 100.0
US_News              0   783    783                 100.0
left-news            0  4459   4459                 100.0
politics             0  6841   6841                 100.0
politicsNews     11272     0  11272                   0.0
worldnews        10145     0  10145                   0.0


2. Per ogni categoria, ci sono argomenti che sono più soggetti alle fake news?

Nella categoria 'News' si nota come gli argomenti nei quali prevalgono le fake news riguardano sia questioni politiche (Trump, Clinton, ecc.) sia questioni sociali (polizia, diritti).

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# For every subject, fit a 5-topic LDA model on the fake articles of that subject
# and print the ten highest-weighted words of each topic.
subjects = df['subject'].unique()

for subject in subjects:

    # Fake articles only (label == 1) for the current subject.
    df_subject_fake = df[(df['label'] == 1) & (df['subject'] == subject)]

    if len(df_subject_fake) > 0:
        print(f"Analizzando la categoria: {subject}")

        # Document-term matrix built from the raw article text.
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        dtm = vectorizer.fit_transform(df_subject_fake['text'])

        LDA = LatentDirichletAllocation(n_components=5, random_state=42)
        LDA.fit(dtm)

        # Build the vocabulary array once per subject instead of once per printed word.
        feature_names = vectorizer.get_feature_names_out()

        for i, topic in enumerate(LDA.components_):
            print(f"Top 10 parole per l'argomento #{i} della categoria {subject}:")
            # argsort is ascending, so the slice lists the ten heaviest words
            # from the 10th-ranked up to the strongest one (printed last).
            top_word_indices = topic.argsort()[-10:]
            for index in top_word_indices:
                print(feature_names[index])
            print("\n")
    else:
        print(f"Nessuna fake news trovata nella categoria: {subject}")
    print("------------------------------------------------------\n")

Nessuna fake news trovata nella categoria: politicsNews
------------------------------------------------------

Nessuna fake news trovata nella categoria: worldnews
------------------------------------------------------

Analizzando la categoria: News
Top 10 parole per l'argomento #0 della categoria News:
just
care
health
government
states
court
law
republicans
state
people


Top 10 parole per l'argomento #1 della categoria News:
just
com
pic
2017
people
president
realdonaldtrump
donald
twitter
trump


Top 10 parole per l'argomento #2 della categoria News:
russia
image
clinton
news
campaign
just
president
said
donald
trump


Top 10 parole per l'argomento #3 della categoria News:
white
right
image
video
police
black
like
just
said
people


Top 10 parole per l'argomento #4 della categoria News:
obama
donald
vote
cruz
republicans
party
clinton
president
republican
trump


------------------------------------------------------

Analizzando la categoria: politics
Top 10 parole per l'argomen


3.   I titoli delle fake news presentano dei pattern?

Dall'analisi degli n-gram più frequenti nei titoli delle fake news emerge che, per attirare maggiore attenzione, vengono utilizzati spesso nomi di personaggi politici di spicco, come "Donald Trump" e "Hillary Clinton", sfruttando così l'interesse verso le vicende a loro legate. Inoltre, l'uso di espressioni come "anti Trump" tende a creare controversie e a generare determinate reazioni emotive.
Dunque, esistono dei pattern ricorrenti nei titoli delle fake news, che vengono formulati proprio per suscitare reazioni, generare controversie e catturare l'attenzione dei lettori.



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Titles of the articles labelled as fake (label == 1).
fake_news_titles = df.loc[df['label'] == 1, 'title']

# Count every 2- and 3-word sequence in the titles, ignoring English stop words.
vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')
X = vectorizer.fit_transform(fake_news_titles)

# Column sums give the corpus-wide frequency of each n-gram.
sum_words = X.sum(axis=0)
words_freq = sorted(
    ((ngram, sum_words[0, column]) for ngram, column in vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ten most frequent n-grams.
for word, freq in words_freq[:10]:
    print(word, freq)

donald trump 805
president trump 496
hillary clinton 416
white house 348
fox news 314
trump video 292
anti trump 261
bernie sanders 254
ted cruz 224
trump just 209


In [None]:
from sklearn.model_selection import train_test_split


# Features are the cleaned article bodies; the target is the 0/1 label column.
X, y = df['text_cleaned'], df['label']

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF features: ignore terms present in more than 70% of the documents (max_df)
# or in fewer than 2 documents (min_df), plus the built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, min_df=2, stop_words='english')


# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed with them.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Single hidden layer of 100 ReLU units, trained with Adam for at most 300 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    activation='relu',
    solver='adam',
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = mlp.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4330
           1       0.99      0.99      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [None]:
import pickle


# Persist the fitted vectorizer and classifier together as one (vectorizer, model) tuple,
# so that inference can reload both from a single file.
model_bundle = (tfidf_vectorizer, mlp)
with open('fake_news_detector_mlp.pkl', mode='wb') as model_file:
    pickle.dump(model_bundle, model_file)