In [1]:


import os

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer  # For text processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC



In [2]:
# Location of the cleaned training set.
# The default is the original author's local path; set the GDS_TRAINING_CSV
# environment variable to point at the file on another machine instead of
# editing this cell.
DATA_PATH = os.environ.get(
    "GDS_TRAINING_CSV",
    "C:/Users/Danie/GDS/CSV/Fully_cleaned_Training_set.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Show how many articles fall into each label of the 'broad_category' column.
df.loc[:, 'broad_category'].value_counts()

broad_category
Fake News        347272
Reliable News    330477
Name: count, dtype: int64

In [7]:
# Keep only the rows that actually have article text; rows with a missing
# 'content' value cannot be vectorised.
df = df[df['content'].notna()]

# Hold out 20% of the articles for evaluation. The fixed random_state makes
# the shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['broad_category'],
    random_state=42,
    test_size=0.2,
)

X_train og X_test indeholder selve teksten fra kolonnen 'content'; X_train_vec og X_test_vec (nedenfor) er de tilsvarende TF-IDF-vektoriserede matricer.
Ved brug af klassen 'TfidfVectorizer' kan vi omdanne hver tekst til en række af vægtede tal baseret på TF-IDF (Term Frequency-Inverse Document Frequency).
- Term Frequency (TF): Hvor ofte et ord optræder i en given tekst.
- Inverse Document Frequency (IDF): Hvor sjældent ordet optræder på tværs af alle dokumenter – jo sjældnere, desto vigtigere.

In [13]:
# Vectorise the text data with TF-IDF.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # single words and two-word sequences
    min_df=2,             # skip terms found in fewer than 2 documents
    max_features=20000,   # cap the vocabulary size at 20,000 terms
    sublinear_tf=True,    # use sublinear term-frequency scaling
)

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are transformed with that already-fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [14]:
# Report the dimensions of the training matrix as (rows, columns),
# i.e. (number of training texts, number of TF-IDF features).
n_train_docs, n_features = X_train_vec.shape
print((n_train_docs, n_features))


(542197, 20000)


Dette betyder, at vi har 542197 tekster i træningssættet og 20000 features (de ord og ordpar, som TfidfVectorizer har beholdt, jf. max_features=20000).

In [15]:
# Inspect the sparse TF-IDF representation of the first training text.
first_train_row = X_train_vec[0]
print(first_train_row)

  (0, 9468)	0.07190273972407592
  (0, 16977)	0.12112100636865247
  (0, 6610)	0.08161806057994309
  (0, 4016)	0.13534268095289292
  (0, 17006)	0.11625624752101084
  (0, 19230)	0.07835925516299556
  (0, 14329)	0.1046953837066282
  (0, 6835)	0.12289009095992764
  (0, 5142)	0.08963320833278582
  (0, 1038)	0.05201130135068944
  (0, 19548)	0.046714035493249546
  (0, 18168)	0.04086784643993807
  (0, 9059)	0.1590468666195599
  (0, 9944)	0.03838103087026444
  (0, 12442)	0.0744364197125583
  (0, 16296)	0.11266397060755372
  (0, 18492)	0.06549543032927188
  (0, 6058)	0.041913018591987906
  (0, 10345)	0.1044863405419083
  (0, 11548)	0.06899269290240263
  (0, 16171)	0.0694431396213956
  (0, 15845)	0.04837542764221845
  (0, 1557)	0.06736938037754478
  (0, 1939)	0.06609827204092901
  (0, 18833)	0.09019538520451623
  :	:
  (0, 11642)	0.08229159115269233
  (0, 12092)	0.05928963083786174
  (0, 8270)	0.09196334690116478
  (0, 18099)	0.06888004844595562
  (0, 16958)	0.07978733282140876
  (0, 10365)	0.0556

Forklaring af outputtet, når teksten er blevet vektoriseret:
Eksempel på output: 
(0, 2394)	0.09388827930025204

Række (0): Angiver, at vi ser på den første tekst i træningssættet.
Kolonne (2394): Hver kolonne repræsenterer en unik feature (et ord eller et ordpar) i vokabularet.
Værdi (0.0939 osv.): TF-IDF-vægten – en højere værdi betyder, at ordet er vigtigere i denne tekst i forhold til resten af korpuset.

In [16]:


# Train the SVM model: a linear support-vector classifier fitted on the
# TF-IDF features of the training texts (fit() returns the fitted estimator).
model = LinearSVC(
    C=0.8,
    class_weight='balanced',
    dual=False,
    max_iter=10000,
).fit(X_train_vec, y_train)

# Predict labels for the held-out test texts.
y_pred = model.predict(X_test_vec)

# Evaluate performance: overall accuracy and the support-weighted F1 score.
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy * 100:.2f}%")
print(f"Model F1-score: {f1:.2f}")

Model accuracy: 89.38%
Model F1-score: 0.89


In [17]:
# Function for predicting whether a news text is fake or reliable.
def predict_news(news_text, fitted_vectorizer=None, fitted_model=None):
    """Classify a single news text and return its predicted label.

    Parameters
    ----------
    news_text : str
        The article text to classify.
        NOTE(review): the training file is named "Fully_cleaned_..." and the
        example inputs in this notebook are lower-cased and stemmed, so new
        text presumably has to be cleaned the same way first - confirm.
    fitted_vectorizer : optional
        Object with a ``transform`` method that turns a list of texts into
        feature rows. Defaults to the notebook-level ``vectorizer``.
    fitted_model : optional
        Object with a ``predict`` method that maps feature rows to labels.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    The first (and only) element of the model's prediction for the text.
    """
    # Fall back to the notebook globals only when no override is supplied, so
    # existing calls such as predict_news(text) behave exactly as before.
    vec = vectorizer if fitted_vectorizer is None else fitted_vectorizer
    clf = model if fitted_model is None else fitted_model

    news_vec = vec.transform([news_text])  # vectorise the new text
    prediction = clf.predict(news_vec)     # predict its label
    return prediction[0]

# Hand-picked example articles; the comment above each one is the category
# noted by the notebook author for that text.

# Conspiracy
new_article = """
look like your use ad blocker pleas whitelist disabl abovetopsecretcom adblock tool thank
"""

# Reliable
new_article2 = """editor suppos let iran bomb week review march unit state iran share interest iraq stabil well afghanistan must work togeth contain present iraqi nose dive toward civil war turn requir mutual moratorium side demon side disallow incendiari rhetor supplant prudent foreign policymak doubtless good gener result success dialogu iraq extend larger issu region secur like even motiv iran endors call gulf cooper council persian gulf nuclearweaponsfre zone kaveh l afrasiabi cambridg mass writer former advis iran nuclear negoti team author book iran nuclear program"""

# Political 1
new_article3 = """convict fraudster former pharmaceut compani ceo martin shkreli dub hate man america head jail bail revok facebook post offer cur strand hillari clinton hair us district judg kiyo matsumoto rule wednesday shkreli"""

# Political 2
new_article4 = """red eye open thread dko list complet hit leadership delay blunt hit swing state hit toptier race knowl stork murphi embrac famili seemann newberri david vs goliath two well morrison target race verg toptier"""

# Classify every example, then print the predictions in the same order.
result = predict_news(new_article)
result2 = predict_news(new_article2)
for predicted_label in (
    result,
    result2,
    predict_news(new_article3),
    predict_news(new_article4),
):
    print(f"Prediction: {predicted_label}")


Prediction: Fake News
Prediction: Reliable News
Prediction: Fake News
Prediction: Reliable News
