In [15]:
import pandas as pd

fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

fake['label'] = 0
real['label'] = 1

data = pd.concat([fake, real])
data = data.sample(frac=1).reset_index(drop=True)


In [16]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nanga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Step 3: Clean the text (using split instead of word_tokenize)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    cleaned_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(cleaned_tokens)


In [18]:
# Step 4: Apply and preview
data['clean_text'] = data['text'].apply(clean_text)
data[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,The White House confirmed to FOX Business that...,white house confirmed fox business priebus wou...
1,ISLAMABAD (Reuters) - A Pakistani court issued...,islamabad reuters pakistani court issued arres...
2,"Watch CNN, Dana Bash ask Donald a question she...",watch cnn dana bash ask donald question wishes...
3,BARCELONA (Reuters) - Police will remove peopl...,barcelona reuters police remove people catalan...
4,Trump is not your traditional Republican candi...,trump traditional republican candidate anyone ...


In [19]:
#phase 4 

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [21]:
vectorizer = TfidfVectorizer(max_features=5000)  # limit to 5000 most important words


In [22]:
X = vectorizer.fit_transform(data['clean_text'])  # feature matrix
y = data['label']  # labels: 0 = Fake, 1 = Real


In [23]:
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (44898, 5000)
y shape: (44898,)


In [24]:
#phase 5

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [27]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)


In [28]:
y_pred_log = log_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Classification Report:\n", classification_report(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))


Logistic Regression Accuracy: 0.9887527839643653
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4685
           1       0.99      0.99      0.99      4295

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Confusion Matrix:
 [[4625   60]
 [  41 4254]]


In [29]:
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)


In [1]:
#Phase 6

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
import pandas as pd

# Load the fake and real news CSV files
fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

# Add label: 0 = Fake, 1 = Real
fake['label'] = 0
real['label'] = 1

# Combine both
data = pd.concat([fake, real])
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the data


In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    return ' '.join([word for word in tokens if word not in stop_words])

# Apply text cleaning
data['clean_text'] = data['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nanga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['clean_text'])
y = data['label']


In [6]:
# Split the dataset: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [7]:
# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# Support Vector Machine (Linear)
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Random Forest Classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


In [8]:
# Logistic Regression Results
print("=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Report:\n", classification_report(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))

# SVM Results
print("\n=== SVM ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

# Random Forest Results
print("\n=== Random Forest ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


=== Logistic Regression ===
Accuracy: 0.988641425389755
Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4677
           1       0.99      0.99      0.99      4303

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Confusion Matrix:
 [[4615   62]
 [  40 4263]]

=== SVM ===
Accuracy: 0.99543429844098
Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      4677
           1       0.99      1.00      1.00      4303

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

Confusion Matrix:
 [[4651   26]
 [  15 4288]]

=== Random Forest ===
Accuracy: 0.9981069042316258
Report:
               precision    recall  f1-score   support

           0       1.00 