In [2]:
# Step 1: Upload the dataset files in Colab
from google.colab import files
uploaded = files.upload()  # This will prompt you to upload Fake.csv and True.csv

# Step 2: Import libraries and load datasets
import pandas as pd

fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

# Step 3: Label the datasets (0 = fake, 1 = real)
fake["label"] = 0
real["label"] = 1

# Step 4: Combine datasets into one and shuffle
data = pd.concat([fake, real])
data = data.sample(frac=1).reset_index(drop=True)

# Step 5: Preview combined data
print(data.head())
print(f"Dataset shape: {data.shape}")
print(data["label"].value_counts())


Saving True.csv to True.csv
Saving Fake.csv to Fake.csv
                                               title  \
0  Far right makes most noise on Twitter in Germa...   
1   Bill O’Reilly Makes Ben Carson Admit ‘It Woul...   
2  BREAKING: DEMOCRAT Makes Shocking Statement Re...   
3  HANNITY TEARS IT UP IN HIS BEST EVER RANT: ‘Hi...   
4   President Obama: Surely, Trump Can Only Be Pr...   

                                                text    subject  \
0  FRANKFURT (Reuters) - The far right is making ...  worldnews   
1  Ben Carson just got a reality check he really ...       News   
2        https://www.youtube.com/watch?v=IioEIUmawRo   politics   
3  Sean Hannity was on fire with his opening comm...   politics   
4  Only hours before he is set to deliver the las...       News   

                  date  label  
0  September 19, 2017       1  
1    February 23, 2016      0  
2         Dec 31, 2016      0  
3         Oct 18, 2017      0  
4     January 12, 2016      0  
Dataset shap

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove punctuation and digits
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text

# Apply preprocessing to the 'text' column
data['clean_text'] = data['text'].apply(preprocess_text)

# Preview
data[['text', 'clean_text']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,clean_text
0,FRANKFURT (Reuters) - The far right is making ...,frankfurt reuters far right making noise twitt...
1,Ben Carson just got a reality check he really ...,ben carson got reality check really like fello...
2,https://www.youtube.com/watch?v=IioEIUmawRo,httpswwwyoutubecomwatchviioeiumawro
3,Sean Hannity was on fire with his opening comm...,sean hannity fire opening comments clintonuran...
4,Only hours before he is set to deliver the las...,hours set deliver last state union address his...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # limit features to 5000 for speed

# Fit and transform clean text
X = vectorizer.fit_transform(data['clean_text'])

# Target labels
y = data['label']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


Train shape: (35918, 5000)
Test shape: (8980, 5000)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9894209354120267

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4731
           1       0.99      0.99      0.99      4249

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4672   59]
 [  36 4213]]
