In [58]:
                                                  ###THIS IS LOGICAL REGRESSION####


import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Step 1: Define the folder path
folder_name = "Combined_News_Datasets"  # Folder containing your datasets

# Step 2: Verify the files in the folder
extracted_files = os.listdir(folder_name)
print("Files in the extracted folder:")
for file in extracted_files:
    print(file)

# Step 3: Load and label the datasets
dataset_files = {
    "BuzzFeed_fake_news_content.csv": 1,
    "PolitiFact_fake_news_content.csv": 1,
    "BuzzFeed_real_news_content.csv": 0,
    "PolitiFact_real_news_content.csv": 0,
}

datasets = []
for file, label in dataset_files.items():
    file_path = os.path.join(folder_name, file)
    if os.path.exists(file_path):
        print(f"Loading dataset: {file}")
        df = pd.read_csv(file_path)
        print(f"Columns in {file}: {df.columns.tolist()}")
        
        if 'text' in df.columns:
            df = df[['text']]
            df['label'] = label
            datasets.append(df)
        else:
            print(f"⚠️ 'text' column not found in {file}. Skipping this file.")
    else:
        print(f"⚠️ Dataset not found: {file}")

# Step 4: Combine all datasets into one DataFrame
df = pd.concat(datasets, ignore_index=True)

# Step 5: Handle missing values
df = df.dropna(subset=['text', 'label'])

print("Sample rows from the combined dataset:")
print(df.head())

# Step 6: Prepare data for model training
X = df['text'].astype(str)
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
with open('fake_news_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    pickle.dump(vectorizer, model_file)

print("✅ Model and vectorizer saved successfully!")

# Step 7: Save the combined dataset to a CSV
combined_csv_path = os.path.join(folder_name, "combined_news_dataset.csv")
df.to_csv(combined_csv_path, index=False)

print(f"✅ Combined dataset saved to: {combined_csv_path}")


Files in the extracted folder:
BuzzFeed_fake_news_content.csv
BuzzFeed_real_news_content.csv
combined_news_dataset.csv
PolitiFact_fake_news_content.csv
PolitiFact_real_news_content.csv
Loading dataset: BuzzFeed_fake_news_content.csv
Columns in BuzzFeed_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_fake_news_content.csv
Columns in PolitiFact_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: BuzzFeed_real_news_content.csv
Columns in BuzzFeed_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_real_news_content.csv
Columns in PolitiFact_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 

In [60]:
                                                               ####SVM######


import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Step 1: Define the folder path
folder_name = "Combined_News_Datasets"  # Folder containing your datasets

# Step 2: Verify the files in the folder
extracted_files = os.listdir(folder_name)
print("Files in the extracted folder:")
for file in extracted_files:
    print(file)

# Step 3: Load and label the datasets
dataset_files = {
    "BuzzFeed_fake_news_content.csv": 1,
    "PolitiFact_fake_news_content.csv": 1,
    "BuzzFeed_real_news_content.csv": 0,
    "PolitiFact_real_news_content.csv": 0,
}

datasets = []
for file, label in dataset_files.items():
    file_path = os.path.join(folder_name, file)
    if os.path.exists(file_path):
        print(f"Loading dataset: {file}")
        df = pd.read_csv(file_path)
        print(f"Columns in {file}: {df.columns.tolist()}")
        
        if 'text' in df.columns:
            df = df[['text']]
            df['label'] = label
            datasets.append(df)
        else:
            print(f"⚠️ 'text' column not found in {file}. Skipping this file.")
    else:
        print(f"⚠️ Dataset not found: {file}")

# Step 4: Combine all datasets into one DataFrame
df = pd.concat(datasets, ignore_index=True)

# Step 5: Handle missing values
df = df.dropna(subset=['text', 'label'])

print("Sample rows from the combined dataset:")
print(df.head())

# Step 6: Prepare data for model training
X = df['text'].astype(str)
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Train the SVM model
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
with open('fake_news_model_svm.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    pickle.dump(vectorizer, model_file)

print("✅ SVM model and vectorizer saved successfully!")

# Step 7: Save the combined dataset to a CSV
combined_csv_path = os.path.join(folder_name, "combined_news_dataset.csv")
df.to_csv(combined_csv_path, index=False)

print(f"✅ Combined dataset saved to: {combined_csv_path}")


Files in the extracted folder:
BuzzFeed_fake_news_content.csv
BuzzFeed_real_news_content.csv
combined_news_dataset.csv
PolitiFact_fake_news_content.csv
PolitiFact_real_news_content.csv
Loading dataset: BuzzFeed_fake_news_content.csv
Columns in BuzzFeed_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_fake_news_content.csv
Columns in PolitiFact_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: BuzzFeed_real_news_content.csv
Columns in BuzzFeed_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_real_news_content.csv
Columns in PolitiFact_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 

In [62]:
                                               ######LINEAR CLASSIFICATION######
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Step 1: Define the folder path
folder_name = "Combined_News_Datasets"  # Folder containing your datasets

# Step 2: Verify the files in the folder
extracted_files = os.listdir(folder_name)
print("Files in the extracted folder:")
for file in extracted_files:
    print(file)

# Step 3: Load and label the datasets
dataset_files = {
    "BuzzFeed_fake_news_content.csv": 1,
    "PolitiFact_fake_news_content.csv": 1,
    "BuzzFeed_real_news_content.csv": 0,
    "PolitiFact_real_news_content.csv": 0,
}

datasets = []
for file, label in dataset_files.items():
    file_path = os.path.join(folder_name, file)
    if os.path.exists(file_path):
        print(f"Loading dataset: {file}")
        df = pd.read_csv(file_path)
        print(f"Columns in {file}: {df.columns.tolist()}")
        
        if 'text' in df.columns:
            df = df[['text']]
            df['label'] = label
            datasets.append(df)
        else:
            print(f"⚠️ 'text' column not found in {file}. Skipping this file.")
    else:
        print(f"⚠️ Dataset not found: {file}")

# Step 4: Combine all datasets into one DataFrame
df = pd.concat(datasets, ignore_index=True)

# Step 5: Handle missing values
df = df.dropna(subset=['text', 'label'])

print("Sample rows from the combined dataset:")
print(df.head())

# Step 6: Prepare data for model training
X = df['text'].astype(str)
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Train the linear classifier (SGDClassifier)
model = SGDClassifier(loss='log_loss', max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
with open('fake_news_model_linear.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    pickle.dump(vectorizer, model_file)

print("✅ Linear classifier model and vectorizer saved successfully!")

# Step 7: Save the combined dataset to a CSV
combined_csv_path = os.path.join(folder_name, "combined_news_dataset.csv")
df.to_csv(combined_csv_path, index=False)

print(f"✅ Combined dataset saved to: {combined_csv_path}")


Files in the extracted folder:
BuzzFeed_fake_news_content.csv
BuzzFeed_real_news_content.csv
combined_news_dataset.csv
PolitiFact_fake_news_content.csv
PolitiFact_real_news_content.csv
Loading dataset: BuzzFeed_fake_news_content.csv
Columns in BuzzFeed_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_fake_news_content.csv
Columns in PolitiFact_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: BuzzFeed_real_news_content.csv
Columns in BuzzFeed_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_real_news_content.csv
Columns in PolitiFact_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 

In [64]:
                                                     ####FOREST CLASSIFICATION####
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Step 1: Define the folder path
folder_name = "Combined_News_Datasets"  # Folder containing your datasets

# Step 2: Verify the files in the folder
extracted_files = os.listdir(folder_name)
print("Files in the extracted folder:")
for file in extracted_files:
    print(file)

# Step 3: Load and label the datasets
dataset_files = {
    "BuzzFeed_fake_news_content.csv": 1,
    "PolitiFact_fake_news_content.csv": 1,
    "BuzzFeed_real_news_content.csv": 0,
    "PolitiFact_real_news_content.csv": 0,
}

datasets = []
for file, label in dataset_files.items():
    file_path = os.path.join(folder_name, file)
    if os.path.exists(file_path):
        print(f"Loading dataset: {file}")
        df = pd.read_csv(file_path)
        print(f"Columns in {file}: {df.columns.tolist()}")
        
        if 'text' in df.columns:
            df = df[['text']]
            df['label'] = label
            datasets.append(df)
        else:
            print(f"⚠️ 'text' column not found in {file}. Skipping this file.")
    else:
        print(f"⚠️ Dataset not found: {file}")

# Step 4: Combine all datasets into one DataFrame
df = pd.concat(datasets, ignore_index=True)

# Step 5: Handle missing values
df = df.dropna(subset=['text', 'label'])

print("Sample rows from the combined dataset:")
print(df.head())

# Step 6: Prepare data for model training
X = df['text'].astype(str)
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Train the Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
with open('fake_news_model_forest.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    pickle.dump(vectorizer, model_file)

print("✅ Random Forest classifier model and vectorizer saved successfully!")

# Step 7: Save the combined dataset to a CSV
combined_csv_path = os.path.join(folder_name, "combined_news_dataset.csv")
df.to_csv(combined_csv_path, index=False)

print(f"✅ Combined dataset saved to: {combined_csv_path}")


Files in the extracted folder:
BuzzFeed_fake_news_content.csv
BuzzFeed_real_news_content.csv
combined_news_dataset.csv
PolitiFact_fake_news_content.csv
PolitiFact_real_news_content.csv
Loading dataset: BuzzFeed_fake_news_content.csv
Columns in BuzzFeed_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_fake_news_content.csv
Columns in PolitiFact_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: BuzzFeed_real_news_content.csv
Columns in BuzzFeed_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_real_news_content.csv
Columns in PolitiFact_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 

In [70]:
!pip install xgboost




In [72]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Step 1: Define the folder path
folder_name = "Combined_News_Datasets"  # Folder containing your datasets

# Step 2: Verify the files in the folder
extracted_files = os.listdir(folder_name)
print("Files in the extracted folder:")
for file in extracted_files:
    print(file)

# Step 3: Load and label the datasets
dataset_files = {
    "BuzzFeed_fake_news_content.csv": 1,
    "PolitiFact_fake_news_content.csv": 1,
    "BuzzFeed_real_news_content.csv": 0,
    "PolitiFact_real_news_content.csv": 0,
}

datasets = []
for file, label in dataset_files.items():
    file_path = os.path.join(folder_name, file)
    if os.path.exists(file_path):
        print(f"Loading dataset: {file}")
        df = pd.read_csv(file_path)
        print(f"Columns in {file}: {df.columns.tolist()}")
        
        if 'text' in df.columns:
            df = df[['text']]
            df['label'] = label
            datasets.append(df)
        else:
            print(f"⚠️ 'text' column not found in {file}. Skipping this file.")
    else:
        print(f"⚠️ Dataset not found: {file}")

# Step 4: Combine all datasets into one DataFrame
df = pd.concat(datasets, ignore_index=True)

# Step 5: Handle missing values
df = df.dropna(subset=['text', 'label'])

print("Sample rows from the combined dataset:")
print(df.head())

# Step 6: Prepare data for model training
X = df['text'].astype(str)
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Train the XGBoost classifier
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
with open('fake_news_model_xgboost.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    pickle.dump(vectorizer, model_file)

print("✅ XGBoost classifier model and vectorizer saved successfully!")

# Step 7: Save the combined dataset to a CSV
combined_csv_path = os.path.join(folder_name, "combined_news_dataset.csv")
df.to_csv(combined_csv_path, index=False)

print(f"✅ Combined dataset saved to: {combined_csv_path}")


Files in the extracted folder:
BuzzFeed_fake_news_content.csv
BuzzFeed_real_news_content.csv
combined_news_dataset.csv
PolitiFact_fake_news_content.csv
PolitiFact_real_news_content.csv
Loading dataset: BuzzFeed_fake_news_content.csv
Columns in BuzzFeed_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_fake_news_content.csv
Columns in PolitiFact_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: BuzzFeed_real_news_content.csv
Columns in BuzzFeed_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_real_news_content.csv
Columns in PolitiFact_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 

Parameters: { "use_label_encoder" } are not used.




Accuracy: 0.4251968503937008

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.40      0.39        58
           1       0.47      0.45      0.46        69

    accuracy                           0.43       127
   macro avg       0.42      0.42      0.42       127
weighted avg       0.43      0.43      0.43       127

✅ XGBoost classifier model and vectorizer saved successfully!
✅ Combined dataset saved to: Combined_News_Datasets\combined_news_dataset.csv


In [74]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Step 1: Define the folder path
folder_name = "Combined_News_Datasets"  # Folder containing your datasets

# Step 2: Verify the files in the folder
extracted_files = os.listdir(folder_name)
print("Files in the extracted folder:")
for file in extracted_files:
    print(file)

# Step 3: Load and label the datasets
dataset_files = {
    "BuzzFeed_fake_news_content.csv": 1,
    "PolitiFact_fake_news_content.csv": 1,
    "BuzzFeed_real_news_content.csv": 0,
    "PolitiFact_real_news_content.csv": 0,
}

datasets = []
for file, label in dataset_files.items():
    file_path = os.path.join(folder_name, file)
    if os.path.exists(file_path):
        print(f"Loading dataset: {file}")
        df = pd.read_csv(file_path)
        print(f"Columns in {file}: {df.columns.tolist()}")
        
        if 'text' in df.columns:
            df = df[['text']]
            df['label'] = label
            datasets.append(df)
        else:
            print(f"⚠️ 'text' column not found in {file}. Skipping this file.")
    else:
        print(f"⚠️ Dataset not found: {file}")

# Step 4: Combine all datasets into one DataFrame
df = pd.concat(datasets, ignore_index=True)

# Step 5: Handle missing values
df = df.dropna(subset=['text', 'label'])

print("Sample rows from the combined dataset:")
print(df.head())

# Step 6: Prepare data for model training
X = df['text'].astype(str)
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Train the Decision Tree classifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
with open('fake_news_model_decision_tree.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    pickle.dump(vectorizer, model_file)

print("✅ Decision Tree classifier model and vectorizer saved successfully!")

# Step 7: Save the combined dataset to a CSV
combined_csv_path = os.path.join(folder_name, "combined_news_dataset.csv")
df.to_csv(combined_csv_path, index=False)

print(f"✅ Combined dataset saved to: {combined_csv_path}")


Files in the extracted folder:
BuzzFeed_fake_news_content.csv
BuzzFeed_real_news_content.csv
combined_news_dataset.csv
PolitiFact_fake_news_content.csv
PolitiFact_real_news_content.csv
Loading dataset: BuzzFeed_fake_news_content.csv
Columns in BuzzFeed_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_fake_news_content.csv
Columns in PolitiFact_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: BuzzFeed_real_news_content.csv
Columns in BuzzFeed_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_real_news_content.csv
Columns in PolitiFact_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 

In [76]:


import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Step 1: Define the folder path
folder_name = "Combined_News_Datasets"  # Folder containing your datasets

# Step 2: Verify the files in the folder
extracted_files = os.listdir(folder_name)
print("Files in the extracted folder:")
for file in extracted_files:
    print(file)

# Step 3: Load and label the datasets
dataset_files = {
    "BuzzFeed_fake_news_content.csv": 1,
    "PolitiFact_fake_news_content.csv": 1,
    "BuzzFeed_real_news_content.csv": 0,
    "PolitiFact_real_news_content.csv": 0,
}

datasets = []
for file, label in dataset_files.items():
    file_path = os.path.join(folder_name, file)
    if os.path.exists(file_path):
        print(f"Loading dataset: {file}")
        df = pd.read_csv(file_path)
        print(f"Columns in {file}: {df.columns.tolist()}")
        
        if 'text' in df.columns:
            df = df[['text']]
            df['label'] = label
            datasets.append(df)
        else:
            print(f"⚠️ 'text' column not found in {file}. Skipping this file.")
    else:
        print(f"⚠️ Dataset not found: {file}")

# Step 4: Combine all datasets into one DataFrame
df = pd.concat(datasets, ignore_index=True)

# Step 5: Handle missing values
df = df.dropna(subset=['text', 'label'])

print("Sample rows from the combined dataset:")
print(df.head())

# Step 6: Prepare data for model training
X = df['text'].astype(str)
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Train the Decision Tree classifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
with open('fake_news_model_decision_tree.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    pickle.dump(vectorizer, model_file)

print("✅ Decision Tree classifier model and vectorizer saved successfully!")

# Step 7: Save the combined dataset to a CSV
combined_csv_path = os.path.join(folder_name, "combined_news_dataset.csv")
df.to_csv(combined_csv_path, index=False)

print(f"✅ Combined dataset saved to: {combined_csv_path}")


Files in the extracted folder:
BuzzFeed_fake_news_content.csv
BuzzFeed_real_news_content.csv
combined_news_dataset.csv
PolitiFact_fake_news_content.csv
PolitiFact_real_news_content.csv
Loading dataset: BuzzFeed_fake_news_content.csv
Columns in BuzzFeed_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_fake_news_content.csv
Columns in PolitiFact_fake_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: BuzzFeed_real_news_content.csv
Columns in BuzzFeed_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 'source', 'publish_date', 'movies', 'images', 'canonical_link', 'meta_data']
Loading dataset: PolitiFact_real_news_content.csv
Columns in PolitiFact_real_news_content.csv: ['id', 'title', 'text', 'url', 'top_img', 'authors', 