<a href="https://colab.research.google.com/github/Val2425/MachineLearningProject-Korea2024/blob/main/MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**1. Import Data**

In [2]:
#Import dataset from Kaggle
from google.colab import files

# Upload Kaggle API key file (kaggle.json)
uploaded = files.upload()
del uploaded

Saving kaggle.json to kaggle.json


In [3]:
# Configure Kaggle API credentials
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset using Kaggle API
import kagglehub
path = kagglehub.dataset_download("bhavikjikadara/fake-news-detection")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/bhavikjikadara/fake-news-detection?dataset_version_number=1...


100%|██████████| 41.0M/41.0M [00:00<00:00, 112MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/bhavikjikadara/fake-news-detection/versions/1


In [4]:
cd /root/.cache/kagglehub/datasets/bhavikjikadara/fake-news-detection/versions/1

/root/.cache/kagglehub/datasets/bhavikjikadara/fake-news-detection/versions/1


In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load true and fake news datasets
true_df = pd.read_csv('true.csv')
fake_df = pd.read_csv('fake.csv')

In [6]:
# Replacing abbreviation to have a uniform format for dates
mois = {
    'Jan ': 'January ',
    'Feb ': 'February ',
    'Mar ': 'March ',
    'Apr ': 'April ',
    'May ': 'May ',
    'Jun ': 'June ',
    'Jul ': 'July ',
    'Aug ': 'August ',
    'Sep ': 'September ',
    'Oct ': 'October ',
    'Nov ': 'November ',
    'Dec ': 'December '
}
fake_df['date'] = fake_df['date'].replace(mois, regex=True)

# Converting dates to Date variables
true_df['date'] = pd.to_datetime(true_df['date'], errors='coerce')
fake_df['date'] = pd.to_datetime(fake_df['date'], errors='coerce')

print(true_df['date'].sample(10))
print(fake_df['date'].sample(10))

11233   2016-01-14
8200    2016-09-12
16975   2017-10-19
13409   2017-11-30
12510   2017-12-11
1209    2017-10-16
17725   2017-10-11
12115   2017-12-16
10020   2016-04-01
823     2017-11-05
Name: date, dtype: datetime64[ns]
11785   2017-01-29
9215    2017-12-12
5383    2016-07-21
16105   2017-05-10
16504   2016-07-10
18383   2017-07-05
21449   2015-09-21
20835   2016-03-21
6462    2016-05-09
9760    2017-10-03
Name: date, dtype: datetime64[ns]


In [7]:
# Add a label column: 1 for true news, 0 for fake news
true_df['label'] = 1
fake_df['label'] = 0

# Combine both datasets into a single DataFrame
df = pd.concat([true_df, fake_df], ignore_index=True)

#**2. Date column**

#**3. Subject column**

In [8]:
# Standardize the 'subject' column across datasets
subject_mapping = {
    'News': 'General News',
    'US_News': 'General News',
    'worldnews': 'General News',
    'politics': 'Politics',
    'politicsNews': 'Politics',
    'left-news': 'Politics',
    'Middle-east': 'General News',
    'Government News': 'Politics'
}
df['subject'] = df['subject'].map(subject_mapping)

# Drop the 'subject' column because it is not relevant for the analysis
df = df.drop(columns=['subject'])

#**4. Feature engineering**

In [9]:
# Function to calculate uppercase letter percentage in text with rounding to 2 decimal places
def calculate_uppercase_proportion(text):
    if len(text) == 0:
        return 0
    uppercase_count = sum(1 for char in text if char.isupper())
    percentage = (uppercase_count / len(text))
    return round(percentage, 3)  # Round to two decimal places

# Add a column for uppercase percentage in titles
df['uppercase_proportion'] = df['title'].apply(calculate_uppercase_proportion)

In [10]:
# Given the different features of the dataset, we will not be using this feature to train the dataset. It is more a tool to gain some insight about the data

#**5. Title and Text columns**

In [11]:
df['text'].iloc[0]

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [12]:
# Remove duplicates from the combined DataFrame
df.drop_duplicates(keep='first', inplace=True)

In [13]:
# Delete all text until "(Reuters) - " in the dataset
df['text'] = df['text'].str.replace(r'^.*\(Reuters\) - ', '', regex=True)

In [14]:
# Convert to lowercase
df['title'] = df['title'].str.lower()
df['text'] = df['text'].str.lower()

In [15]:
# Replace only '.' with '' in the 'text' column
df['title'] = df['title'].str.replace(r'\.', '', regex=True)
df['text'] = df['text'].str.replace(r'\.', '', regex=True)

In [16]:
# Ensure no NaN values in 'title' and 'text' before removing special characters
df['title'] = df['title'].fillna('').str.replace(r'[^a-zA-Z0-9\s]', ' ', regex=True)
df['text'] = df['text'].fillna('').str.replace(r'[^a-zA-Z0-9\s]', ' ', regex=True)

In [17]:
# Remove rows where 'title' or 'text' are empty strings
df = df[(df['title'] != '') & (df['text'] != '')]

In [18]:
# Tokenization
import nltk
from nltk import word_tokenize
nltk.download('punkt_tab')

df['title'] = df['title'].apply(word_tokenize)
df['text'] = df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [19]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
df['title'] = df['title'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [20]:
# Removing stop words
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
df['title'] = df['title'].apply(lambda x: [word for word in x if word not in stop_words])
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
vectorizer = TfidfVectorizer()
X_title = vectorizer.fit_transform(df['title'].apply(' '.join))
X_text = vectorizer.fit_transform(df['text'].apply(' '.join))

#**6. Training**

In [22]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, make_scorer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
from tqdm import tqdm
import time
from sklearn.metrics import classification_report, f1_score

In [23]:
# Separate X and Y
X = hstack((X_title, X_text))
Y = df['label']

In [24]:
# Split the data into train + validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

In [25]:
#param_grid = {
#    'n_estimators': [50, 100, 200, 300],          # Number of trees
#    'max_depth': [10, 20, 30, None],              # Maximum depth of the tree
#    'min_samples_split': [2, 5, 10],              # Minimum samples required to split a node
#    'min_samples_leaf': [1, 2, 4],                # Minimum samples required at a leaf node
#    'bootstrap': [True, False],                   # Use bootstrap samples
#}
#
## Initialize the Random Forest Classifier
#rf = RandomForestClassifier(random_state=42)
#
## Perform Randomized Search with cross-validation
#random_search = RandomizedSearchCV(
#    estimator=rf,
#    param_distributions=param_grid,
#    n_iter=50,                                    # Number of parameter combinations to try
#    cv=3,                                         # 3-fold cross-validation
#    n_jobs=-1,                                    # Use all available CPU cores
#    verbose=2,                                    # Print progress
#    random_state=42,                              # Ensure reproducibility
#    scoring='f1',                                 # Optimize for F1-score
#)
#
## Fit the Randomized Search model
#random_search.fit(X_train, y_train)
#
## Get the best parameters and model
#best_params = random_search.best_params_
#best_rf = random_search.best_estimator_
#
#print("Best parameters found: ", best_params)
#
## Evaluate the tuned model on the test set
#y_pred = best_rf.predict(X_test)
#
## Calculate metrics
#accuracy = accuracy_score(y_test, y_pred)
#f1 = f1_score(y_test, y_pred, average='binary')
#
#print("Tuned Model Accuracy:", accuracy)
#print("Tuned Model F1 Score:", f1)
#print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [26]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import recall_score, f1_score, classification_report

# Define the parameter grid for SVM tuning
param_grid = {
    'C': [0.1, 1, 10, 100],                # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],   # Kernel types
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],  # Kernel coefficient
    'degree': [2, 3, 4]                    # Degree of polynomial kernel (for 'poly')
}

# Instantiate the SVM model
svm_clf = SVC()

# Set up the RandomizedSearchCV for hyperparameter tuning
svm_tuner = RandomizedSearchCV(
    estimator=svm_clf,
    param_distributions=param_grid,
    n_iter=15,              # Number of parameter combinations to try
    cv=3,                   # 3-fold cross-validation
    scoring='f1',           # Optimize for F1 score
    verbose=2,              # Verbose output
    random_state=42,
    n_jobs=-1               # Use all available cores
)

# Fit the tuner on the training data
svm_tuner.fit(X_train, y_train)

# Display the best parameters
print("Best parameters found:", svm_tuner.best_params_)

# Evaluate the best model on the validation set
best_svm = svm_tuner.best_estimator_
y_val_pred = best_svm.predict(X_val)

val_recall = recall_score(y_val, y_val_pred, pos_label=0)
val_f1 = f1_score(y_val, y_val_pred, average='binary')
val_accuracy = best_svm.score(X_val, y_val)

print("\nValidation Results:")
print(f"Recall (Class 0): {val_recall}")
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation F1 Score: {val_f1}")
print("\nValidation Set Classification Report:")
print(classification_report(y_val, y_val_pred))

Fitting 3 folds for each of 15 candidates, totalling 45 fits




Best parameters found: {'kernel': 'rbf', 'gamma': 0.1, 'degree': 3, 'C': 10}

Validation Results:
Recall (Class 0): 0.9853780313837375
Validation Accuracy: 0.9910514541387024
Validation F1 Score: 0.9919238534756274

Validation Set Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2804
           1       0.99      1.00      0.99      3454

    accuracy                           0.99      6258
   macro avg       0.99      0.99      0.99      6258
weighted avg       0.99      0.99      0.99      6258



In [27]:
from sklearn.metrics import confusion_matrix

# Final evaluation on the test set
y_test_pred = best_svm.predict(X_test)

test_recall = recall_score(y_test, y_test_pred, pos_label=0)
test_f1 = f1_score(y_test, y_test_pred, average='binary')
test_accuracy = best_svm.score(X_test, y_test)
test_cm = confusion_matrix(y_test, y_test_pred)

print("\nTest Results:")
print(f"Recall (Class 0): {test_recall}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")
print("\nTest Set Classification Report:")
print(classification_report(y_test, y_test_pred))
print("\nConfusion Matrix:")
print(test_cm)


Test Results:
Recall (Class 0): 0.9855715871254163
Test Accuracy: 0.9897724367169521
Test F1 Score: 0.9905437352245863

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3604
           1       0.99      0.99      0.99      4218

    accuracy                           0.99      7822
   macro avg       0.99      0.99      0.99      7822
weighted avg       0.99      0.99      0.99      7822


Confusion Matrix:
[[3552   52]
 [  28 4190]]


In [28]:
import joblib

# Save the model to a file
joblib.dump(best_svm, 'best_svm_model.pkl')

# Load the model (example)
loaded_model = joblib.load('best_svm_model.pkl')