<a href="https://colab.research.google.com/github/Val2425/MachineLearningProject-Korea2024/blob/main/MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**1. Import Data**

In [1]:
#Import dataset from Kaggle
from google.colab import files

# Upload Kaggle API key file (kaggle.json)
uploaded = files.upload()
del uploaded

Saving kaggle.json to kaggle.json


In [2]:
# Configure Kaggle API credentials
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset using Kaggle API
import kagglehub
path = kagglehub.dataset_download("bhavikjikadara/fake-news-detection")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/bhavikjikadara/fake-news-detection?dataset_version_number=1...


100%|██████████| 41.0M/41.0M [00:00<00:00, 114MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/bhavikjikadara/fake-news-detection/versions/1


In [3]:
cd /root/.cache/kagglehub/datasets/bhavikjikadara/fake-news-detection/versions/1

/root/.cache/kagglehub/datasets/bhavikjikadara/fake-news-detection/versions/1


In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load true and fake news datasets
true_df = pd.read_csv('true.csv')
fake_df = pd.read_csv('fake.csv')

In [19]:
# Replacing abbreviation to have a uniform format for dates
mois = {
    'Jan ': 'January ',
    'Feb ': 'February ',
    'Mar ': 'March ',
    'Apr ': 'April ',
    'May ': 'May ',
    'Jun ': 'June ',
    'Jul ': 'July ',
    'Aug ': 'August ',
    'Sep ': 'September ',
    'Oct ': 'October ',
    'Nov ': 'November ',
    'Dec ': 'December '
}
fake_df['date'] = fake_df['date'].replace(mois, regex=True)

# Converting dates to Date variables
true_df['date'] = pd.to_datetime(true_df['date'], errors='coerce')
fake_df['date'] = pd.to_datetime(fake_df['date'], errors='coerce')

print(true_df['date'].sample(10))
print(fake_df['date'].sample(10))

14813   2017-11-14
3499    2017-05-31
5993    2017-01-26
13391   2017-11-30
7908    2016-10-07
5579    2017-02-08
10011   2016-04-08
20339   2017-09-12
9353    2016-05-28
10112   2016-03-31
Name: date, dtype: datetime64[ns]
15101   2015-10-11
10015   2017-08-29
23029   2017-01-27
10391   2017-07-16
844     2017-07-16
19646   2016-11-11
12173   2016-12-11
12588   2016-10-28
12048   2016-12-28
12137   2016-12-15
Name: date, dtype: datetime64[ns]


In [20]:
# Add a label column: 1 for true news, 0 for fake news
true_df['label'] = 1
fake_df['label'] = 0

# Combine both datasets into a single DataFrame
df = pd.concat([true_df, fake_df], ignore_index=True)

#**2. Date column**

#**3. Subject column**

In [21]:
# Standardize the 'subject' column across datasets
subject_mapping = {
    'News': 'General News',
    'US_News': 'General News',
    'worldnews': 'General News',
    'politics': 'Politics',
    'politicsNews': 'Politics',
    'left-news': 'Politics',
    'Middle-east': 'General News',
    'Government News': 'Politics'
}
df['subject'] = df['subject'].map(subject_mapping)

# Drop the 'subject' column because it is not relevant for the analysis
df = df.drop(columns=['subject'])

#**4. Feature engineering**

In [22]:
# Function to calculate uppercase letter percentage in text with rounding to 2 decimal places
def calculate_uppercase_proportion(text):
    if len(text) == 0:
        return 0
    uppercase_count = sum(1 for char in text if char.isupper())
    percentage = (uppercase_count / len(text))
    return round(percentage, 3)  # Round to two decimal places

# Add a column for uppercase percentage in titles
df['uppercase_proportion'] = df['title'].apply(calculate_uppercase_proportion)

#**5. Title and Text columns**

In [23]:
df['text'].iloc[0]

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [25]:
# Remove duplicates from the combined DataFrame
df.drop_duplicates(keep='first', inplace=True)

In [27]:
# Delete all text until "(Reuters) - " in the dataset
df['text'] = df['text'].str.replace(r'^.*\(Reuters\) - ', '', regex=True)

In [29]:
# Convert to lowercase
df['title'] = df['title'].str.lower()
df['text'] = df['text'].str.lower()

In [31]:
# Replace only '.' with '' in the 'text' column
df['title'] = df['title'].str.replace(r'\.', '', regex=True)
df['text'] = df['text'].str.replace(r'\.', '', regex=True)

In [33]:
# Ensure no NaN values in 'title' and 'text' before removing special characters
df['title'] = df['title'].fillna('').str.replace(r'[^a-zA-Z0-9\s]', ' ', regex=True)
df['text'] = df['text'].fillna('').str.replace(r'[^a-zA-Z0-9\s]', ' ', regex=True)

In [35]:
# Remove rows where 'title' or 'text' are empty strings
df = df[(df['title'] != '') & (df['text'] != '')]

In [18]:
# Tokenization
import nltk
from nltk import word_tokenize
nltk.download('punkt_tab')

df['title'] = df['title'].apply(word_tokenize)
df['text'] = df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [19]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
df['title'] = df['title'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [20]:
# Removing stop words
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
df['title'] = df['title'].apply(lambda x: [word for word in x if word not in stop_words])
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
vectorizer = TfidfVectorizer()
X_title = vectorizer.fit_transform(df['title'].apply(' '.join))
X_text = vectorizer.fit_transform(df['text'].apply(' '.join))

#**6. Training**

In [22]:
# Separate X and Y
X = hstack((X_title, X_text))
Y = df['label']

In [None]:
# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [23]:


from sklearn.ensemble import RandomForestClassifier
# Train a classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate
accuracy = clf.score(X_test, y_test)
print("Model accuracy:", accuracy)


Model accuracy: 0.9689337765277423


In [24]:
from sklearn.metrics import f1_score
# Make predictions
y_pred = clf.predict(X_test)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='binary')  # Use 'binary' for binary classification
# or if you have multi-class classification, use 'macro' or 'weighted'
# f1 = f1_score(y_test, y_pred, average='macro')
# f1 = f1_score(y_test, y_pred, average='weighted')

print("Model accuracy:", clf.score(X_test, y_test))
print("F1 Score:", f1)

Model accuracy: 0.9689337765277423
F1 Score: 0.9716816221885561


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train an SVM classifier
svm_clf = SVC()  # You can specify kernel='linear', 'rbf', etc., based on your needs
svm_clf.fit(X_train, y_train)

# Make predictions
y_pred = svm_clf.predict(X_test)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='binary')

# Evaluate and print results
accuracy = svm_clf.score(X_test, y_test)
print("Model accuracy:", accuracy)
print("F1 Score:", f1)


Model accuracy: 0.989516747634876
F1 Score: 0.9903210576015109
