In [107]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
# Read True.csv (expected in the working directory) into a DataFrame.
dfTrue = pd.read_csv(filepath_or_buffer='True.csv')

In [109]:
# Label every row loaded from True.csv with 1 in a new 'True/Fake' column.
# assign() returns a new frame with the column appended, so re-running this
# cell gives the same result.
dfTrue = dfTrue.assign(**{'True/Fake': 1})

In [110]:
# Read Fake.csv (expected in the working directory) into a DataFrame.
dfFake = pd.read_csv(filepath_or_buffer='Fake.csv')

In [111]:
# Label every row loaded from Fake.csv with 0 in a new 'True/Fake' column.
# assign() returns a new frame with the column appended, so re-running this
# cell gives the same result.
dfFake = dfFake.assign(**{'True/Fake': 0})

In [112]:
# Stack the two labelled frames row-wise (dfTrue rows first, then dfFake)
# and renumber the index from 0 so row labels are unique.
df_combined = pd.concat(
    (dfTrue, dfFake),
    axis=0,
    ignore_index=True,
)

In [113]:
# Persist the combined, labelled dataset; index=False keeps the row index
# out of the file.
df_combined.to_csv(path_or_buf='Combined_True_Fake.csv', index=False)

In [114]:
# Display the column labels of the combined frame.
df_combined.columns

Index(['title', 'text', 'subject', 'date', 'True/Fake'], dtype='object')

In [115]:
# Display (number of rows, number of columns) of the combined frame.
df_combined.shape

(44898, 5)

In [116]:
# Count missing values per column.
df_combined.isna().sum()

title        0
text         0
subject      0
date         0
True/Fake    0
dtype: int64

In [117]:
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      44898 non-null  object
 1   text       44898 non-null  object
 2   subject    44898 non-null  object
 3   date       44898 non-null  object
 4   True/Fake  44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [118]:
# Build a single 'content' string per row for analysis:
# "<title> <text> <subject>", joined with single spaces.
# NOTE(review): 'politicsnews' shows up among the top 'True' keywords in the
# output further down, which looks like it comes from the subject column --
# confirm that subject should be part of the analysed text.
df_combined['content'] = (
    df_combined['title']
    .add(' ')
    .add(df_combined['text'])
    .add(' ')
    .add(df_combined['subject'])
)

In [119]:
# TF-IDF vectorization of the 'content' column: the vocabulary is capped at
# 5000 terms and scikit-learn's built-in English stop-word list is removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
# Learn the vocabulary / IDF weights and vectorize every document in one pass.
X_tfidf = tfidf.fit_transform(df_combined['content'])
# Vocabulary terms as a NumPy array so they can be indexed by position arrays.
feature_names = np.array(tfidf.get_feature_names_out())

In [103]:
# Partition the combined frame by label: 1 -> df_true, 0 -> df_fake.
# The slices only carry the columns df_combined has when this cell runs, so
# execute it after the 'content' column has been created.
df_true = df_combined.loc[df_combined['True/Fake'].eq(1)]
df_fake = df_combined.loc[df_combined['True/Fake'].eq(0)]

In [121]:
# Mean TF-IDF weight of every vocabulary term within each class.
# X_tfidf already holds the vectorized row of every document in df_combined
# (same row order), so rows are selected by label here instead of running the
# vectorizer over the text a second time. This also keeps the cell independent
# of when the df_true / df_fake slices were created.
true_mask = (df_combined['True/Fake'] == 1).to_numpy()
fake_mask = (df_combined['True/Fake'] == 0).to_numpy()
# mean(axis=0) collapses the document axis, leaving one score per term.
true_tfidf = X_tfidf[true_mask].mean(axis=0)
fake_tfidf = X_tfidf[fake_mask].mean(axis=0)

In [122]:
# Select the highest-scoring terms for each class.
top_n = 10  # how many keywords to keep per class
# argsort is ascending, so the last top_n positions are the largest mean
# scores (still ordered from smallest to largest at this point).
true_order = np.argsort(true_tfidf.A1)
fake_order = np.argsort(fake_tfidf.A1)
true_keywords = feature_names[true_order[-top_n:]]
fake_keywords = feature_names[fake_order[-top_n:]]

In [123]:
# The keyword arrays are in ascending score order; flip them so the most
# influential term is printed first.
print("Top Keywords for 'True':", np.flip(true_keywords))
print("Top Keywords for 'Fake':", np.flip(fake_keywords))

Top Keywords for 'True': ['said' 'trump' 'reuters' 'president' 'house' 'state' 'government'
 'republican' 'politicsnews' 'washington']
Top Keywords for 'Fake': ['trump' 'video' 'news' 'clinton' 'obama' 'hillary' 'people' 'president'
 'just' 'said']
