In [1]:
import pandas as pd

pol_real =pd.read_csv("politifact_real.csv")
pol_fake =pd.read_csv("politifact_fake.csv")
gossip_fake =pd.read_csv("gossipcop_fake.csv")
gossip_real =pd.read_csv("gossipcop_real.csv")

In [4]:
# now we add labels

pol_real["label"] ="real"
pol_fake["label"] ="fake"
gossip_fake["label"] ="fake"
gossip_real["label"] ="real"

#now we will combine dataset

combine_data =pd.concat([pol_real, pol_fake, gossip_fake, gossip_real], ignore_index =True) 

In [5]:
print(combine_data.info())
#print(combine_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23196 entries, 0 to 23195
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         23196 non-null  object
 1   news_url   22866 non-null  object
 2   title      23196 non-null  object
 3   tweet_ids  21695 non-null  object
 4   label      23196 non-null  object
dtypes: object(5)
memory usage: 906.2+ KB
None


In [6]:
print(combine_data['label'].value_counts())   # checking the count of real and fake label in the data

label
real    17441
fake     5755
Name: count, dtype: int64


In [7]:
print(combine_data.columns)  # checking the coloumns in the data

Index(['id', 'news_url', 'title', 'tweet_ids', 'label'], dtype='object')


In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt') 
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\12348\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\12348\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
import re   # here we have imported a regular expression library
from nltk.corpus import stopwords  # stopwords from natural language tool kit
stop_words =set(stopwords.words("english"))    #load set of stop_words and convert into set

def preprocess_text(text):
    text =re.sub(r'[^\w\s]', '', text.lower())    # Remove special characters
    tokens =[word for word in text.split() if word not in stop_words]       #filter out stopwords
    return " ".join(tokens)

combine_data["cleaned_title"] =combine_data["title"].apply(preprocess_text)    # create new coloumn for preprocessed data

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer      
vectorizer =TfidfVectorizer(max_features =5000)
tfid =vectorizer.fit_transform(combine_data["cleaned_title"]) # converted clean text into numerical features

In [11]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

combine_data["entities"] = combine_data["cleaned_title"].apply(extract_entities)


In [12]:
def count_entity_types(entities, entity_type):
    return sum(1 for _, label in entities if label == entity_type)

combine_data["num_persons"] = combine_data["entities"].apply(lambda x: count_entity_types(x, "PERSON"))
combine_data["num_orgs"] = combine_data["entities"].apply(lambda x: count_entity_types(x, "ORG"))
combine_data["num_places"] = combine_data["entities"].apply(lambda x: count_entity_types(x, "GPE"))


In [13]:
def unique_entities(entities):
    return len(set([text for text, _ in entities]))

combine_data["unique_entities"] = combine_data["entities"].apply(unique_entities)


In [14]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target column
combine_data["label"] = label_encoder.fit_transform(combine_data["label"])


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(combine_data[["num_persons", "num_orgs", "num_places", "unique_entities"]])
combine_data[["num_persons", "num_orgs", "num_places", "unique_entities"]] = scaled_features


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split data
features = combine_data[["num_persons", "num_orgs", "num_places", "unique_entities"]]
target = combine_data["label"]  # Replace with your actual target column
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))


MSE: 0.18213751911344844
