In [None]:
pip install pandas nltk scikit-learn


In [7]:
import nltk
nltk.download('stopwords')
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read both source files and tag every row with its class in one step
# (label 1 = true news, label 0 = fake news).
true_data = pd.read_csv('/content/True.csv').assign(label=1)
fake_data = pd.read_csv('/content/Fake.csv').assign(label=0)

# Stack the two labelled frames into a single frame with a fresh 0..n-1 index.
data = pd.concat([true_data, fake_data], ignore_index=True)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Data Preprocessing
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built only once.

    The set is memoized on the function object so that calling this once per
    document does not rebuild the stop-word list each time.
    """
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text, stop_words=None):
    """Normalize one document for bag-of-words vectorization.

    The text is lower-cased, runs of whitespace (including newlines and tabs)
    are collapsed to a single space, every character other than ``a``-``z``
    and space is deleted, and stop words are removed.

    Args:
        text: The raw document. Non-string values (e.g. ``None`` or the NaN
            pandas uses for a missing cell) are treated as an empty document.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The surviving words joined by single spaces (possibly ``''``).
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        # Looked up once per document instead of once per token.
        stop_words = _english_stop_words()
    # Turn newlines/tabs into spaces first; otherwise the character filter
    # below would delete them and glue neighbouring words together.
    text = re.sub(r'\s+', ' ', text.lower())
    text = re.sub(r'[^a-z ]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Store the cleaned text in its own column instead of overwriting data['text'].
# Preprocessing lower-cases the text and strips every non-letter, so keeping
# the raw column is what lets the later punctuation/capitalisation features
# (exclamation, question and capitalised-word counts) see anything to count.
data['clean_text'] = data['text'].apply(preprocess_text)

# Split the data into training and testing sets
X = data['clean_text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

# Text Vectorization using TF-IDF
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer, so both matrices share
# the same (at most 5000-term) vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:

# Feature Engineering
# Character count of each document.
data['text_length'] = data['text'].map(len)

def avg_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Args:
        text: The document to measure.

    Returns:
        The average word length, or ``0.0`` when ``text`` contains no words
        (``np.mean`` of an empty list would otherwise yield NaN and emit a
        RuntimeWarning).
    """
    word_lengths = [len(word) for word in text.split()]
    if not word_lengths:
        return 0.0
    return np.mean(word_lengths)

# Mean word length per document.
data['avg_word_length'] = data['text'].map(avg_word_length)

# NOTE(review): the three counts below only carry signal when data['text'] is
# raw text; if it has already been lower-cased and stripped of punctuation
# upstream, every value will be 0 -- confirm which text this cell receives.
data['exclamation_count'] = data['text'].map(lambda doc: doc.count('!'))
data['question_count'] = data['text'].map(lambda doc: doc.count('?'))
# Number of all-uppercase words (one or more A-Z letters between word boundaries).
data['capitalized_count'] = data['text'].map(
    lambda doc: len(re.findall(r'\b[A-Z]+\b', doc))
)

In [None]:

# Train a model (Random Forest Classifier as an example)
# A forest of 100 trees; random_state is pinned so re-runs are repeatable.
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit on the TF-IDF matrix of the training split and its labels.
model.fit(X_train_tfidf, y_train)

In [None]:

# Predict labels for the held-out TF-IDF features.
y_pred = model.predict(X_test_tfidf)

# Score the predictions against the true test labels: a per-class
# precision/recall/F1 report and the overall accuracy.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}', report, sep='\n')
