In [64]:
import numpy as np
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
import pickle


In [None]:
# Load the data
data = pd.read_csv(r'c:\Users\User\Downloads\fake_news_dataset.csv')
print("Initial data shape:", data.shape)
print("\nNull values in each column:")
print(data.isnull().sum())

# Only drop rows where both title and text are null
data = data.dropna(subset=['title', 'text'], how='all')
print("\nData shape after dropping rows with all null text:", data.shape)

# Fill remaining NaN values with empty strings
data['title'] = data['title'].fillna('')
data['text'] = data['text'].fillna('')

Initial data shape: (62769, 7)

Null values in each column:
id            62719
title           518
text              1
label             0
Unnamed: 0       50
subject       62769
date          62769
dtype: int64

Data shape after dropping rows with all null text: (62769, 7)


In [66]:
if data['label'].dtype == 'object':
    data['label'] = data['label'].map({'FAKE': 0, 'REAL': 1, 'fake': 0, 'real': 1})

In [67]:
data['content'] = data['title'] + " " + data['text']

In [68]:
# Step 6: Define a text cleaning function
def clean_text(text):
    text = text.lower()                             # Convert to lowercase
    text = re.sub('\[.*?\]', '', text)              # Remove text in brackets
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub('\w*\d\w*', '', text)             # Remove words with numbers
    text = re.sub('https?://\S+|www\.\S+', '', text) # Remove URLs
    return text


  text = re.sub('\[.*?\]', '', text)              # Remove text in brackets
  text = re.sub('\w*\d\w*', '', text)             # Remove words with numbers
  text = re.sub('https?://\S+|www\.\S+', '', text) # Remove URLs


In [69]:
data['content'] = data['content'].apply(clean_text)

In [None]:
# Prepare features and target
# Drop rows with missing labels
if data['label'].isnull().any():
    print("Dropping rows with missing labels...")
    data = data.dropna(subset=['label'])

X = data['content']
y = data['label']

print("Features shape:", X.shape)
print("Labels shape:", y.shape)
print("\nUnique label values:", y.unique())

Features shape: (62769,)
Labels shape: (62769,)

Unique label values: [ 1.  0. nan]


In [71]:
# Clean and prepare the label data
print("Label value counts before cleaning:")
print(data['label'].value_counts(dropna=False))

# Drop rows with NaN labels
data = data.dropna(subset=['label'])
print("\nLabel value counts after cleaning:")
print(data['label'].value_counts())

Label value counts before cleaning:
label
NaN    62719
1.0       25
0.0       25
Name: count, dtype: int64

Label value counts after cleaning:
label
1.0    25
0.0    25
Name: count, dtype: int64


In [72]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_vectorized = vectorizer.fit_transform(X)

In [73]:
# Check the content before vectorization
print("Data shape:", data.shape)
print("\nFirst few rows of content:")
print(data['content'].head())
print("\nSample lengths of content:")
print(data['content'].str.len().head())

Data shape: (50, 8)

First few rows of content:
0    latest developments in technology sector confi...
1    latest developments in health sector confirmed...
2    politics scandal sparks controversy online rum...
3    technology scandal sparks controversy online a...
4    latest developments in sports sector confirmed...
Name: content, dtype: object

Sample lengths of content:
0    341
1    335
2    333
3    344
4    328
Name: content, dtype: int64


In [74]:
# Step 10: Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)


In [75]:
model = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

ValueError: Input y contains NaN.

In [None]:
y_pred_log = model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))

Logistic Regression Accuracy: 0.9487016090489088


In [None]:
print("\nClassification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_log))


Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      6971
           1       0.94      0.95      0.94      5574
        FAKE       1.00      1.00      1.00         4
        REAL       1.00      1.00      1.00         5

    accuracy                           0.95     12554
   macro avg       0.97      0.97      0.97     12554
weighted avg       0.95      0.95      0.95     12554



In [None]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("Model and Vectorizer saved successfully!")

âœ… Model and Vectorizer saved successfully!


In [1]:
x = [1, 2, 3]
y = x
y.append(4)
print(x)

[1, 2, 3, 4]
