In [10]:
import sys
!{sys.executable} -m pip install textblob

Collecting textblob
  Using cached textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Using cached textblob-0.19.0-py3-none-any.whl (624 kB)
Installing collected packages: textblob
Successfully installed textblob-0.19.0


In [11]:
# ---------------------------
# Imports
# ---------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from collections import Counter
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from textblob import TextBlob

In [13]:
# Complete Python EDA script for your Fake News dataset
# Replace DATA_PATH with your CSV file path if needed.
# DEV NOTE: The path below was provided in the conversation history; replace with actual CSV path.
DATA_PATH = "F:/PG-DBDA-2025/Project_Upload/FAKE-NEWS-CLASSIFIER/Data_using/bharatfakenewskosh/xlsx"  # <-- replace with "your_dataset.csv"

# ---------------------------
# Imports
# ---------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from collections import Counter
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from textblob import TextBlob

# If nltk resources not downloaded, uncomment:
# nltk.download('stopwords')
# nltk.download('punkt')

# ---------------------------
# Utility functions
# ---------------------------
def clean_publish_date(text):
    """
    Normalize publish_date values like "9th July 2022" to a parseable date.
    Returns pd.Timestamp or NaT.
    """
    if pd.isna(text):
        return pd.NaT
    # Remove ordinal suffixes: 1st, 2nd, 3rd, 4th...
    text = str(text).strip()
    text = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', text, flags=re.IGNORECASE)
    # Try parsing common formats
    for fmt in ["%d %B %Y", "%d %b %Y", "%Y-%m-%d", "%d/%m/%Y"]:
        try:
            return pd.to_datetime(text, format=fmt)
        except Exception:
            continue
    # Fallback - try pandas parser
    try:
        return pd.to_datetime(text, dayfirst=True, errors='coerce')
    except Exception:
        return pd.NaT

def text_length_stats(series):
    """Return DataFrame with word count, char count stats for a text series."""
    s = series.fillna("")
    words = s.map(lambda x: len(str(x).split()))
    chars = s.map(lambda x: len(str(x)))
    return pd.Series({
        'count': s.shape[0],
        'non_null': s.map(bool).sum(),
        'mean_words': words.mean(),
        'median_words': words.median(),
        'std_words': words.std(),
        'mean_chars': chars.mean(),
    })

def top_n_words(series, n=30, language='english'):
    """Return top n words excluding stopwords."""
    text = " ".join(series.dropna().astype(str).tolist()).lower()
    tokens = nltk.word_tokenize(text)
    stops = set(stopwords.words(language))
    words = [w for w in tokens if w.isalpha() and w not in stops]
    return Counter(words).most_common(n)

def top_ngrams(series, ngram=2, top_n=20):
    """Return top n n-grams from a text series."""
    text = " ".join(series.dropna().astype(str).tolist()).lower()
    tokens = [t for t in nltk.word_tokenize(text) if t.isalpha()]
    ng = ngrams(tokens, ngram)
    return Counter(ng).most_common(top_n)

def simple_sentiment(text):
    """Return polarity & subjectivity using TextBlob (works on English)"""
    if pd.isna(text) or str(text).strip()=="":
        return pd.Series([np.nan, np.nan])
    tb = TextBlob(str(text))
    return pd.Series([tb.sentiment.polarity, tb.sentiment.subjectivity])

# ---------------------------
# Load dataset
# ---------------------------
print("Loading dataset:", DATA_PATH)
# attempt to load CSV; if path is image or wrong, user must replace with CSV.
try:
    df = pd.read_csv(DATA_PATH)
except Exception as e:
    print("Could not read as CSV. Please set DATA_PATH to your CSV file path. Error:", e)
    # create an empty df skeleton with columns from conversation for guidance
    cols = ["id","Author_Name","Fact_Check_Source","Source_Type","Statement","Eng_Trans_Statement",
            "News Body","Eng_Trans_News_Body","Media_Link","Publish_Date","Fact_Check_Link",
            "News_Category","Language","Region","Platform","Text","Video","Image","Label"]
    df = pd.DataFrame(columns=cols)
    print("Created empty skeleton DataFrame with expected columns. Replace DATA_PATH and re-run.")
    # Stop further execution to avoid misleading outputs
    raise SystemExit("Stop - set DATA_PATH to real CSV file and re-run.")

print("Shape:", df.shape)
display(df.head())

# ---------------------------
# PHASE 1: Basic overview
# ---------------------------
print("\n== Basic info ==")
print(df.info())

print("\n== Summary statistics (object cols) ==")
print(df.describe(include='object').T)

# Missing values per column
miss = df.isna().mean().sort_values(ascending=False)
print("\nMissing value fraction per column:\n", miss)

# Visual missing heatmap (save figure)
plt.figure(figsize=(12,4))
sns.heatmap(df.isna(), cbar=False)
plt.title("Missing data heatmap")
plt.tight_layout()
plt.savefig("missing_heatmap.png", dpi=150)
plt.close()

# Duplicates
dup_count = df.duplicated().sum()
print(f"\nDuplicate rows (full-row): {dup_count}")
# Check duplicates based on text fields (Statement + News Body)
dup_text = df.duplicated(subset=["Statement","News Body"]).sum() if set(["Statement","News Body"]).issubset(df.columns) else 0
print(f"Duplicate Statement+News Body: {dup_text}")

# ---------------------------
# PHASE 2: Data types & conversions
# ---------------------------
# Lower-case columns for convenience
df.columns = [c.strip() for c in df.columns]

# Convert Label to numeric if possible
if "Label" in df.columns:
    try:
        df["Label"] = pd.to_numeric(df["Label"], errors='coerce').astype('Int64')
    except Exception:
        pass

# Parse Publish_Date if present
if "Publish_Date" in df.columns:
    df["publish_date_parsed"] = df["Publish_Date"].apply(clean_publish_date)
    print("\nParsed publish_date sample:")
    print(df[["Publish_Date","publish_date_parsed"]].head())
    # Extract year/month/day
    df["pub_year"] = pd.DatetimeIndex(df["publish_date_parsed"]).year
    df["pub_month"] = pd.DatetimeIndex(df["publish_date_parsed"]).month
    df["pub_dayofweek"] = pd.DatetimeIndex(df["publish_date_parsed"]).day_name()
    # Plot monthly/yearly distribution
    plt.figure(figsize=(10,4))
    df["pub_month"].value_counts().sort_index().plot(kind='bar')
    plt.title("Count by publish month")
    plt.tight_layout()
    plt.savefig("publish_month_count.png", dpi=150)
    plt.close()

# ---------------------------
# PHASE 3: Categorical EDA
# ---------------------------
cat_cols = ["Author_Name","Fact_Check_Source","Source_Type","News_Category","Language","Region","Platform"]
cat_cols = [c for c in cat_cols if c in df.columns]

for c in cat_cols:
    print(f"\n-- Top values for {c} --")
    print(df[c].value_counts().head(15))
    # Save barplot for top 10
    plt.figure(figsize=(8,4))
    df[c].value_counts().head(10).plot(kind='barh')
    plt.title(f"Top 10 {c}")
    plt.tight_layout()
    plt.savefig(f"top_{c}.png", dpi=150)
    plt.close()

# Label distribution
if "Label" in df.columns:
    print("\nLabel distribution:")
    print(df["Label"].value_counts(dropna=False))
    plt.figure(figsize=(5,4))
    df["Label"].value_counts().plot(kind='bar')
    plt.title("Label distribution (False/True or 0/1)")
    plt.tight_layout()
    plt.savefig("label_distribution.png", dpi=150)
    plt.close()

# Media type counts (Text, Video, Image)
media_cols = [c for c in ["Text","Video","Image"] if c in df.columns]
if media_cols:
    print("\nMedia columns counts (non-null true/false):")
    for c in media_cols:
        # normalize common boolean-like values
        vc = df[c].fillna("no").astype(str).str.lower().value_counts()
        print(f"{c}:\n{vc}")
    # plot
    plt.figure(figsize=(6,4))
    df[media_cols].apply(lambda col: col.fillna("no").astype(str).str.lower().map(lambda x: 1 if x in ['yes','true','1','y'] else 0)).sum().plot(kind='bar')
    plt.title("Count of media types present")
    plt.tight_layout()
    plt.savefig("media_counts.png", dpi=150)
    plt.close()

# ---------------------------
# PHASE 4: Text EDA
# ---------------------------
# Choose English-translated columns when available for NLP parts
text_cols_candidates = ["Eng_Trans_Statement","Eng_Trans_News_Body","Statement","News Body"]
text_cols = [c for c in text_cols_candidates if c in df.columns]

print("\nText columns found:", text_cols)

# Basic length stats per text column
length_stats = {}
for c in text_cols:
    length_stats[c] = text_length_stats(df[c])
length_df = pd.DataFrame(length_stats).T
print("\nText length statistics:\n", length_df)
length_df.to_csv("text_length_stats.csv")

# Compare length by Label (if Label exists)
if "Label" in df.columns:
    for c in text_cols:
        plt.figure(figsize=(8,4))
        df_nonnull = df[[c,"Label"]].dropna(subset=[c])
        # create a words column
        df_nonnull["word_count"] = df_nonnull[c].astype(str).map(lambda x: len(x.split()))
        sns.boxplot(x="Label", y="word_count", data=df_nonnull)
        plt.title(f"Word count by Label for {c}")
        plt.tight_layout()
        plt.savefig(f"wordcount_by_label_{c}.png", dpi=150)
        plt.close()

# Top words and wordclouds
stop_eng = set(stopwords.words('english'))
for c in text_cols:
    # Top words
    top_words = top_n_words(df[c].astype(str).fillna(""), n=50, language='english')
    print(f"\nTop words for {c}:\n", top_words[:25])
    # WordCloud
    wc = WordCloud(width=800, height=400, collocations=False,
                   stopwords=stop_eng).generate(" ".join(df[c].dropna().astype(str).tolist()))
    plt.figure(figsize=(10,5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"WordCloud: {c}")
    plt.tight_layout()
    plt.savefig(f"wordcloud_{c}.png", dpi=150)
    plt.close()
    # Top bigrams/trigrams
    print("Top bigrams:", top_ngrams(df[c].astype(str).fillna(""), ngram=2, top_n=20)[:10])
    print("Top trigrams:", top_ngrams(df[c].astype(str).fillna(""), ngram=3, top_n=10)[:8])

# ---------------------------
# PHASE 5: Sentiment analysis (on English-translated fields)
# ---------------------------
# We'll use Eng_Trans_News_Body if present else Eng_Trans_Statement
sent_col = None
for c in ["Eng_Trans_News_Body","Eng_Trans_Statement","News Body","Statement"]:
    if c in df.columns:
        sent_col = c
        break

if sent_col:
    print(f"\nPerforming simple sentiment analysis on {sent_col} using TextBlob (polarity/subjectivity).")
    df[[f"{sent_col}_polarity", f"{sent_col}_subjectivity"]] = df[sent_col].fillna("").astype(str).apply(simple_sentiment)
    # Save sentiment summary
    print(df[[f"{sent_col}_polarity", f"{sent_col}_subjectivity"]].describe())
    df[[f"{sent_col}_polarity", f"{sent_col}_subjectivity"]].to_csv("sentiment_summary.csv")
    # Compare polarity by label if available
    if "Label" in df.columns:
        plt.figure(figsize=(8,4))
        sns.boxplot(x="Label", y=f"{sent_col}_polarity", data=df.dropna(subset=[f"{sent_col}_polarity","Label"]))
        plt.title("Polarity by Label")
        plt.tight_layout()
        plt.savefig("polarity_by_label.png", dpi=150)
        plt.close()

# ---------------------------
# PHASE 6: Cross-analysis & correlations
# ---------------------------
# Example: Label vs News_Category
if "Label" in df.columns and "News_Category" in df.columns:
    ct = pd.crosstab(df["News_Category"], df["Label"], normalize='index')
    ct.to_csv("category_label_crosstab.csv")
    print("\nCategory vs Label crosstab (percent by category):\n", ct.head())

# Correlation between numeric-ish features: word counts, sentiment, etc.
numeric_feats = []
if sent_col:
    numeric_feats += [f"{sent_col}_polarity", f"{sent_col}_subjectivity"]
for c in text_cols:
    if c in df.columns:
        df[f"{c}_wordcount"] = df[c].fillna("").astype(str).map(lambda x: len(x.split()))
        numeric_feats.append(f"{c}_wordcount")

# Add binary media indicators
for m in media_cols:
    df[f"{m}_flag"] = df[m].fillna("no").astype(str).str.lower().map(lambda x: 1 if x in ['yes','true','1','y'] else 0)
    numeric_feats.append(f"{m}_flag")

# Label as numeric
if "Label" in df.columns:
    try:
        df["label_numeric"] = pd.to_numeric(df["Label"], errors='coerce')
        numeric_feats.append("label_numeric")
    except Exception:
        pass

if numeric_feats:
    corr = df[numeric_feats].corr()
    plt.figure(figsize=(8,6))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag")
    plt.title("Correlation heatmap (numeric features)")
    plt.tight_layout()
    plt.savefig("numeric_correlation_heatmap.png", dpi=150)
    plt.close()
    corr.to_csv("numeric_feature_correlation.csv")
    print("\nSaved correlation matrix.")

# ---------------------------
# PHASE 7: Save cleaned summary outputs
# ---------------------------
# Basic cleaned snapshot (first N rows) and a CSV for EDA results
snapshot = df.head(500)
snapshot.to_csv("eda_snapshot_first500.csv", index=False)
print("\nSaved eda_snapshot_first500.csv")

# Summaries: missing, duplicates, top categories
summary = {
    "shape": df.shape,
    "missing_fraction": miss.to_dict(),
    "duplicates_rowcount": int(dup_count),
}
# Save summary as JSON-like CSV
pd.Series(summary).to_csv("eda_summary_overview.csv")
print("Saved eda_summary_overview.csv")

print("\nEDA complete. Figures saved as .png and summaries saved as CSVs in current working directory.")


Loading dataset: F:/PG-DBDA-2025/Project_Upload/FAKE-NEWS-CLASSIFIER/Data_using/bharatfakenewskosh/xlsx
Could not read as CSV. Please set DATA_PATH to your CSV file path. Error: [Errno 2] No such file or directory: 'F:/PG-DBDA-2025/Project_Upload/FAKE-NEWS-CLASSIFIER/Data_using/bharatfakenewskosh/xlsx'
Created empty skeleton DataFrame with expected columns. Replace DATA_PATH and re-run.


SystemExit: Stop - set DATA_PATH to real CSV file and re-run.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
