<a href="https://colab.research.google.com/github/Vaishu7777/datascience/blob/main/phase%203.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

# Toy corpus: one (id, text, label) record per news snippet
data = pd.DataFrame(
    [
        (1, 'Global warming facts', 'real'),
        (2, 'Unknown', 'real'),
        (3, 'Vaccine conspiracy theory', 'fake'),
        (4, 'Tech stocks rise', 'unknown'),
        (5, 'Fake news about vaccines', 'fake'),
    ],
    columns=['id', 'text', 'label'],
)

# ===============================
# 🔧 Feature Engineering
# ===============================

# Trigger words; matched as lower-cased substrings of the raw text
sensational_words = ['shocking', 'unbelievable', 'breaking', 'fake', 'conspiracy']


def _uppercase_word_count(text):
    """Number of whitespace-separated tokens for which str.isupper() is true."""
    return sum(1 for token in text.split() if token.isupper())


def _mentions_sensational_word(text):
    """1 if any entry of sensational_words occurs in the lower-cased text, else 0."""
    lowered = text.lower()
    return int(any(term in lowered for term in sensational_words))


# Hand-crafted numeric features, all derived from the raw 'text' column
data = data.assign(
    text_length=data['text'].map(len),                              # characters
    word_count=data['text'].map(lambda text: len(text.split())),    # whitespace tokens
    num_uppercase_words=data['text'].map(_uppercase_word_count),
    num_exclamations=data['text'].map(lambda text: text.count('!')),
    contains_sensational=data['text'].map(_mentions_sensational_word),
)

# Clean text (for TF-IDF)
def clean_text(text):
    """Normalise raw text before TF-IDF vectorisation.

    The text is lower-cased and every character that is not an ASCII
    letter a-z or whitespace is removed (digits and punctuation are dropped,
    whitespace is left untouched).

    Args:
        text: The raw text. Non-string values (for example ``None`` or the
            float NaN pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # Missing values would otherwise fail on .lower() with AttributeError.
    if not isinstance(text, str):
        return ""
    return re.sub(r"[^a-z\s]", "", text.lower())

# Normalised copy of the text, used as TF-IDF input
data['cleaned_text'] = data['text'].apply(clean_text)

# Encode labels (ignore 'unknown' for now)
# Keep only the two target classes. The explicit .copy() makes the filtered
# frame independent of the original, so adding 'label_encoded' below does not
# trigger pandas' SettingWithCopyWarning. The original row index is kept.
data = data[data['label'].isin(['real', 'fake'])].copy()  # Drop 'unknown' for binary classification
label_encoder = LabelEncoder()
# LabelEncoder assigns codes in sorted label order
data['label_encoded'] = label_encoder.fit_transform(data['label'])  # fake=0, real=1

# ===============================
# 🔠 TF-IDF Vectorization
# ===============================
# NOTE(review): the vectorizer (and the scaler below) are fitted on every row,
# before the train/test split further down, so test rows influence the
# vocabulary, IDF weights and scaling range. For a real evaluation, fit them
# on the training rows only (e.g. inside a sklearn Pipeline).
tfidf = TfidfVectorizer(max_features=100)
X_tfidf = tfidf.fit_transform(data['cleaned_text']).toarray()  # dense (n_rows, n_terms)

# ===============================
# 🔗 Combine all features
# ===============================
numeric_features = data[['text_length', 'word_count', 'num_uppercase_words', 'num_exclamations', 'contains_sensational']]
scaler = MinMaxScaler()  # rescales each numeric column to [0, 1]
X_scaled = scaler.fit_transform(numeric_features)

# Combine TF-IDF with numeric features: numeric columns first, then TF-IDF terms.
# (numpy is already imported as np at the top of this cell; the second import was redundant.)
X_combined = np.concatenate((X_scaled, X_tfidf), axis=1)
y = data['label_encoded']

# ===============================
# ✅ Feature Selection
# ===============================
# k='all' keeps every column, so this step only computes chi2 scores;
# lower k to actually prune features.
selector = SelectKBest(chi2, k='all')
X_selected = selector.fit_transform(X_combined, y)

# ===============================
# 📊 Final Data Split
# ===============================
# stratify=y keeps both classes in the training and the test part. With so few
# rows an unstratified split can leave the training set with a single class,
# which makes the classifiers in the following cells fail to fit.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42, stratify=y
)

print("Final training feature shape:", X_train.shape)


In [None]:
# Code for Logistic Regression
# (the stray "python" / "CopyEdit" lines pasted from a chat UI were removed:
# as bare names they raise NameError when the cell runs)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict class labels for the held-out rows
y_pred_lr = lr.predict(X_test)

# Evaluate against the true test labels
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report for Logistic Regression:\n", classification_report(y_test, y_pred_lr))


In [None]:
# Training output for Logistic Regression
# (illustrative output pasted as text: it reports a 50-row test set, so it was
# not produced by the 5-row sample dataset used in this notebook)
#
# Logistic Regression Accuracy: 0.88
# Classification Report for Logistic Regression:
#                precision    recall  f1-score   support
#
#            0       0.90      0.85      0.87        20
#            1       0.87      0.92      0.89        30
#
#     accuracy                           0.88        50
#    macro avg       0.88      0.88      0.88        50
# weighted avg       0.88      0.88      0.88        50


In [None]:
# Code for Random Forest
# (the stray "python" / "CopyEdit" lines pasted from a chat UI were removed:
# as bare names they raise NameError when the cell runs)
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model (seeded so repeated runs give the same forest)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict class labels for the held-out rows
y_pred_rf = rf.predict(X_test)

# Evaluate; accuracy_score / classification_report are imported in the
# Logistic Regression cell above
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report for Random Forest:\n", classification_report(y_test, y_pred_rf))
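# Training output for Random Forest
# (illustrative output pasted as text: it reports a 50-row test set, so it was
# not produced by the 5-row sample dataset used in this notebook)
#
# Random Forest Accuracy: 0.93
# Classification Report for Random Forest:
#                precision    recall  f1-score   support
#
#            0       0.94      0.90      0.92        20
#            1       0.92      0.96      0.94        30
#
#     accuracy                           0.93        50
#    macro avg       0.93      0.93      0.93        50
# weighted avg       0.93      0.93      0.93        50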


In [None]:
# Code for XGBoost
# (the stray "python" / "CopyEdit" lines pasted from a chat UI were removed:
# as bare names they raise NameError when the cell runs)
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1)
xgb_model.fit(X_train, y_train)

# Predict class labels for the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate against the true test labels
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report for XGBoost:\n", classification_report(y_test, y_pred_xgb))
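# Training output for XGBoost
# (illustrative output pasted as text: it reports a 50-row test set, so it was
# not produced by the 5-row sample dataset used in this notebook)
#
# XGBoost Accuracy: 0.94
# Classification Report for XGBoost:
#                precision    recall  f1-score   support
#
#            0       0.95      0.90      0.92        20
#            1       0.92      0.97      0.94        30
#
#     accuracy                           0.94        50
#    macro avg       0.94      0.94      0.94        50
# weighted avg       0.94      0.94      0.94        50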


In [None]:
# Steps to implement model evaluation
# (the stray "python" / "CopyEdit" lines pasted from a chat UI were removed:
# as bare names they raise NameError when the cell runs)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
    classification_report, roc_auc_score, roc_curve
)

# === Train models ===
# Use the same hyper-parameters as the per-model cells above so the metrics
# reported here describe the same models; the fixed seed makes the forest
# reproducible across runs.
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

lr.fit(X_train, y_train)
rf.fit(X_train, y_train)

# === Predict ===
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)

# === Probability for ROC ===
# Column 1 of predict_proba is the probability of the second class in
# classes_ (label 1, i.e. 'real' under the encoding above); this assumes both
# classes were present in y_train.
y_prob_lr = lr.predict_proba(X_test)[:, 1]
y_prob_rf = rf.predict_proba(X_test)[:, 1]

# === Evaluation Function ===
def evaluate_model(name, y_true, y_pred, y_prob):
    """Compute, print and return the headline binary-classification metrics.

    Args:
        name: Display name of the model, used in the printed summary.
        y_true: True binary labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_prob: Predicted scores/probabilities for the positive class (label 1).

    Returns:
        ``[name, accuracy, precision, recall, f1, auc]``. ``auc`` is NaN when
        ``y_true`` contains fewer than two distinct classes.
    """
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 returns the same 0.0 as the default for an undefined
    # ratio (e.g. no positive predictions) but without the warning.
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    # ROC AUC is undefined with a single class in y_true (scikit-learn raises
    # or warns depending on version), which is easy to hit on a tiny test set.
    auc = roc_auc_score(y_true, y_prob) if len(set(y_true)) > 1 else float("nan")
    print(f"\n🔍 {name} Performance:")
    print(f"Accuracy: {acc:.2f}, Precision: {prec:.2f}, Recall: {rec:.2f}, F1-Score: {f1:.2f}, AUC: {auc:.2f}")
    return [name, acc, prec, rec, f1, auc]

# === Model Metrics ===
# One [name, acc, prec, rec, f1, auc] row per model, in the order listed.
results = [
    evaluate_model(model_name, y_test, predictions, probabilities)
    for model_name, predictions, probabilities in (
        ("Logistic Regression", y_pred_lr, y_prob_lr),
        ("Random Forest", y_pred_rf, y_prob_rf),
    )
]
# ===============================
# 📊 Confusion Matrix & ROC Curve
# ===============================
def plot_confusion_and_roc(y_true, y_pred, y_prob, model_name):
    """Show a confusion-matrix heatmap and, when it is defined, the ROC curve.

    Args:
        y_true: True binary labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_prob: Predicted scores/probabilities for the positive class (label 1).
        model_name: Name used in the figure titles and the ROC legend.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{model_name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    # ROC Curve
    # The curve and its AUC are undefined when y_true holds a single class
    # (likely on a very small test set); skip the plot instead of failing.
    if len(set(y_true)) < 2:
        print(f"{model_name}: ROC curve skipped (y_true contains a single class).")
        return

    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
    plt.title(f"{model_name} - ROC Curve")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.grid(True)
    plt.show()

# === Plots for both models ===
# Logistic Regression first, then Random Forest.
plot_confusion_and_roc(
    y_true=y_test,
    y_pred=y_pred_lr,
    y_prob=y_prob_lr,
    model_name="Logistic Regression",
)
plot_confusion_and_roc(
    y_true=y_test,
    y_pred=y_pred_rf,
    y_prob=y_prob_rf,
    model_name="Random Forest",
)


In [None]:
# Error analysis idea
# X_test is a NumPy array (it has no .index), while y_test is a pandas Series
# that still carries the row labels of `data`. Build a plain boolean mask and
# use it both to slice the feature rows and to recover those labels.
misclassified_mask = y_pred_rf != y_test.to_numpy()
misclassified = X_test[misclassified_mask]          # feature rows the forest got wrong
misclassified_idx = y_test.index[misclassified_mask]

# .loc, not .iloc: the values are index labels of `data`, not positions.
print("Sample Misclassified Texts:\n", data.loc[misclassified_idx, 'text'])


In [None]:
Deployment Options
✅ Option 1: Streamlit Cloud (Recommended for fast UI deployment)
🔧 Steps:
1.	Create a new Python file, e.g., app.py.
2.	Add your model and Streamlit code to app.py:
# app.py
import streamlit as st
import pickle
import re

# Load trained model and TF-IDF vectorizer
# Context managers close the file handles as soon as unpickling finishes.
# Unpickling executes arbitrary code: only load .pkl files you created yourself.
# NOTE(review): the model receives the vectorizer output alone below, so
# model.pkl must be fitted on TF-IDF features only. The notebook's models are
# fitted on 5 scaled numeric columns concatenated with TF-IDF and cannot be
# pickled for this app unchanged.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

def clean_text(text):
    """Return ``text`` lower-cased with everything except a-z and whitespace removed."""
    return re.sub(r"[^a-z\s]", "", text.lower())

st.title("📰 Fake News Detection App")
user_input = st.text_area("Enter news text:")

if st.button("Predict"):
    if not user_input.strip():
        # Nothing to classify: an empty text would still be turned into a
        # feature vector and given a (meaningless) label.
        st.warning("Please enter some news text before predicting.")
    else:
        # Same cleaning as at training time, then vectorise and classify.
        cleaned = clean_text(user_input)
        vector = vectorizer.transform([cleaned])
        prediction = model.predict(vector)[0]
        # Label encoding used for training: fake=0, real=1
        label = "Fake News ❌" if prediction == 0 else "Real News ✅"
        st.success(f"Prediction: {label}")
3.	Save your trained model as model.pkl and your fitted TF-IDF vectorizer as vectorizer.pkl.
4.	Push your files to GitHub.
5.	Go to https://share.streamlit.io and link your GitHub repo.
6.	Select app.py as the main file.
________________________________________
✅ Option 2: Gradio + Hugging Face Spaces
🔧 Steps:
1.	Install Gradio: pip install gradio
2.	Create a gradio_app.py file with the following code:
import gradio as gr
import pickle
import re

# Load the trained model and the fitted vectorizer.
# Context managers close the file handles as soon as unpickling finishes.
# Unpickling executes arbitrary code: only load .pkl files you created yourself.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

def predict_news(text):
    """Classify one news text with the loaded model.

    The text is lower-cased, stripped of everything except a-z and whitespace,
    vectorised with the loaded vectorizer and passed to the model.

    Returns:
        "Real ✅" when the predicted class is 1, otherwise "Fake ❌".
    """
    cleaned = re.sub(r"[^a-z\s]", "", text.lower())
    features = vectorizer.transform([cleaned])
    predicted_class = model.predict(features)[0]
    if predicted_class == 1:
        return "Real ✅"
    return "Fake ❌"

# Text-in / text-out Gradio interface around predict_news; launch() starts it.
demo = gr.Interface(
    fn=predict_news,
    inputs="text",
    outputs="text",
    title="Fake News Classifier",
)
demo.launch()
