In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import string
import re

def train_and_evaluate_model(x_train, y_train, x_test, y_test):
    """Fit a TF-IDF + logistic-regression classifier and score it.

    The TF-IDF vectorizer uses 1- to 3-grams with scikit-learn's built-in
    English stop-word list. It is fitted on ``x_train`` only; ``x_test`` is
    transformed with the already-fitted vocabulary.

    Args:
        x_train: Training documents (iterable of strings).
        y_train: Training labels.
        x_test: Test documents (iterable of strings).
        y_test: Test labels.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, model)``. The three
        metrics are computed on the test split and multiplied by 100;
        ``model`` is the fitted ``LogisticRegression`` instance.
    """
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
    train_features = tfidf.fit_transform(x_train)
    test_features = tfidf.transform(x_test)

    # ``fit`` returns the estimator itself, so the call can be chained.
    classifier = LogisticRegression().fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    accuracy_pct, precision_pct, recall_pct = (
        metric(y_test, predictions) * 100
        for metric in (accuracy_score, precision_score, recall_score)
    )
    return accuracy_pct, precision_pct, recall_pct, classifier

# Cache for the NLTK stop-word list so the corpus file is read only once
# per process instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize a headline into a space-joined string of stemmed tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Replace URLs (anything starting with ``http``) and ``@``/``#``
         tokens with a space; remove ``<...>`` tags.
      3. Expand the contractions "don't" and "can't".
      4. Strip every character in ``string.punctuation``.
      5. Collapse runs of two or more whitespace characters to one space.
      6. Tokenize with NLTK's ``word_tokenize``, drop NLTK English stop
         words, and Porter-stem the remaining tokens.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces (may be empty).
    """
    stemmer = PorterStemmer()
    text = text.lower()
    text = re.sub(r'https*\S+', ' ', text)
    text = re.sub(r'[@#]\S+', ' ', text)
    text = re.sub(r'<.*?>', '', text)

    # Contractions must be expanded before punctuation is stripped,
    # otherwise the apostrophe forms can no longer be matched.
    contractions = {"don't": 'do not', "can't": 'cannot'}
    for contraction, expanded in contractions.items():
        text = text.replace(contraction, expanded)

    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    text = re.sub(r'\s{2,}', ' ', text)

    # Look the stop words up once; a frozenset gives O(1) membership tests
    # instead of re-reading and scanning the list for every token.
    stop_words = _english_stopwords()
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return ' '.join(stems)

def main(data_path=r"E:\data.csv"):
    """Train a headline classifier from a CSV file and print its metrics.

    Downloads the NLTK ``stopwords`` and ``punkt`` resources, reads the CSV,
    drops the ``URLs`` and ``Body`` columns, label-encodes ``Label``,
    preprocesses ``Headline``, performs a 70/30 train/test split
    (``random_state=42``), and prints accuracy, precision, and recall as
    returned by ``train_and_evaluate_model``.

    Args:
        data_path: Path of the CSV file to load. It must contain the
            columns ``URLs``, ``Body``, ``Headline`` and ``Label``.
            Defaults to the location originally hard-coded in this script.
    """
    nltk.download('stopwords')
    nltk.download('punkt')

    # The default path is a raw string: in a normal literal the backslash
    # before "d" is an invalid escape sequence.
    dataset = pd.read_csv(data_path)
    dataset.drop(columns=["URLs", "Body"], inplace=True)

    # Convert labels to numeric using LabelEncoder
    le = LabelEncoder()
    dataset["Label"] = le.fit_transform(dataset["Label"])

    # Apply preprocessing to the Headline column
    dataset["Headline"] = dataset["Headline"].apply(preprocess_text)

    x = dataset["Headline"]
    y = dataset["Label"]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=42
    )

    # Debugging: Print unique values in the label column
    print("Unique Values in the Label Column:")
    print(dataset["Label"].unique())

    # The fitted model is returned as the fourth element but is not needed here.
    score1, precision1, recall1, _ = train_and_evaluate_model(
        x_train, y_train, x_test, y_test
    )

    print(f"Accuracy: {score1}")
    print(f"Precision: {precision1}")
    print(f"Recall: {recall1}")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unique Values in the Label Column:
[1 0]
Accuracy: 84.78802992518703
Precision: 82.43992606284658
Recall: 83.52059925093633
