In [None]:
import pandas as pd
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the dataset
data = pd.read_csv('Data.csv', encoding='latin1')

In [None]:
# Combine the headline columns into a single text column
data['Combined_Headlines'] = data.apply(lambda row: ' '.join(row[['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10',
                                                                  'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19',
                                                                  'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25']].astype(str)), axis=1)

In [None]:
# Clean text function
def clean_text(text):
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    return text

# Apply text cleaning
data['Cleaned_Headlines'] = data['Combined_Headlines'].apply(clean_text)

In [None]:
# Function to get TextBlob sentiment scores
def get_textblob_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Get TextBlob sentiment scores
data['textblob_sentiment'] = data['Cleaned_Headlines'].apply(get_textblob_sentiment)

In [None]:
# Prepare data for ML model
tfidf = TfidfVectorizer(max_features=1000)  # Limit to top 1000 features
X = tfidf.fit_transform(data['Cleaned_Headlines'])
y = data['Label']

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report_output)


Accuracy: 0.4993909866017052
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.40      0.42       372
           1       0.54      0.58      0.56       449

    accuracy                           0.50       821
   macro avg       0.49      0.49      0.49       821
weighted avg       0.50      0.50      0.50       821

