In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import numpy as np



In [5]:
# Step 1: Load the dataset (the path is relative to the notebook's working directory)
df = pd.read_csv("complete_data.csv")

# Step 2: Combine text from all news columns for model training.
# Missing entries are dropped per row and the remaining values are cast to
# str before being joined with single spaces.
df['combined_news'] = df[['News1', 'News2', 'News3', 'News4', 'News5']].apply(
    lambda x: ' '.join(x.dropna().astype(str)), axis=1
)

# Step 3: Create a dummy sentiment column for testing (replace this with real sentiment data if available).
# The labels are random placeholders (0 = negative, 1 = positive), so the model
# trained below cannot learn a real signal and should score near chance level.
# A seeded generator is used so that the labels -- and therefore the fitted
# model, the metrics and all downstream predictions -- are identical on every
# Restart & Run All.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
df['sentiment'] = rng.choice([0, 1], size=len(df))

# Step 4: Convert text data to feature vectors (bag-of-words counts, vocabulary capped at 2000 terms).
# Note: the vocabulary is fit on every row, including the rows that end up in the test split.
vectorizer = CountVectorizer(max_features=2000)
X = vectorizer.fit_transform(df["combined_news"])

# Step 5: Use the 'sentiment' column as the target variable
y = df["sentiment"]

# Step 6: Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Step 7: Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 8: Evaluate the model on the held-out split
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

# Step 9: Prediction Function
def predict_sentiment(text):
    """Predict the sentiment label for a single piece of news text.

    Relies on the module-level ``vectorizer`` and ``model`` fitted earlier
    in the notebook.

    Args:
        text: The news text to classify, or a missing value (NaN/None).

    Returns:
        The first element of ``model.predict`` for the vectorized text,
        or -1 as a sentinel when ``text`` is missing.
    """
    if not pd.isna(text):
        # Wrap the text in a list: the vectorizer expects an iterable of documents.
        features = vectorizer.transform([text])
        return model.predict(features)[0]
    # Missing text cannot be classified; signal it with the sentinel value.
    return -1

# Step 10: Create a new DataFrame with sentiment labels for each news column
# Step 10: Label each news column separately and gather the results in a new
# DataFrame, with one "Sentiment_NewsN" column per "NewsN" column.
sentiment_df = pd.DataFrame(
    {
        f"Sentiment_News{idx}": df[f"News{idx}"].apply(predict_sentiment)
        for idx in range(1, 6)  # the five columns News1 .. News5
    }
)

# Show the per-column sentiment labels
print(sentiment_df)

# Optional: append the labels to the original columns and write everything to disk
final_df = pd.concat([df, sentiment_df], axis=1)
final_df.to_csv('final_df.csv', index=False)

Accuracy: 0.5157142857142857
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.61      0.54       334
           1       0.55      0.43      0.48       366

    accuracy                           0.52       700
   macro avg       0.52      0.52      0.51       700
weighted avg       0.52      0.52      0.51       700

      Sentiment_News1  Sentiment_News2  Sentiment_News3  Sentiment_News4  \
0                   1                0                1                1   
1                   1                0                0                0   
2                   1                1                1                1   
3                   1                0                1                0   
4                   1                0                1                0   
...               ...              ...              ...              ...   
3495                1               -1               -1               -1   
3496         