In [2]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Load the Excel file
file_path = '/content/Assignment.xlsx'
xlsx = pd.ExcelFile(file_path)

# Display sheet names
sheet_names = xlsx.sheet_names
sheet_names

# Load the data from the "assignment" sheet
assignment_df = pd.read_excel(file_path, sheet_name='Sheet1')

# Display the first few rows of the dataframe
assignment_df.head()

def clean_article(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

assignment_df['Cleaned_Article'] = assignment_df['Article'].apply(clean_article)
display(assignment_df[['Article', 'Cleaned_Article']].head())


# Initialize the sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

def check_mood(text):
    # Get the polarity scores
    scores = sia.polarity_scores(text)
    # Determine the mood based on the compound score
    if scores['compound'] >= 0.05:
        return 'Positive'
    elif scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the mood check function to the cleaned articles
assignment_df['Mood_Rating'] = assignment_df['Cleaned_Article'].apply(check_mood)
assignment_df[['Cleaned_Article', 'Mood_Rating']].head()

# Using a sample dataset for sentiment labels (you need to have a labeled dataset for supervised learning)
sample_data = {
    'Cleaned_Article': [
        'Researchers find new cancer treatment big step forward',
        'City council talks new park many residents attend',
        'Scientists develop new solar panel more efficient fight climate change',
        'Economy struggling many people losing jobs worrying trend',
        'New report details latest scientific findings climate change report informative well written'
    ],
    'Mood_Rating': ['Positive', 'Neutral', 'Positive', 'Negative', 'Neutral']
}
sample_df = pd.DataFrame(sample_data)

# Combine sample data with the assignment data (only for demonstration purposes)
combined_df = pd.concat([assignment_df, sample_df], ignore_index=True)

# Check for missing values
print("Missing values before handling:")
print(combined_df.isnull().sum())

# Drop rows with missing values
combined_df.dropna(subset=['Cleaned_Article', 'Mood_Rating'], inplace=True)

# Check for missing values again
print("Missing values after handling:")
print(combined_df.isnull().sum())

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_df['Cleaned_Article'])
y = combined_df['Mood_Rating']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LogisticRegression()
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Classification Report:")
print(classification_report(y_test, y_pred))

def aspect_analysis(text, aspects):
    aspect_sentiments = {}
    for aspect in aspects:
        aspect_text = ' '.join([sentence for sentence in text.split('.') if aspect in sentence])
        if aspect_text:
            aspect_vector = vectorizer.transform([aspect_text])
            aspect_sentiment = model.predict(aspect_vector)
            aspect_sentiments[aspect] = aspect_sentiment[0]
    return aspect_sentiments

aspects = ['innovation', 'cost', 'plan', 'traffic', 'impact']
combined_df['Aspect_Analysis'] = combined_df['Cleaned_Article'].apply(lambda x: aspect_analysis(x, aspects))
display(combined_df[['Cleaned_Article', 'Aspect_Analysis']].head())

output_path = 'assignment_processed_with_aspect_analysis.xlsx'
combined_df.to_excel(output_path, index=False)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Article,Cleaned_Article
0,"Retailers, the makers of foods marketed for we...",retailers makers foods marketed weight loss ty...
1,"Move over, Ozempic — there’s a new drug in tow...",move ozempic — there’s new drug town eli lilly...
2,Sept 14 (Reuters) - Bristol Myers Squibb (BMY....,sept 14 reuters bristol myers squibb bmyn said...
3,Austin Wolcott was 18 years old and pretty sur...,austin wolcott 18 years old pretty sure wouldn...
4,"Cancer, often referred to as the “emperor of a...",cancer often referred “emperor maladies” unyie...


Missing values before handling:
Article            5
Cleaned_Article    0
Mood_Rating        0
dtype: int64
Missing values after handling:
Article            5
Cleaned_Article    0
Mood_Rating        0
dtype: int64
Classification Report:
              precision    recall  f1-score   support

    Positive       1.00      1.00      1.00         6

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



Unnamed: 0,Cleaned_Article,Aspect_Analysis
0,retailers makers foods marketed weight loss ty...,"{'cost': 'Positive', 'plan': 'Positive'}"
1,move ozempic — there’s new drug town eli lilly...,{'cost': 'Negative'}
2,sept 14 reuters bristol myers squibb bmyn said...,{'plan': 'Negative'}
3,austin wolcott 18 years old pretty sure wouldn...,"{'innovation': 'Negative', 'cost': 'Negative',..."
4,cancer often referred “emperor maladies” unyie...,"{'innovation': 'Negative', 'cost': 'Negative'}"
