In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.parse
import time

def fetch_article_content(url):
    """Download ``url`` and return the text of its ``<p>`` elements.

    The text of every paragraph tag is joined with single spaces.

    This is a best-effort helper: any exception raised while requesting or
    parsing the page is reported with ``print`` and an empty string is
    returned instead of propagating the error.
    """
    try:
        page = requests.get(url, timeout=10)
        # Turn 4xx/5xx answers into exceptions so they hit the fallback below.
        page.raise_for_status()
        parsed = BeautifulSoup(page.text, 'html.parser')
        return ' '.join(node.get_text() for node in parsed.find_all('p'))
    except Exception as e:
        print(f"Failed to fetch article content from {url}: {e}")
        return ""

def fetch_articles(query, num_articles=5, max_pages=None, pause_seconds=1.0):
    """Collect news articles for ``query`` from Google News search result pages.

    Each result page is scanned for ``<a>`` tags whose ``href`` contains
    ``/url?q=`` and which hold both a title and a snippet ``<div>`` (matched
    by hard-coded CSS class names). For every such result the target article
    is downloaded with ``fetch_article_content``.

    Args:
        query: Search phrase; also stored under the ``'personality'`` key.
        num_articles: Maximum number of articles to return.
        max_pages: Optional upper bound on the number of result pages
            requested. ``None`` (default) means no explicit bound.
        pause_seconds: Delay between consecutive result-page requests.

    Returns:
        A list of at most ``num_articles`` dicts with the keys ``'title'``,
        ``'link'``, ``'snippet'``, ``'content'`` and ``'personality'``. The
        list may be shorter (or empty) when the search stops early.

    The search stops as soon as enough articles were collected, a result page
    contributes no new article, the search request returns a non-success HTTP
    status, or ``max_pages`` pages were requested. Exceptions raised by
    ``requests.get`` itself (timeouts, connection errors) still propagate.
    """
    title_class = 'BNeawe vvjwJb AP7Wnd'
    snippet_class = 'BNeawe s3v9rd AP7Wnd'

    articles = []
    seen_links = set()  # the same article can be linked several times
    page = 0
    while len(articles) < num_articles and (max_pages is None or page < max_pages):
        if page:
            # Avoid hitting Google too frequently.
            time.sleep(pause_seconds)
        url = (
            f'https://www.google.com/search?q={urllib.parse.quote_plus(query)}'
            f'&tbm=nws&start={page * 10}'
        )
        response = requests.get(url, timeout=10)
        if not response.ok:
            # An error page contains no results; retrying would only loop.
            print(f"Search request for {query} failed with HTTP status {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        found_new = False
        for item in soup.find_all('a'):
            link = item.get('href')
            if not link or '/url?q=' not in link:
                continue
            title_element = item.find('div', class_=title_class)
            snippet_element = item.find('div', class_=snippet_class)
            if not (title_element and snippet_element):
                continue

            # The real target is the (percent-encoded) value of the q parameter.
            target = link.split('/url?q=')[1].split('&')[0]
            full_link = urllib.parse.unquote(target)
            if full_link in seen_links:
                continue
            seen_links.add(full_link)
            found_new = True

            articles.append({
                'title': title_element.get_text(),
                'link': full_link,
                'snippet': snippet_element.get_text(),
                'content': fetch_article_content(full_link),
                'personality': query
            })
            if len(articles) >= num_articles:
                break

        if not found_new:
            # Nothing usable on this page (blocked, markup changed, or results
            # exhausted): further pages would not help, so stop instead of
            # looping forever.
            break
        page += 1
    return articles

def fetch_and_save_articles(personalities, num_articles=5, output_path='all_articles.csv'):
    """Fetch articles for every personality and write them to one CSV file.

    Args:
        personalities: Iterable of search phrases passed to ``fetch_articles``.
        num_articles: Maximum number of articles requested per personality.
        output_path: Destination CSV file (default ``'all_articles.csv'``).

    Returns:
        None. Progress is reported with ``print``.

    The CSV always has the columns ``title``, ``link``, ``snippet``,
    ``content`` and ``personality``. When no article at all was collected the
    header row is still written, so the file can be loaded with
    ``pd.read_csv`` and its columns accessed.
    """
    expected_columns = ['title', 'link', 'snippet', 'content', 'personality']

    all_articles = []
    for personality in personalities:
        print(f"Fetching articles for {personality}...")
        articles = fetch_articles(personality, num_articles)
        all_articles.extend(articles)
        print(f"Fetched {len(articles)} articles for {personality}")

    # Save all articles to a single CSV file. An empty list would otherwise
    # produce a frame (and file) without any columns.
    if all_articles:
        df = pd.DataFrame(all_articles)
    else:
        df = pd.DataFrame(columns=expected_columns)
    df.to_csv(output_path, index=False)
    print(f"Articles saved to {output_path}")

# Search phrases to scrape; each entry is passed to fetch_articles() as the query.
personalities = ["Dawood Ibrahim"]

# Scrape with the default article count and write the combined CSV file.
fetch_and_save_articles(personalities)


Fetching articles for Dawood Ibrahim...
Failed to fetch article content from https://www.hindustantimes.com/india-news/honoured-to-be-his-in-law-javed-miandad-praises-dawood-ibrahim-101710894987676.html: 401 Client Error: Unauthorized for url: https://www.hindustantimes.com/india-news/honoured-to-be-his-in-law-javed-miandad-praises-dawood-ibrahim-101710894987676.html
Fetched 5 articles for Dawood Ibrahim
Articles saved to all_articles.csv


In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Load the scraped articles; expects 'all_articles.csv' in the current
# working directory.
df = pd.read_csv('all_articles.csv')

# Shared VADER analyzer instance; analyze_sentiment() reads it as a global.
analyzer = SentimentIntensityAnalyzer()

# Function to analyze sentiment
def analyze_sentiment(text):
    """Label ``text`` as ``'positive'``, ``'negative'`` or ``'neutral'``.

    The label is derived from VADER's normalized ``compound`` score using the
    cut-offs recommended by the VADER authors: ``>= 0.05`` is positive,
    ``<= -0.05`` is negative, anything in between is neutral.

    Non-string values and blank strings are labelled ``'neutral'`` without
    being scored, because there is no text to judge.

    Uses the module-level ``analyzer`` (a ``SentimentIntensityAnalyzer``).
    """
    if not isinstance(text, str) or not text.strip():
        return 'neutral'  # Handle non-string and empty values

    # The 'neg' score is only the share of negative tokens; it almost never
    # exceeds 0.5 for article-length text, which would label everything as
    # positive. The compound score summarizes overall polarity instead.
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

# Convert all entries in 'content' to strings and handle missing values
# (empty CSV cells are read back as NaN).
df['content'] = df['content'].fillna('').astype(str)

# Label every article with analyze_sentiment(); these labels are the
# training targets for the classifier below.
df['sentiment'] = df['content'].apply(analyze_sentiment)

# Split the dataset into training and testing sets (80/20, fixed seed so the
# split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2, random_state=42)

# Use a basic count vectorizer and Naive Bayes classifier for training.
# The vectorizer is fitted on the training split only and then reused,
# unchanged, for the test split.
vectorizer = CountVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# Make predictions
y_pred = model.predict(X_test_vectorized)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the model and the fitted vectorizer. The file names must match the
# ones the prediction cells load ('sentiment_model.joblib' and
# 'vectorizer.joblib'); otherwise those cells fail on a fresh run or pick up
# stale artifacts.
import joblib
joblib.dump(model, 'sentiment_model.joblib')
joblib.dump(vectorizer, 'vectorizer.joblib')
print("Model and vectorizer saved.")


[[1]]
              precision    recall  f1-score   support

    positive       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

Model and vectorizer saved.


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.parse
import joblib

def fetch_article_content(url):
    """Return the paragraph text of the page at ``url``.

    The page is requested with a 10 second timeout, parsed as HTML and the
    text of all ``<p>`` tags is concatenated with single spaces.

    Failures are not raised: whatever exception occurs is printed and the
    function falls back to returning an empty string.
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()  # HTTP errors take the fallback path too
        document = BeautifulSoup(reply.text, 'html.parser')
        texts = []
        for paragraph in document.find_all('p'):
            texts.append(paragraph.get_text())
        return ' '.join(texts)
    except Exception as e:
        print(f"Failed to fetch article content from {url}: {e}")
        return ""

def fetch_articles(query, num_articles=5, max_pages=None, pause_seconds=1.0):
    """Collect news articles for ``query`` from Google News search result pages.

    A result is an ``<a>`` tag whose ``href`` contains ``/url?q=`` and which
    holds both a title and a snippet ``<div>`` (matched by hard-coded CSS
    class names). The linked article is downloaded with
    ``fetch_article_content``.

    Args:
        query: Search phrase; also stored under the ``'personality'`` key.
        num_articles: Maximum number of articles to return.
        max_pages: Optional upper bound on the number of result pages
            requested. ``None`` (default) means no explicit bound.
        pause_seconds: Delay between consecutive result-page requests.

    Returns:
        A list of at most ``num_articles`` dicts with the keys ``'title'``,
        ``'link'``, ``'snippet'``, ``'content'`` and ``'personality'``. The
        list may be shorter (or empty) when the search stops early.

    The search stops when enough articles were collected, when a result page
    contributes no new article, when the search request returns a non-success
    HTTP status, or when ``max_pages`` pages were requested. Exceptions raised
    by ``requests.get`` itself (timeouts, connection errors) still propagate.
    """
    # Local import: this cell's import block does not bring `time` into scope.
    import time

    title_class = 'BNeawe vvjwJb AP7Wnd'
    snippet_class = 'BNeawe s3v9rd AP7Wnd'

    articles = []
    seen_links = set()  # the same article can be linked several times
    page = 0
    while len(articles) < num_articles and (max_pages is None or page < max_pages):
        if page:
            # Be polite: do not request result pages back-to-back.
            time.sleep(pause_seconds)
        url = (
            f'https://www.google.com/search?q={urllib.parse.quote_plus(query)}'
            f'&tbm=nws&start={page * 10}'
        )
        response = requests.get(url, timeout=10)
        if not response.ok:
            # An error page contains no results; retrying would only loop.
            print(f"Search request for {query} failed with HTTP status {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        found_new = False
        for item in soup.find_all('a'):
            link = item.get('href')
            if not link or '/url?q=' not in link:
                continue
            title_element = item.find('div', class_=title_class)
            snippet_element = item.find('div', class_=snippet_class)
            if not (title_element and snippet_element):
                continue

            # The real target is the (percent-encoded) value of the q parameter.
            target = link.split('/url?q=')[1].split('&')[0]
            full_link = urllib.parse.unquote(target)
            if full_link in seen_links:
                continue
            seen_links.add(full_link)
            found_new = True

            articles.append({
                'title': title_element.get_text(),
                'link': full_link,
                'snippet': snippet_element.get_text(),
                'content': fetch_article_content(full_link),
                'personality': query
            })
            if len(articles) >= num_articles:
                break

        if not found_new:
            # Nothing usable on this page (blocked, markup changed, or results
            # exhausted): stop instead of requesting pages forever.
            break
        page += 1
    return articles

def fetch_and_save_new_articles(personality, num_articles=5, output_path='new_articles.csv'):
    """Fetch articles for one personality and write them to a CSV file.

    Args:
        personality: Search phrase passed to ``fetch_articles``.
        num_articles: Maximum number of articles requested.
        output_path: Destination CSV file (default ``'new_articles.csv'``).

    Returns:
        None. Progress is reported with ``print``.

    The CSV always has the columns ``title``, ``link``, ``snippet``,
    ``content`` and ``personality``; when the search returns nothing, the
    header row is still written so the file remains loadable with
    ``pd.read_csv``.
    """
    expected_columns = ['title', 'link', 'snippet', 'content', 'personality']

    print(f"Fetching articles for {personality}...")
    articles = fetch_articles(personality, num_articles)
    # An empty list would otherwise give a frame (and file) without columns.
    if articles:
        df = pd.DataFrame(articles)
    else:
        df = pd.DataFrame(columns=expected_columns)
    df.to_csv(output_path, index=False)
    print(f"New articles saved to {output_path}")

def predict_sentiment_for_new_articles():
    """Predict a sentiment label for every row of ``new_articles.csv``.

    Loads the fitted vectorizer (``vectorizer.joblib``) and classifier
    (``sentiment_model.joblib``) from the working directory, predicts one
    label per article from its ``content`` column and writes the input rows
    plus a ``predicted_sentiment`` column to
    ``new_articles_with_sentiment.csv``.

    Missing content (empty CSV cells are read back as NaN) is treated as an
    empty document instead of crashing the vectorizer. A file without rows
    produces an output file without rows.

    Returns:
        None. Completion is reported with ``print``.
    """
    # NOTE: joblib.load unpickles arbitrary Python objects; only load
    # artifacts that were produced by a trusted training run.
    vectorizer = joblib.load('vectorizer.joblib')
    model = joblib.load('sentiment_model.joblib')

    new_articles = pd.read_csv('new_articles.csv')

    # Work on a cleaned copy so the 'content' column is written back unchanged.
    contents = new_articles['content'].fillna('').astype(str).tolist()
    if contents:
        # One batched transform/predict instead of one call per row.
        predictions = model.predict(vectorizer.transform(contents))
    else:
        predictions = []  # classifiers reject an input with zero samples

    new_articles['predicted_sentiment'] = predictions
    new_articles.to_csv('new_articles_with_sentiment.csv', index=False)
    print("Sentiment predictions saved to new_articles_with_sentiment.csv")

# Fetch new articles for one personality; fetch_and_save_new_articles()
# writes them to new_articles.csv.
personality = "Pablo Escobar"  # Replace with the desired personality name
fetch_and_save_new_articles(personality)

# Predict sentiment for the new articles; reads new_articles.csv and writes
# new_articles_with_sentiment.csv.
predict_sentiment_for_new_articles()


Fetching articles for Pablo Escobar...
New articles saved to new_articles.csv
Sentiment predictions saved to new_articles_with_sentiment.csv


In [None]:
import joblib
import pandas as pd

# Load the vectorizer and the model from joblib files in the current working
# directory; predict_sentiment() below reads both as globals.
# NOTE(review): presumably these are the artifacts written by the training
# step — confirm the file names match what that step saves.
# joblib.load unpickles arbitrary objects, so only load trusted files.
vectorizer = joblib.load('vectorizer.joblib')
model = joblib.load('sentiment_model.joblib')

# Function to predict sentiment of new articles
def predict_sentiment(article_content):
    """Return the predicted sentiment label for one article text.

    Uses the module-level ``vectorizer`` and ``model``.

    Args:
        article_content: The article text. Missing values (``None``, NaN —
            which is what ``pd.read_csv`` yields for empty cells) are treated
            as an empty document; other non-string values are converted with
            ``str``.

    Returns:
        The first (and only) element of ``model.predict`` for this document.
    """
    if not isinstance(article_content, str):
        # The vectorizer only accepts strings; a float NaN would crash it.
        is_missing = pd.api.types.is_scalar(article_content) and pd.isna(article_content)
        article_content = '' if is_missing else str(article_content)
    content_vectorized = vectorizer.transform([article_content])
    prediction = model.predict(content_vectorized)
    return prediction[0]

# Example usage: label every article in new_articles.csv and show the result.
new_articles = pd.read_csv('new_articles.csv')  # Assuming you have new articles in a CSV file
# One prediction per row, computed from the 'content' column.
new_articles['predicted_sentiment'] = new_articles['content'].apply(predict_sentiment)
# Only the title and the predicted label are printed.
print(new_articles[['title', 'predicted_sentiment']])




                                               title predicted_sentiment
0             What Happened To Pablo Escobar's Body?            positive
1             What Happened To Pablo Escobar's Body?            positive
2             What Happened To Pablo Escobar's Body?            positive
3             What Happened To Pablo Escobar's Body?            positive
4  Talking Animals: Journalist Hammer Discusses R...            positive


KeyError: 'sentiment'