# Data Preparation 

In [29]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup

In [49]:
# Read the Online News Popularity CSV into the working DataFrame `df`
file_path = 'E:/python/Dataset/OnlineNewsPopularity.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [50]:
# Display basic info about the dataset.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own line rather than passed to print() (which would emit the
# report first and then a stray "Dataset Info:\n None").
print("Dataset Info:")
df.info()
# sep="" keeps the first row of each table flush-left instead of indented by
# the single space print() inserts between its arguments.
print("\nSummary Statistics:\n", df.describe(), sep="")
print("\nMissing Values:\n", df.isnull().sum(), sep="")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   url                            39644 non-null  object 
 1   timedelta                      39644 non-null  int64  
 2   n_tokens_title                 39644 non-null  int64  
 3   tokens_content                 39644 non-null  int64  
 4   n_unique_tokens                39644 non-null  float64
 5   n_non_stop_words               39644 non-null  float64
 6   n_non_stop_unique_tokens       39644 non-null  float64
 7   num_hrefs                      39644 non-null  int64  
 8   num_self_hrefs                 39644 non-null  int64  
 9   images_content                 39644 non-null  int64  
 10  videos_content                 39644 non-null  int64  
 11  average_token_length           39644 non-null  float64
 12  keywords_content               39644 non-null 

In [45]:
# Feature Extraction from URL
def extract_url_features(url):
    """Derive simple structural features from a single URL string.

    Parameters
    ----------
    url : str
        Absolute URL, e.g. ``"http://example.com/2013/01/07/some-article/"``.

    Returns
    -------
    pandas.Series
        ``domain`` (network location reported by ``urlparse``), ``url_length``
        (character count of the full URL) and ``num_subdirectories`` (number
        of non-empty segments in the URL path).
    """
    parsed_url = urlparse(url)
    # Count only the segments of the path. Counting every '/' in the raw URL
    # would also pick up the two slashes of the "scheme://" separator, a
    # trailing slash, and any slashes inside the query string or fragment.
    path_segments = [segment for segment in parsed_url.path.split('/') if segment]
    return pd.Series({
        'domain': parsed_url.netloc,
        'url_length': len(url),
        'num_subdirectories': len(path_segments)
    })

# Expand the per-URL feature Series into three new columns on df
url_feature_columns = ['domain', 'url_length', 'num_subdirectories']
df[url_feature_columns] = df['url'].apply(extract_url_features)


In [46]:
# Web Scraping: Extract Title or Meta Description from URL (Optional & Time-Consuming)
def fetch_page_title(url):
    """Fetch a page and return the text of its ``<title>`` (best effort).

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    str
        The stripped title text; ``"No Title"`` when the page has no
        ``<title>`` element or the element is empty; ``"Error"`` when the
        request fails, times out, returns an HTTP error status, or the page
        cannot be parsed.
    """
    try:
        response = requests.get(url, timeout=5)
        # Treat 4xx/5xx responses as failures instead of recording the title
        # of an error page as if it belonged to the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.title
        if title_tag is None:
            return "No Title"
        # get_text() yields a plain str. Tag.string would be None for an empty
        # or nested <title>, and otherwise a NavigableString that keeps the
        # whole parse tree referenced for as long as the value is stored.
        title = title_tag.get_text().strip()
        return title or "No Title"
    except Exception:
        # The best-effort "Error" sentinel is kept, but unlike a bare
        # `except:` this does not swallow KeyboardInterrupt/SystemExit, so a
        # long scraping run can still be interrupted.
        return "Error"

In [47]:
# Uncomment the next line to scrape titles (Warning: This is slow!)
# df['page_title'] = df['url'].apply(fetch_page_title)

In [51]:
# Feature Engineering: Creating additional features
# include_lowest=True closes the first bin on the left so that rows with
# tokens_content == 0 are labelled 'Short'; with the default right-closed bins
# the first interval is (0, 300], which leaves 0 outside every bin (NaN).
df['content_length_category'] = pd.cut(
    df['tokens_content'],
    bins=[0, 300, 600, np.inf],
    labels=['Short', 'Medium', 'Long'],
    include_lowest=True,
)
df['num_media'] = df['images_content'] + df['videos_content']  # Total media count
df['keyword_density'] = df['keywords_content'] / (df['tokens_content'] + 1)  # +1 avoids division by zero

# Feature Engineering 

In [53]:
# # Feature Engineering: Creating additional features
# columns_to_check = ['n_tokens_content', 'num_imgs', 'num_videos', 'num_keywords']
# missing_columns = [col for col in columns_to_check if col not in df.columns]

# columns_to_check
# missing_columns

In [54]:
# if not missing_columns:
#     df['content_length_category'] = pd.cut(df['n_tokens_content'], bins=[0, 300, 600, np.inf], labels=['Short', 'Medium', 'Long'])
#     df['num_media'] = df['num_imgs'] + df['num_videos']  # Total media count
#     df['keyword_density'] = df['num_keywords'] / (df['n_tokens_content'] + 1)  # Avoid division by zero
# else:
#     print(f"Warning: Missing columns in dataset - {missing_columns}")

In [55]:
# # Text-based Feature Extraction: Sentiment Analysis
# def analyze_sentiment(text):
#     blob = TextBlob(str(text))
#     return pd.Series({
#         'text_sentiment_polarity': blob.sentiment.polarity,
#         'text_subjectivity': blob.sentiment.subjectivity
#     })

In [56]:
# # Apply sentiment analysis if 'title' column exists
# if 'title' in df.columns:
#     df[['text_sentiment_polarity', 'text_subjectivity']] = df['title'].apply(analyze_sentiment)
# else:
#     print("Warning: 'title' column not found, skipping sentiment analysis.")

