In [None]:
# Installing necessary libraries
pip install nltk spacy numpy pandas scikit-learn

In [54]:
# If executing this jupyter notebook using google colab uncomment the below two lines

# from google.colab import drive
# drive.mount('/content/drive')

# Import necessary libraries
import os
import nltk
import spacy
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Downloading NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')  # Downloading the lexicon for Sentiment Analysis


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

In [55]:
# Initializing stopwords
stop_words = set(stopwords.words('english'))

# Initializing SentimentIntensityAnalyzer using nltk.sentiment
sentimentIntensityAnalyzer = SentimentIntensityAnalyzer()

In [56]:
# Function to preprocess text
def preprocess_text(text):
    """
    Function preprocesses the input string text by converting it to lowercase, tokenizing it,
    removing stop words, applying stemming, and joining tokens back into a string and returns the preprocessed text (string).
    """
    # Convert text to lowercase
    # text = text.lower()
    lowercase_text = text.lower()

    # Tokenization
    tokens = word_tokenize(lowercase_text)

    # Removing stop words
    tokens = [token for token in tokens if token not in stop_words]

    # Applying stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Joining tokens back into a string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [57]:
# Named Entity Recognition (NER) using spaCy
naturalLangaugeProcessor = spacy.load('en_core_web_sm')

def extract_named_entities(text):
    """
    Function extracts named entities from the given text, takes string input and returns list of named entities found in the text.
    """
    nlp_doc = naturalLangaugeProcessor(text)
    named_entities = [entity.text for entity in nlp_doc.ents]
    return named_entities

In [58]:
# Sentiment Analysis using NLTK
def get_sentiment(text):
    """
    Function performs sentiment analysis on the input string text using NLTK's SentimentIntensityAnalyzer 
    and returns sentiment polarity value (-1 for negative, 0 for neutral, 1 for positive).
    """
    # Analyze sentiment using SentimentIntensityAnalyzer
    sentiment_scores = sentimentIntensityAnalyzer.polarity_scores(text)

    # Extract compound score
    compound_score = sentiment_scores['compound']

    # Classify sentiment based on compound score
    if compound_score >= 0.05:
        return 1  # Positive sentiment
    elif compound_score <= -0.05:
        return -1  # Negative sentiment
    else:
        return 0  # Neutral sentiment

In [59]:
# Word Embeddings using spaCy
def get_word_embeddings(text):
    """
    Function computes word embeddings for the given text (string) using spaCy and return vector representation of the text.
    """
    doc = naturalLangaugeProcessor(text)
    return doc.vector

In [60]:
# If executing this jupyter notebook using google colab uncomment the "Drive data path" and comment "Local data path", 
# default path: local.

# Drive data path
# data_path = "/content/drive/MyDrive/datasets_coursework1/bbc"

# Local data path
data_path = "../Part 2/bbc"

# Initialize list to store preprocessed data
preprocessed_data = []

# List of news categories
news_categories = ['tech', 'business', 'sport', 'politics', 'entertainment']

In [44]:
# Loop through news categories and dataset files
for category in news_categories:
    category_path = os.path.join(data_path, category)
    files = os.listdir(category_path)
    for file in files:
        with open(os.path.join(category_path, file), 'r', encoding='latin-1') as f:
            text = f.read()
            preprocessed_text = preprocess_text(text)
            named_entities = extract_named_entities(text)  # Extracting named entities
            sentiments = get_sentiment(text)  # Performing sentiment analysis
            word_embeddings = get_word_embeddings(text)  # Computing word embeddings
            preprocessed_data.append({
                'text': preprocessed_text,
                'category': category,
                'named_entities': named_entities,
                'sentiments': sentiments,
                'word_embeddings': word_embeddings
                })

In [45]:
# Creating a DataFrame from the preprocessed data
dataframe = pd.DataFrame(preprocessed_data)

# Feature Extraction (TF-IDF) using sklearn.feature_extraction.text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataframe['text'])
y = dataframe['category']

# Initializing Random Forest classifier from sklearn.ensemble
randomForestClassifier = RandomForestClassifier(random_state=42)

# Initializing feature selector using SelectKBest, chi2 from sklearn.feature_selection
feature_selector = SelectKBest(chi2, k=1000)

# Initializing stratified k-fold cross-validation using sklearn.model_selection
skf_cross_validator = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [47]:
# Cross-Validation with Grid Search

# Defining the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Performing cross-validation using StratifiedKFold
for train_index, test_index in skf_cross_validator.split(X, y):
    # Spliting the data into training and testing sets based on the indices generated by StratifiedKFold
    X_train_test, X_test = X[train_index], X[test_index]  # Training and testing data for features (TF-IDF matrix)
    y_train_test, y_test = y[train_index], y[test_index]  # Training and testing data for target labels

    # Further spliting the training set into train and development sets
    for train_index_cv, dev_index in skf_cross_validator.split(X_train_test, y_train_test):
        X_train_cv, X_dev_cv = X_train_test[train_index_cv], X_train_test[dev_index]
        y_train_cv, y_dev_cv = y_train_test.iloc[train_index_cv], y_train_test.iloc[dev_index]

        # Initializing a new feature selector for each fold
        selector_inner = SelectKBest(chi2, k=1000)

        # Performing feature selection on the training set
        X_train_selected = selector_inner.fit_transform(X_train_cv, y_train_cv)
        X_dev_selected = selector_inner.transform(X_dev_cv)

        # Performing grid search to find the best hyperparameters
        grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3)
        grid_search.fit(X_train_selected, y_train_cv)
        best_model_inner = grid_search.best_estimator_

        # Evaluating the best model on the development set
        y_pred_dev = best_model_inner.predict(X_dev_selected)

    print("Classification Report:")
    print(classification_report(y_dev_cv, y_pred_dev))

Classification Report:
               precision    recall  f1-score   support

     business       0.94      0.94      0.94        81
entertainment       0.95      0.95      0.95        62
     politics       0.96      0.97      0.96        67
        sport       0.98      0.96      0.97        82
         tech       0.95      0.95      0.95        64

     accuracy                           0.96       356
    macro avg       0.95      0.96      0.96       356
 weighted avg       0.96      0.96      0.96       356

Classification Report:
               precision    recall  f1-score   support

     business       0.95      0.95      0.95        81
entertainment       0.98      0.92      0.95        62
     politics       0.94      0.96      0.95        67
        sport       0.94      1.00      0.97        82
         tech       0.97      0.94      0.95        64

     accuracy                           0.96       356
    macro avg       0.96      0.95      0.95       356
 weighted avg 