In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
import nltk
nltk.download('vader_lexicon')

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import pandas as pd
import textstat
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

# Load the data
df=pd.read_csv('/content/gdrive/MyDrive/DATA 606 Coding Files/news_sentiment_analysis.csv')

# Feature Engineering - VADAR and Reading Scores
df['text'] = df['Title'].fillna('') + ' ' + df['Description'].fillna('')
sid = SentimentIntensityAnalyzer()
vader_scores = df['text'].apply(lambda x: sid.polarity_scores(x))
df = pd.concat([df, vader_scores.apply(pd.Series)], axis=1)
df['flesch_kincaid_grade'] = df['text'].apply(textstat.flesch_kincaid_grade)

print(df.head())
df.describe()

# Labeling, SMOTE, and Under Sampling
le = LabelEncoder()
df['sentiment_enc'] = le.fit_transform(df['Sentiment'])
target = df[df['Sentiment']=='neutral'].shape[0]

# Prepare for sampling
X = df[['text','neg','neu','pos','compound','flesch_kincaid_grade']]
y = df['sentiment_enc'] # Use the encoded sentiment

# Oversample only the negative class up to neutral
ros = RandomOverSampler(sampling_strategy={'negative': target}, random_state=42)
# Note: ros.fit_resample expects numerical labels, so we use y which is sentiment_enc
X_ros, y_ros = ros.fit_resample(X, df['Sentiment']) # Resample using original sentiment for clarity, then re-encode

# Re-encode after resampling to ensure correct numerical labels for the models
y_ros_enc = le.transform(y_ros)

# Split data after resampling
X_train, X_test, y_train, y_test = train_test_split(
    X_ros, y_ros_enc, test_size=0.2, random_state=42, stratify=y_ros_enc # Stratify to maintain class distribution
)


# Separate text and numerical features and apply TfidfVectorizer
preprocessor = ColumnTransformer(
    transformers=[
        ('text_features', TfidfVectorizer(), 'text')
    ],
    remainder='passthrough'
)

# Define models and parameter grids
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'XGBoost'            : XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'MLP Classifier'     : MLPClassifier(max_iter=300)
}

param_grids = {
    'Logistic Regression': {
        'clf__C':      [20],
        'clf__penalty':['l2'],
        'clf__solver': ['lbfgs']
    },
    'XGBoost': {
        'clf__n_estimators': [100, 150],
        'clf__max_depth':    [15, 20, 30],
        'clf__learning_rate':[0.1]
    },
    'MLP Classifier': {
        'clf__hidden_layer_sizes': [(100,), (150,)],
        'clf__alpha':              [0.001, 0.01,0.1],
        'clf__learning_rate_init': [0.01, 0.1]
    }
}

# Grid search & evaluation
for name, estimator in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf',   estimator)
    ])
    grid = GridSearchCV(
        pipe,
        param_grids[name],
        cv=3,
        scoring='f1_macro',
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train, y_train)

    best = grid.best_estimator_
    print(f"\n{name}")
    print("Best hyperparameters:", grid.best_params_)

    # Train report
    y_train_pred = best.predict(X_train)
    print("\nTrain set report:")
    print(classification_report(y_train, y_train_pred, target_names=le.classes_))

    # Test report
    y_test_pred = best.predict(X_test)
    print("Test set report:")
    print(classification_report(y_test, y_test_pred, target_names=le.classes_))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


          Source          Author  \
0        stgnews  Bridger Palmer   
1  Zimbabwe Mail  Staff Reporter   
2      4-traders             NaN   
3      4-traders             NaN   
4         PLANET             NaN   

                                               Title  \
0  Pine View High teacher wins Best in State awar...   
1  Businesses Face Financial Strain Amid Liquidit...   
2  Musk donates to super pac working to elect Tru...   
4                          Rooftop solar's dark side   

                                         Description  \
0  ST. GEORGE — Kaitlyn Larson, a first-year teac...   
1  Harare, Zimbabwe – Local businesses are grappl...   
2  (marketscreener.com) Billionaire Elon Musk has...   
3  (marketscreener.com) A U.S. trade regulator on...   
4  4.5 million households in the U.S. have solar ...   

                                                 URL  \
0  https://www.stgeorgeutah.com/news/archive/2024...   
1  https://www.thezimbabwemail.com/business/busin... 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost
Best hyperparameters: {'clf__learning_rate': 0.1, 'clf__max_depth': 30, 'clf__n_estimators': 100}

Train set report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       631
     neutral       1.00      1.00      1.00       631
    positive       1.00      1.00      1.00      1707

    accuracy                           1.00      2969
   macro avg       1.00      1.00      1.00      2969
weighted avg       1.00      1.00      1.00      2969

Test set report:
              precision    recall  f1-score   support

    negative       0.96      0.99      0.97       158
     neutral       0.89      0.82      0.86       158
    positive       0.94      0.95      0.94       427

    accuracy                           0.93       743
   macro avg       0.93      0.92      0.92       743
weighted avg       0.93      0.93      0.93       743

Fitting 3 folds for each of 12 candidates, totalling 36 fits

MLP Classifier
Best hyperparamete