In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Let's load the dataset
df = pd.read_csv("/kaggle/input/million-headlines/abcnews-date-text.csv")

In [None]:
#Let's see a bit of the data we are dealing with
df.head()

In [None]:
#Renaming a column to better reference it in our DataFrame
df = df.rename(columns={"headline_text": "text"})
df = df.drop(columns=["publish_date"])

In [None]:
#Let's check out the types of data are in our dataframe
df.dtypes

In [None]:
#Although the DataFrame does not have labels for sentiment analysis, we can create one ourselves with huggingface
from transformers import pipeline

snmt_classifier = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

In [None]:
sentiment = {"sentiment": [result['label'] for result in results]}

In [None]:
sentiment

In [None]:
#We define a function to get sentiment of a piece of text
def get_sentiment_batch(examples):
    results = snmt_classifier(examples["text"])
    return {"sentiment": [result['label'] for result in results]}

In [None]:
#Convert the DataFrame to a Hugging Face Dataset
from datasets import Dataset

news_dataset = Dataset.from_pandas(df)

In [2]:
from datasets import load_from_disk

#labeled_dataset = news_dataset.map(get_sentiment_batch, batched=True)

labeled_dataset = load_from_disk("/kaggle/input/labeled-headlines/datasets/labeled", keep_in_memory=True)
labeled_dataset

FileNotFoundError: Directory /kaggle/input/labeled-headlines/datasets/labeled not found

In [None]:
#Before we do anything else, let's split our dataset into training and test 
split_dataset = labeled_dataset.train_test_split(test_size=.2)

In [None]:
#Converting back to a DataFrame to perform analysis about the dataset we labeled
split_dataset.set_format(type="pandas")
df = split_dataset["train"][:]
df.head()

In [None]:
#Let's analyze our train set to get a better glimpse of what we are dealing with
import matplotlib.pyplot as plt

df["sentiment"].value_counts(ascending=True).plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
#Let's create a boxplot to see the number of words in each sentiment
df["Words Per Headline"] = df["text"].str.split().apply(len)
df.boxplot("Words Per Headline", by="sentiment", grid=False, showfliers=False, color="Black")
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
#Saving our datasets for later
#labeled_dataset.save_to_disk("/kaggle/working/datasets/labeled")

In [None]:
#We are done with DataFrames, let's reset our split dataset
split_dataset.reset_format()

In [None]:
#Next, we need to tokenize our text into numerical inputs
from transformers import AutoTokenizer

model_ckpt = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
#Creating a function to tokenize a batch of text
def tokenize(batch):
    return tokenizer(batch["text"], padding=True)

In [None]:
#Map the split dataset to encode the text
headlines_encoded = split_dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
#Check out the input_ids and attention_masks we have created!
headlines_encoded

In [None]:
#Now to encode the sentiment labels
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(headlines_encoded["train"]["sentiment"])

In [None]:
#Let's map the encoder to our entire dataset
headlines_encoded= headlines_encoded.map(
        lambda examples: {"sentiment": label_encoder.transform(examples["sentiment"])},
        batched= True,
        batch_size = None
)

In [None]:
#Checking out some of our labeled sentiments
headlines_encoded["train"]["sentiment"][:3]

In [None]:
#Save/Load our model currently so we don't change it completely for the different methods to train our model
headlines_encoded = load_from_disk("/kaggle/working/datasets/encoded", keep_in_memory=True)

In [None]:
#Method 1: Trying to oversample the minority class with SMOTE to offset neutral imbalance
"""
from imblearn.over_sampling import SMOTE
from datasets import Dataset

#Separate the features and labels
X_train = headlines_encoded["train"]["input_ids"]
y_train = headlines_encoded["train"]["sentiment"]
attn_mask = headlines_encoded["train"]["attention_mask"]
#Initialize SMOTE
oversampler = SMOTE(sampling_strategy="minority")

#Resample our data
X_resampled, y_resampled, attn_resampled = oversampler.fit_resample(X_train, y_train)
"""

In [None]:
#Update our dataset
#resampled_df = pd.DataFrame({"input_ids": X_resampled, "sentiment": y_resampled })
#resampled_dataset = Dataset.from_pandas(resampled_df)
#headlines_encoded["train"] = resampled_dataset

In [None]:
#Now we can finally start training our text classifier
from transformers import AutoModel
import torch

model_ckpt = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
#Function to extract the last hidden states for the headlines

def extract_hidden_states(batch):
    inputs = {k:v.to(device) for k,v in batch.items()
                             if k in tokenizer.model_input_names}
    #Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    #Return vector for CLS token
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

In [None]:
#Convert our dataset into torch tensors to be used in our model
headlines_encoded.set_format("torch", columns=["input_ids", "attention_mask", "sentiment"])

In [12]:
#Now we can finally extract the hidden weights to be mapped in one go
headlines_hidden = headlines_encoded.map(extract_hidden_states, batched=True, batch_size=1000)

NameError: name 'headlines_encoded' is not defined

In [2]:
#Let's save/load our hidden state dataset
from datasets import load_from_disk

#headlines_hidden.save_to_disk("/kaggle/working/datasets/hidden")
headlines_hidden = load_from_disk("/kaggle/input/labeled-headlines/datasets/hidden", keep_in_memory=True)

In [3]:
#Create a feature matrix
import numpy as np

X_train = np.array(headlines_hidden["train"]["hidden_state"])
X_test  = np.array(headlines_hidden["test"]["hidden_state"])
y_train = np.array(headlines_hidden["train"]["sentiment"])
y_test  = np.array(headlines_hidden["test"]["sentiment"])

In [5]:
X_train.shape, X_test.shape

((995347, 768), (248837, 768))

In [None]:
!pip install umap-learn

In [4]:
#Let's now train the model with logistic regression
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
#Let's see how logistic regression scores on the test set
lr_clf.score(X_test, y_test)

0.9994494387892476

In [6]:
#Using dummy classifier to establish a baseline
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)

In [7]:
#Find Dummy Classifier score
dummy_clf.score(X_test, y_test)

0.7841800053850513