In [1]:
# Run `nvidia-smi` in a shell (IPython `!` escape) to show the GPU status of this runtime.
!nvidia-smi

In [2]:
# Download the John Snow Labs Colab setup script and pipe it straight into bash.
# NOTE(review): the script is fetched over plain HTTP and executed without any
# verification — consider HTTPS and reviewing/pinning the script; confirm what it installs.
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

from tqdm import tqdm

import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
from cuml.naive_bayes import MultinomialNB
from cuml.metrics.accuracy import accuracy_score

from sklearn.model_selection import train_test_split
#from sklearn.metrics import f1_score, accuracy_score

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

import os
from numba import cuda

In [5]:
def get_session():
    """Return the Spark session for this notebook, creating it when needed.

    The session is built with the application name "portfolio_project";
    ``getOrCreate`` returns an existing session if one is already active.
    """
    builder = SparkSession.builder.appName("portfolio_project")
    return builder.getOrCreate()

In [6]:
# Spark session used by the data-loading cell.
spark = get_session()

In [7]:
# Source CSV, hosted on GitHub.
data_url = 'https://raw.githubusercontent.com/Fryingpannn/WallStreetBets_BigDataAnalysis/main/Data/100days.csv'

# Register the remote file with Spark, then resolve the local copy by file name.
spark.sparkContext.addFile(data_url)
local_csv = SparkFiles.get('100days.csv')

# Read with a header row and bring the whole table into pandas.
data = spark.read.csv(local_csv, header=True).toPandas()
data.shape

In [9]:
def str_to_na(string):
    """Map missing-value markers to NaN and pass every other value through.

    Parameters
    ----------
    string : str or None
        Raw cell value.

    Returns
    -------
    object
        ``np.nan`` when ``string`` is ``None`` or the literal ``'N/A'``;
        otherwise ``string`` unchanged.
    """
    # The earlier condition `string == 'N/A' or None` never matched None:
    # `or None` only appends an always-falsy operand to the comparison.
    if string is None or string == 'N/A':
        # `np.nan` is the spelling that exists in every NumPy version
        # (the `np.NaN` alias was removed in NumPy 2.0).
        return np.nan
    return string

# Run every 'growth' value through str_to_na so missing markers become NaN.
data['growth'] = data['growth'].apply(str_to_na)

In [10]:
# Discard rows without a growth value and renumber the index from 0.
data = data.dropna(subset=['growth']).reset_index(drop=True)

# Binary target: 1 when the growth string, minus its last character, parses
# to a number >= 6; otherwise 0.
data['label'] = data['growth'].apply(
    lambda growth: int(float(growth[:-1]) >= 6)
)

# Keep only the columns used downstream.
data = data.loc[:, ['id', 'label', 'text']]
data.head()

In [11]:
# Fetch the NLTK corpora used below so the cell also works on a fresh runtime.
# `nltk.download` skips resources that are already up to date.
# NOTE(review): 'omw-1.4' is only needed by some NLTK releases for WordNet — confirm for the pinned version.
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

# Built once: the original rebuilt the stopword set for every single word and
# a new lemmatizer for every row, which dominated the runtime of this cell.
STOP_WORDS = frozenset(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalise one post for the bag-of-words baseline.

    Non-letters are replaced by spaces, the text is lower-cased and split on
    whitespace, English stopwords and one-character tokens are dropped, and
    the remaining tokens are lemmatized and re-joined with single spaces.

    Parameters
    ----------
    text : str or None
        Raw post text.

    Returns
    -------
    str or float
        The cleaned text (possibly an empty string), or ``np.nan`` when
        ``text`` is ``None``.
    """
    if text is None:
        return np.nan
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    kept = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in STOP_WORDS and len(word) > 1
    ]
    return ' '.join(kept)


# One column assignment instead of a row-by-row `.loc` write.
data['out_text'] = [clean_text(text) for text in tqdm(data['text'])]

# Rows whose text was missing have NaN here and are removed.
data = data.dropna(subset=['out_text'])

In [14]:
# Renumber the pandas index from 0, then copy the frame into a cuDF DataFrame.
data = data.reset_index(drop=True)
df = cudf.from_pandas(data)

In [None]:
#len(max(data['out_text'], key=lambda x: len(x)))

In [None]:
# Baseline: TF-IDF features + multinomial naive Bayes, using the cuML implementations.
# NOTE: the name `model` is rebound to the XLM-RoBERTa classifier later in the notebook.
model = TfidfVectorizer(max_features=None)
# `.toarray()` turns the TF-IDF result into a dense array before splitting.
text_embeddings = model.fit_transform(df.out_text).toarray()

labels = df.loc[:, 'label'].values

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    text_embeddings, labels, test_size=0.2, random_state=0
)

classifier_1 = MultinomialNB()
classifier_1.fit(X_train, y_train)

y_pred_1 = classifier_1.predict(X_test)

# The label previously read 'MultinomialNM', which names no classifier used here.
print('MultinomialNB accuracy score {}'.format(accuracy_score(y_test, y_pred_1)))

In [None]:
# XLM-RoBERTa part

In [15]:
# Name of the pretrained checkpoint used for both tokenizer and classifier.
model_type = 'xlm-roberta-base'

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [16]:
# Tokenizer loaded for the `model_type` checkpoint; GetDataset reads this module-level name.
tokenizer = XLMRobertaTokenizer.from_pretrained(model_type)

In [17]:
class GetDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Parameters
    ----------
    df : DataFrame
        Frame with a 'text' column and, when ``train_data`` is true, a
        'label' column. Rows are read with ``df.loc[index, ...]``, so the
        index labels must match the integer positions requested by the
        loader.
    train_data : bool, default True
        When true, items are ``(input_ids, attention_mask, label)``;
        otherwise ``(input_ids, attention_mask)``.
    tokenizer : callable, optional
        Tokenizer to use. When omitted, the module-level ``tokenizer`` is
        looked up at item-access time, as before.
    max_length : int, default 256
        Every item is padded and truncated to exactly this many tokens.
    """

    def __init__(self, df, train_data=True, tokenizer=None, max_length=256):
        self.df = df
        self.train_data = train_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at ``index`` and return its tensors."""
        text = self.df.loc[index, 'text']

        # Fall back to the notebook-level tokenizer when none was injected.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        # `truncation=True` is stated explicitly so every item has exactly
        # `max_length` tokens and batches can always be stacked.
        encoded_dict = tok(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # `return_tensors='pt'` adds a leading batch dimension of 1; drop it.
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        if self.train_data:
            label = torch.tensor(self.df.loc[index, 'label'])
            return padded_token_list, att_mask, label

        return padded_token_list, att_mask

In [18]:
# CPU count as reported by os.cpu_count(), displayed as the cell output.
# NOTE(review): the DataLoader calls visible in this notebook do not pass `num_workers`.
num_workers = os.cpu_count()
num_workers

In [43]:
# Peek at the row sitting at the 80% mark of the frame.
# `df[<int>]` is a column lookup on a DataFrame, so positional row access has
# to go through `.iloc`; a one-row slice keeps the result a DataFrame.
boundary = int(df.shape[0] * 0.8)
df.iloc[boundary:boundary + 1]

In [48]:
# Sequential split: the first 80% of rows are used for training, the rest are held out.
test_split = int(df.shape[0] * 0.8)
train_df = df[:test_split]
# The held-out slice is re-indexed from 0 because GetDataset looks rows up by label.
test_df = df[test_split:].reset_index(drop=True)

# The held-out dataset yields no labels (train_data=False).
train_data = GetDataset(train_df)
test_data = GetDataset(test_df, train_data=False)

# Both loaders use batches of 8; only the training loader shuffles.
loader_kwargs = {'batch_size': 8}
train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_data, shuffle=False, **loader_kwargs)

In [None]:
#a, b, c = next(iter(train_loader))
#print(a.shape)

In [None]:
# Sequence classifier with a two-class head, loaded for the `model_type` checkpoint.
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_type,
    num_labels=2,
)
# Move the parameters to the selected device; the returned module is the cell output.
model.to(device)

In [None]:
#cuda.select_device(0)
#cuda.close()
#cuda.select_device(0)

In [21]:
# Smoke test: push a single training batch through the model.
batch = next(iter(train_loader))

# A training batch is (token ids, attention mask, labels); move each to the device.
ind_batch, mask_batch, labels_batch = (tensor.to(device) for tensor in batch[:3])

output = model(
    ind_batch,
    attention_mask=mask_batch,
    labels=labels_batch,
)

output

In [22]:
# Ground-truth labels of the smoke-test batch, as a NumPy array on the host.
y_true = labels_batch.detach().cpu().numpy()

# Second output element — presumably the logits, since index 0 is used as the loss elsewhere.
preds = output[1].detach().cpu().numpy()

# Predicted class = position of the largest score in each row.
y_pred = preds.argmax(axis=1)

y_pred

In [23]:
# Accuracy of the predictions for the single smoke-test batch, shown as the cell output.
val_acc = accuracy_score(y_true, y_pred)
val_acc

In [24]:
# Number of passes over the training loader.
num_epochs = 10

# Container for loss history.
loss_values = []

# AdamW over all model parameters.
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
)

In [28]:
# Fine-tuning loop: one pass over `train_loader` per epoch, with a checkpoint after each epoch.
for ep in range(num_epochs):
    print(f'Epoch ep #{ep+1}/{num_epochs}')

    # (Re)created every epoch; not used inside this loop.
    target_list = []

    model.train()
    torch.set_grad_enabled(True)
    total_loss = 0

    for batch in train_loader:
        ind_batch = batch[0].to(device)
        mask_batch = batch[1].to(device)
        labels_batch = batch[2].to(device)

        # One reset per step is enough: the optimizer was built from
        # model.parameters(), so the former extra model.zero_grad() call
        # cleared exactly the same gradients.
        optimizer.zero_grad()

        outputs = model(ind_batch, attention_mask=mask_batch, labels=labels_batch)

        # With `labels` supplied, the first output element is used as the loss.
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    # Record the epoch total; `loss_values` was previously never filled.
    loss_values.append(total_loss)

    # NOTE(review): this pickles the whole module object each epoch; saving
    # model.state_dict() is the more portable option, but it would change the
    # checkpoint format that loaders of these files expect.
    torch.save(model, f'model_ep_{ep}.pt')

    print("Total loss = {}".format(total_loss))

In [49]:
# Inference over the held-out loader; collects the raw scores for every row.
# Evaluation mode makes the forward pass deterministic (no dropout).
model.eval()

batch_scores = []
# no_grad avoids building autograd graphs during inference.
with torch.no_grad():
    for batch in test_loader:
        ind_batch = batch[0].to(device)
        mask_batch = batch[1].to(device)
        outputs = model(ind_batch, attention_mask=mask_batch)

        # Without `labels`, the first output element holds the class scores.
        batch_scores.append(outputs[0].detach().cpu().numpy())

# The held-out batches carry no labels, so no targets are gathered here. The
# earlier code extended `target_list` with `labels_batch`, which was merely
# the last *training* batch left over in the kernel.
# Stack once at the end instead of re-copying the array for every batch.
stacked_preds = np.vstack(batch_scores)
           

In [50]:
# Predicted class per held-out row = column index of the largest score.
preds = stacked_preds.argmax(axis=1)
preds.shape

In [52]:
# Ground-truth labels of the held-out frame (overwrites the smoke-test `y_true`).
y_true = test_df.label

In [53]:
# Held-out accuracy. Score `preds` (the argmax over `stacked_preds`), not
# `y_pred`, which still holds the predictions for the single smoke-test batch.
val_acc = accuracy_score(y_true, preds)

In [54]:
# Display the accuracy computed in the previous cell.
val_acc