In [None]:
import numpy as np
import pandas as pd  
import torch 
from sklearn.datasets import fetch_20newsgroups 
import transformers
from transformers import BertTokenizer, BertModel
#import spacy
import ast 


In [None]:
pip install transformers


In [None]:
# Load the 20 Newsgroups corpus with message headers and footers stripped from each post.
# The first call does not pass `subset`, so it uses fetch_20newsgroups' default split;
# the second call explicitly requests the "test" split.
twenty_news_train= fetch_20newsgroups(remove=("headers","footers"))
twenty_news_test= fetch_20newsgroups(subset="test",remove=("headers","footers"))

In [None]:
# Step 0: Data exploration
# Print the description text bundled with the dataset object (its DESCR attribute).
print(twenty_news_train.DESCR)


In [None]:
# Display the class names that the integer values in `.target` refer to.
list(twenty_news_train.target_names)

Looking at the classes, the articles roughly fall into a few main categories, each with numerous subtopics (the subtopics make up a large part of each main category, but do not cover it completely):
1. Science 
2. Religion 
3. Politics 
4. Recreation/Sports
5. Technology/Computers

In an ideal world, the model would be able to separate all of these labels.

In [None]:
# Step 1: Setting up DataFrames and tokenizing
# Pull the raw article texts (the `.data` attribute) out of each split.
base_data_train=twenty_news_train.data
base_data_test=twenty_news_test.data

In [None]:
#Tokenizing the data


#Converting the Various Corpus to a list of sentences
def article_to_sents(article):
    """Split one article into a list of sentence strings using spaCy.

    The ``en_core_web_sm`` pipeline is loaded on the first call only and then
    reused, because loading it is expensive and this function is called once
    per article.

    Args:
        article: Raw article text.

    Returns:
        list[str]: The text of every sentence spaCy detects, in document order.
    """
    nlp = getattr(article_to_sents, "_nlp", None)
    if nlp is None:
        # Imported here because the notebook's top-level `import spacy` is
        # commented out; without this the function fails with a NameError.
        import spacy
        nlp = spacy.load('en_core_web_sm')
        article_to_sents._nlp = nlp

    return [sentence.text for sentence in nlp(article).sents]



#Takes a list of sentences and creates a list of the tokenized sentences 
#https://towardsdatascience.com/how-to-use-bert-from-the-hugging-face-transformer-library-d373a22b0209


def sents_to_tokenized_list(marked_sents, max_length=52):
    """Tokenize and encode each sentence for BERT.

    ``encode_plus`` tokenizes and maps tokens to vocabulary ids in a single
    step; ``add_special_tokens`` adds the special tokens the BERT model was
    trained with. Every sentence is truncated/padded to ``max_length``.

    The ``bert-base-cased`` tokenizer is loaded on the first call only and then
    reused, since this function is called once per article.

    Args:
        marked_sents: Iterable of sentence strings.
        max_length: Fixed encoded length per sentence (default 52, the value
            the notebook originally hard-coded).

    Returns:
        tuple: ``(tokenized_list, maskingattention)`` - two parallel lists holding,
        per sentence, the ``input_ids`` and the ``attention_mask`` returned by the
        tokenizer (requested as PyTorch tensors via ``return_tensors="pt"``).
    """
    tokenizer = getattr(sents_to_tokenized_list, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        sents_to_tokenized_list._tokenizer = tokenizer

    tokenized_list = []
    maskingattention = []

    for sents in marked_sents:
        tokens = tokenizer.encode_plus(
            sents,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt",
        )
        tokenized_list.append(tokens["input_ids"])
        maskingattention.append(tokens['attention_mask'])

    return tokenized_list, maskingattention



def create_dataframe(base_data):
    """Build the per-article frame of sentences, token ids and attention masks.

    Args:
        base_data: Sequence of raw article strings.

    Returns:
        pandas.DataFrame with one row per article and the columns
        "Articles" (raw text), "Sentences" (output of article_to_sents),
        "Tokens" and "Attention_Masking" (the two lists returned by
        sents_to_tokenized_list for that article's sentences).
    """
    df = pd.DataFrame(base_data, columns=["Articles"])

    sentence_column, token_column, mask_column = [], [], []
    for article_text in df["Articles"]:
        article_sentences = article_to_sents(article_text)
        article_tokens, article_masks = sents_to_tokenized_list(article_sentences)

        sentence_column.append(article_sentences)
        token_column.append(article_tokens)
        mask_column.append(article_masks)

    df["Sentences"] = sentence_column
    df["Tokens"] = token_column
    df["Attention_Masking"] = mask_column

    return df
    




In [None]:
# Build the sentence / token / attention-mask frame for every training article.
# Each article is passed through article_to_sents and sents_to_tokenized_list,
# so this cell is slow on the full training split.
df=create_dataframe(base_data_train)

In [None]:
# Because parts of this notebook take a very long time to run, and DataFrames do not round-trip tensors through CSV (they are saved as strings), tocsv_converter and csv_converter help save and reload at least the raw tokenized data before the embeddings are computed.
def tocsv_converter(df):
    """Convert the tensor columns to plain nested lists so they survive a CSV round trip.

    Args:
        df: Frame with "Tokens" and "Attention_Masking" columns, where each
            cell is a list of tensors (one tensor per sentence).

    Returns:
        A new DataFrame containing only "Tokens" and "Attention_Masking", with
        every tensor replaced by ``tensor.tolist()``. The input frame is left
        untouched and the result keeps the input's index.
    """
    # .copy() so the assignments below never write into a view of the caller's frame.
    converted = df[["Tokens", "Attention_Masking"]].copy()

    # Iterate over the values themselves rather than df[col][a]: label lookups
    # raise KeyError as soon as the index is not exactly 0..n-1 (e.g. a slice).
    converted["Tokens"] = [
        [sentence.tolist() for sentence in article]
        for article in df["Tokens"]
    ]
    converted["Attention_Masking"] = [
        [sentence.tolist() for sentence in article]
        for article in df["Attention_Masking"]
    ]

    return converted


In [None]:
def csv_converter(df):
    """Rebuild the tensor columns from their string form after ``pd.read_csv``.

    Inverse of ``tocsv_converter``: each cell is expected to be the string
    representation of a nested list (one entry per sentence), which is parsed
    with ``ast.literal_eval`` and turned back into a list of tensors.

    Args:
        df: Frame with string-valued "Tokens" and "Attention_Masking" columns.

    Returns:
        A new DataFrame containing only "Tokens" and "Attention_Masking", with
        every cell replaced by a list of ``torch.Tensor``. The input frame is
        left untouched and the result keeps the input's index.
    """
    # .copy() so the assignments below never write into a view of the caller's frame.
    restored = df[["Tokens", "Attention_Masking"]].copy()

    # Iterate over the values rather than df[col][a], which fails on any index
    # that is not exactly 0..n-1.
    restored["Tokens"] = [
        [torch.tensor(sentence) for sentence in ast.literal_eval(cell)]
        for cell in df["Tokens"]
    ]
    restored["Attention_Masking"] = [
        [torch.tensor(sentence) for sentence in ast.literal_eval(cell)]
        for cell in df["Attention_Masking"]
    ]

    return restored
    


In [None]:
# Grab the results of the pre-processing (before the embeddings) from the CSV cache.
try:
    df1=pd.read_csv("df.csv")
except FileNotFoundError:
    # First run on a fresh checkout: no cell had written "df.csv" yet, so create the
    # cache from the frame built by create_dataframe above, then load it back.
    tocsv_converter(df).to_csv("df.csv", index=False)
    df1=pd.read_csv("df.csv")
df1=csv_converter(df1)

In [None]:
#Step 2: Creating Embeddings 
#https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/ 


def create_embeddings_1_sentence(df, model=None):
    """Run every tokenized sentence through BERT, one sentence per forward pass.

    Args:
        df: Frame with "Tokens" and "Attention_Masking" columns; each cell is a
            list with one tensor per sentence of the article.
        model: Optional model to call as ``model(token, attention_mask=mask)``.
            When omitted, ``bert-base-cased`` is loaded with
            ``output_hidden_states=True`` (the original behaviour).

    Returns:
        A copy of ``df`` with an added "output_1_sentence" column. Each cell is
        a list holding the raw model output for every sentence of that article
        (an empty list for an article without sentences). The input frame is
        not modified.
    """
    if model is None:
        model = BertModel.from_pretrained('bert-base-cased', output_hidden_states=True)
    model.eval()

    final_outputs = []
    # Inference only: no gradients are needed for any of the forward passes.
    with torch.no_grad():
        # Walk the column values directly; df[col][a] label lookups break on any
        # index that is not exactly 0..n-1.
        for tokens_list, masking_list in zip(df["Tokens"], df["Attention_Masking"]):
            final_outputs.append([
                model(token, attention_mask=masking)
                for token, masking in zip(tokens_list, masking_list)
            ])

    # Work on a copy so a slice passed by the caller (e.g. df1[0:200]) is never written to.
    result = df.copy()
    result["output_1_sentence"] = final_outputs
    return result





In [None]:
# Embed only the first 200 rows of the reloaded frame.
# Note: this rebinds `df`, which earlier held the result of create_dataframe.
df=create_embeddings_1_sentence(df1[0:200])


In [None]:
# Write the frame, including the "output_1_sentence" column, to outputs.csv.
# NOTE(review): that column holds the objects returned by the model, which CSV stores
# as their string form; outputs_to_csv (defined below) looks intended for this step -
# confirm the file can actually be read back before relying on it.
df.to_csv("outputs.csv")

In [None]:
def outputs_to_csv(df):
    """Convert the "output_1_sentence" column to nested lists for CSV export.

    Args:
        df: Frame with an "output_1_sentence" column; each cell is a list with
            one entry per sentence. Every entry must provide ``.tolist()``
            (e.g. a tensor).
            NOTE(review): the objects stored by create_embeddings_1_sentence are
            whole model outputs - confirm they expose ``.tolist()`` or extract
            the tensor of interest before calling this.

    Returns:
        A copy of ``df`` whose "output_1_sentence" cells are lists of plain
        nested lists. The input frame is not modified.
    """
    converted = df.copy()
    # The original appended to an undefined name (`individual_token`), which
    # raised NameError and left every per-article list empty.
    converted["output_1_sentence"] = [
        [sentence_output.tolist() for sentence_output in article]
        for article in df["output_1_sentence"]
    ]
    return converted


def outputs_from_csv(df):
    """Rebuild the "output_1_sentence" column from its string form after ``pd.read_csv``.

    Inverse of ``outputs_to_csv``: each cell is parsed with
    ``ast.literal_eval`` and every per-sentence entry becomes a tensor.

    Args:
        df: Frame whose "output_1_sentence" cells are string representations
            of nested lists (one entry per sentence).

    Returns:
        A copy of ``df`` whose "output_1_sentence" cells are lists of
        ``torch.Tensor``. The input frame is not modified.
    """
    restored = df.copy()
    # The original appended to an undefined name (`individual_token`), which
    # raised NameError and left every per-article list empty.
    restored["output_1_sentence"] = [
        [torch.tensor(sentence_output) for sentence_output in ast.literal_eval(cell)]
        for cell in df["output_1_sentence"]
    ]
    return restored

In [None]:
# Now that we have the model outputs (13 hidden states per sentence: the embedding-layer output plus the output of each of the 12 hidden layers), we need to extract the word/sentence embeddings.
# Let's start with sentence embeddings.
# Inspired by https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/ though not exactly the same.

def bert_output_sentence_converter(output):
    """Turn one sentence's BERT output into a single sentence embedding.

    The last four hidden layers are summed per token (extraction strategy
    discussed in http://jalammar.github.io/illustrated-bert/), and the token
    vectors are then averaged into one vector.

    Args:
        output: Model output for a single sentence; ``output[2]`` must be the
            sequence of per-layer hidden states, each shaped
            ``[batch=1, tokens, hidden]`` (``[1, 52, 768]`` per the notebook's
            settings).

    Returns:
        1-D tensor of size ``hidden`` - the sentence embedding.
    """
    # [layers, 1, tokens, hidden]
    hidden_layers_output = torch.stack(output[2], dim=0)

    # Sum the LAST FOUR layers. The original used `[-4:][0]`, which kept only
    # layer -4 and silently dropped the other three.
    sum_of_four_layers = hidden_layers_output[-4:].sum(dim=0)

    # Drop the batch dimension -> [tokens, hidden]
    individual_word_embeddings = torch.squeeze(sum_of_four_layers, dim=0)

    # Sentence embedding = average of all token embeddings.
    # NOTE: no attention mask is applied here, so padded positions are averaged in too.
    sentence_embeddings = torch.mean(individual_word_embeddings, dim=0)

    return sentence_embeddings


# One sentence embedding per sentence, nested as [article][sentence] -> tensor.
bert_sentence_embeddings = [
    [
        bert_output_sentence_converter(sentence_output)
        for sentence_output in df["output_1_sentence"][row]
    ]
    for row in range(df.shape[0])
]



In [None]:
# Sanity check: number of articles embedded, then the size of one sentence embedding
# (second sentence of the first article - assumes that article has at least two sentences).
print(len(bert_sentence_embeddings)) 
print(bert_sentence_embeddings[0][1].size())
# bert_sentence_embeddings is nested as [article][sentence] -> one embedding tensor per sentence

In [None]:
# NOTE(review): `dfwf` is not referenced anywhere else in the visible notebook -
# looks like leftover scratch state; confirm and remove.
dfwf=1

In [None]:
#Word Embeddings will be similar to Sentence embeddings

#This is inspired by https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/ though how the word embedding itself is created is different
def bert_output_word_converter(output):
    """Turn one sentence's BERT output into a list of per-token embeddings.

    Each token embedding is the sum of that token's vectors in the last four
    hidden layers (extraction strategy discussed in
    http://jalammar.github.io/illustrated-bert/).

    Args:
        output: Model output for a single sentence; ``output[2]`` holds the
            per-layer hidden states (``[13, 1, 52, 768]`` once stacked, per the
            notebook's settings).

    Returns:
        list of 1-D tensors, one per token position.
    """
    # Stack layers -> [layers, batch, tokens, hidden], then drop the single batch dim.
    layer_major = torch.squeeze(torch.stack(output[2], dim=0), dim=1)

    # Reorder to [tokens, layers, hidden] so iteration goes token by token.
    token_major = layer_major.permute(1, 0, 2)

    last_four = (-4, -3, -2, -1)
    return [
        torch.sum(torch.stack([token_layers[layer] for layer in last_four]), dim=0)
        for token_layers in token_major
    ]


# Per-token embeddings, nested as [article][sentence] -> list of token tensors.
bert_word_embeddings = [
    [
        bert_output_word_converter(sentence_output)
        for sentence_output in df["output_1_sentence"][row]
    ]
    for row in range(df.shape[0])
]



In [None]:
# Now that we have a TF-IDF matrix, we notice that there is a problem with sizing. For the sake of this project,
# we will pool the TF-IDF features with a linear layer.

In [None]:
#Step 3
# Transformer. While we already have established "groupings" from the 20 Newsgroups dataset (such as cars or certain sports), with this massive amount of data
# I want to look at whether sentence embeddings pulled from BERT work significantly better than, say, a simple TF-IDF vector as a representation of the data
# when grouping these articles without a label.

class Model(torch.nn.Module):
    """Recurrent classifier over BERT embeddings.

    Pipeline: single-layer RNN -> Linear + ReLU -> Linear + ReLU -> Linear ->
    softmax over the class dimension.

    Args:
        input_length: Size of each input feature vector (the RNN ``input_size``).
        lstm_hidden_size: Size of the RNN hidden state, which is also the input
            width of the first dense layer.
        sequence_length: When greater than 1, the RNN outputs are summed over
            the time dimension before the dense layers.
        num_of_classes: Number of output classes.
    """

    def __init__(self,input_length=768,lstm_hidden_size=768,sequence_length=1,num_of_classes=20):
        super().__init__()

        self.input_length=input_length
        self.lstm_hidden_size=lstm_hidden_size
        self.sequence_length=sequence_length
        self.num_of_classes=num_of_classes

        # Step 1: RNN. Sized from the constructor arguments; the original
        # hard-coded 768/128, so any other configuration (including the
        # defaults) failed with a shape mismatch in forward().
        self.rnn= torch.nn.RNN(input_size=input_length,hidden_size=lstm_hidden_size,num_layers=1)

        # Step 2: dense layers followed by ReLU in forward()
        self.relu_layer_1=torch.nn.Linear(lstm_hidden_size,128)
        self.relu_layer_2=torch.nn.Linear(128,64)

        # Step 3: linear projection to class scores
        self.linear_layer_1=torch.nn.Linear(64,num_of_classes)

        # Step 4: softmax over the class axis. dim=-1 equals the original dim=2
        # for 3-D activations and keeps working after the time axis is summed away.
        self.soft_max=torch.nn.Softmax(dim=-1)

    def forward(self, x):
        """Compute class probabilities.

        Args:
            x: Tensor shaped ``(seq_len, batch, input_length)``.

        Returns:
            Probabilities shaped ``(seq_len, batch, num_of_classes)``, or
            ``(batch, num_of_classes)`` when ``sequence_length > 1``.
        """
        # Zero initial hidden state, matching x's batch size, dtype and device.
        # The original drew a fresh random h0 on every call, which made even
        # inference results nondeterministic.
        h0 = x.new_zeros(1, x.size(1), self.lstm_hidden_size)

        # Step 1: RNN
        x,hn =self.rnn(x,h0)

        if self.sequence_length>1:
            x=torch.sum(x,dim=0)

        # Step 2: ReLU dense layers
        y=torch.nn.functional.relu(self.relu_layer_1(x))
        y=torch.nn.functional.relu(self.relu_layer_2(y))

        # Step 3: linear layer
        y=self.linear_layer_1(y)

        # Step 4: softmax layer
        predictions=self.soft_max(y)

        return(predictions)



In [None]:
# The idea of my loss function is that there needs to be a way to "reward" the model if it predicts
# a similar category (i.e. if an article on atheism is mistaken for an article on Christianity, that should be penalized less than if it were predicted as a basketball article).

def individual_loss_function(predictions,reality):
    """Cross-entropy plus a softer "right ballpark" term.

    ``loss = -log(P(true group)) - log(P(true class))`` where ``P(true group)``
    is the total probability assigned to every class in the same topical group
    as the true class. Mistaking a class for a sibling in its group therefore
    costs less than predicting an unrelated class.

    Args:
        predictions: Class probabilities with the classes on the last axis,
            e.g. shaped ``(1, 1, 20)`` as returned by ``Model``. Leading axes
            of size 1 are squeezed away (at most two, as before).
        reality: Integer id of the true class.

    Returns:
        The loss tensor; a 0-dim scalar for a single prediction vector.

    Raises:
        ValueError: If ``reality`` does not belong to any group.
    """
    # Groups of class ids: religion, computers, for-sale, recreation, science, politics.
    # The first group previously listed 18 instead of 19, which left class 19 without
    # a group (infinite loss) and put class 18 into two groups.
    key=[[0,15,19],[1,2,3,4,5],[6],[7,8,9,10],[11,12,13,14],[16,17,18]]

    reality = int(reality)
    group = next((members for members in key if reality in members), None)
    if group is None:
        raise ValueError(f"class id {reality} does not belong to any group")

    probs = predictions.squeeze(dim=0).squeeze(dim=0)
    # Smallest positive value of the dtype: keeps log() finite if a probability underflows to 0.
    eps = torch.finfo(probs.dtype).tiny

    # Loss 1: did the probability mass land in the right group? Index by CLASS ID
    # (the original indexed by position inside the group) and keep the result in
    # the autograd graph (the original re-wrapped it in torch.tensor, detaching it).
    group_prob = probs[..., group].sum(dim=-1)
    loss1 = -torch.log(group_prob.clamp_min(eps))

    # Loss 2: ordinary cross-entropy on the exact class.
    loss2 = -torch.log(probs[..., reality].clamp_min(eps))

    return loss1 + loss2
    
def run_model_sentence(bert_sentence_embeddings,epoch=100,learning_rate=.05):
    """Train a ``Model`` on sentence embeddings, one SGD step per sentence.

    Args:
        bert_sentence_embeddings: Nested list ``[article][sentence]`` of 1-D
            sentence embedding tensors.
        epoch: Number of passes over all articles.
        learning_rate: SGD learning rate (default matches the original 0.05).

    Returns:
        tuple ``(sentence_model, final_predictions)``. ``final_predictions``
        holds one class id per article: the most frequent per-sentence
        prediction made during the last epoch, or ``-1`` for an article with
        no sentences (the original crashed on such articles).
    """
    # Labels for exactly the articles that were embedded.
    article_realities=twenty_news_train.target[0:len(bert_sentence_embeddings)]

    sentence_model=Model(input_length=768,lstm_hidden_size=128)
    optimizer = torch.optim.SGD(sentence_model.parameters(), lr=learning_rate)
    final_predictions=[]
    loss=None  # stays None if no sentence has been seen yet

    for epochs in range(epoch):
        is_last_epoch = epochs==(epoch-1)

        for article, article_reality in zip(bert_sentence_embeddings, article_realities):
            prediction_list=[]

            for sentence_embedding in article:
                optimizer.zero_grad()

                # (hidden,) -> (seq_len=1, batch=1, hidden)
                x=sentence_embedding.unsqueeze(dim=0).unsqueeze(dim=0)

                predictions=sentence_model(x)
                loss= individual_loss_function(predictions,article_reality)

                loss.backward()
                optimizer.step()

                if is_last_epoch:
                    prediction_list.append(predictions.argmax().item())

            if is_last_epoch:
                if prediction_list:
                    # Majority vote over the article's sentences.
                    article_prediction=max(set(prediction_list), key=prediction_list.count)
                else:
                    # No sentences -> nothing to vote on; keep the list aligned with the articles.
                    article_prediction=-1
                final_predictions.append(article_prediction)

        # Progress report: most recent sentence loss every 20 epochs.
        if epochs%20==0 and loss is not None:
            print(loss)

    return(sentence_model,final_predictions)
        
        
        
def run_model_word(bert_word_embeddings,epoch=100,learning_rate=.05):
    """Train a ``Model`` on per-token embeddings, one SGD step per sentence.

    Each sentence is fed to the RNN as a sequence of token embeddings. The
    class distribution at the final time step is used both for the loss and
    for the prediction.

    The original passed all time steps to the loss function, where
    ``predictions[reality]`` selected a time step rather than a class, and then
    averaged the result over the 20 classes; it also hard-coded time step 51.

    Args:
        bert_word_embeddings: Nested list ``[article][sentence]`` whose entries
            are lists of 1-D token embedding tensors.
        epoch: Number of passes over all articles.
        learning_rate: SGD learning rate (default matches the original 0.05).

    Returns:
        tuple ``(word_model, final_predictions)``. ``final_predictions`` holds
        one class id per article: the most frequent per-sentence prediction
        made during the last epoch, or ``-1`` for an article with no sentences
        (the original crashed on such articles).
    """
    # Labels for exactly the articles that were embedded.
    article_realities=twenty_news_train.target[0:len(bert_word_embeddings)]

    word_model=Model(input_length=768,lstm_hidden_size=128)
    optimizer = torch.optim.SGD(word_model.parameters(), lr=learning_rate)
    final_predictions=[]
    loss=None  # stays None if no sentence has been seen yet

    for epochs in range(epoch):
        is_last_epoch = epochs==(epoch-1)

        for article, article_reality in zip(bert_word_embeddings, article_realities):
            prediction_list=[]

            for sentence in article:
                optimizer.zero_grad()

                # list of (hidden,) token tensors -> (tokens, hidden) -> (tokens, batch=1, hidden)
                x=torch.stack(sentence[:]).unsqueeze(dim=1)

                predictions=word_model(x)

                # Class probabilities after the RNN has read the whole sentence: (1, classes)
                final_step=predictions[-1]

                loss= individual_loss_function(final_step,article_reality)

                loss.backward()
                optimizer.step()

                if is_last_epoch:
                    word_prediction=torch.argmax(final_step.squeeze(dim=0))
                    prediction_list.append(word_prediction.item())

            if is_last_epoch:
                if prediction_list:
                    # Majority vote over the article's sentences.
                    article_prediction=max(set(prediction_list), key=prediction_list.count)
                else:
                    # No sentences -> nothing to vote on; keep the list aligned with the articles.
                    article_prediction=-1
                final_predictions.append(article_prediction)

        # Progress report: most recent sentence loss every 20 epochs.
        if epochs%20==0 and loss is not None:
            print(loss)

    return(word_model,final_predictions)




In [None]:
#Step 4
#Now how do we determine success? lets look at some insights

def precise_confusion_matrix(actual,prediction_orig,length):
    """Print and return exact-class accuracy over the first ``length`` items.

    Despite the name, this computes a single accuracy value, not a matrix.

    Args:
        actual: Sequence of true class ids (at least ``length`` long).
        prediction_orig: Sequence of predicted class ids (at least ``length`` long).
        length: Number of leading items to compare.

    Returns:
        float: Fraction of positions where prediction equals the true class;
        ``nan`` when ``length`` is not positive.

    Raises:
        ValueError: If ``actual`` has fewer than ``length`` items.
    """
    if length <= 0:
        accuracy = float("nan")
        print(accuracy)
        return accuracy

    list_of_predictions=np.array([prediction_orig[a] for a in range(length)])

    # Compare like with like: the original compared `length` predictions against
    # ALL of `actual`, which only works when both happen to have the same length.
    actual=np.asarray(actual)[:length]
    if len(actual) != length:
        raise ValueError("`actual` has fewer than `length` items")

    correct = (list_of_predictions == actual)
    accuracy = float(correct.sum()/length)
    print(accuracy)
    return accuracy
    
    
    
def general_confusion_matrix(actual,prediction_orig,length):
    """Print and return group-level accuracy over the first ``length`` items.

    A prediction counts as correct when it falls in the same topical group as
    the true class. Despite the name, this computes a single accuracy value,
    not a matrix.

    Args:
        actual: Sequence of true class ids (at least ``length`` long). It is
            NOT modified; the original overwrote it in place with group ids,
            which corrupts the dataset targets when given an array view.
        prediction_orig: Sequence of predicted class ids (at least ``length`` long).
        length: Number of leading items to compare.

    Returns:
        float: Fraction of positions whose predicted and true classes share a
        group; ``nan`` when ``length`` is not positive. Ids outside every
        group (e.g. a ``-1`` placeholder) never count as correct.
    """
    # Groups of class ids: religion, computers, for-sale, recreation, science, politics.
    # The first group previously listed 18 instead of 19, leaving class 19 ungrouped.
    key=[[0,15,19],[1,2,3,4,5],[6],[7,8,9,10],[11,12,13,14],[16,17,18]]
    group_of = {label: group for group, members in enumerate(key) for label in members}

    if length <= 0:
        accuracy = float("nan")
        print(accuracy)
        return accuracy

    hits = 0
    for b in range(length):
        # Distinct fallbacks so two unknown ids are never treated as a match.
        predicted_group = group_of.get(int(prediction_orig[b]), -1)
        actual_group = group_of.get(int(actual[b]), -2)
        if predicted_group == actual_group:
            hits += 1

    accuracy=float(hits/length)
    print(accuracy)
    return accuracy

    
     
    
   
