# **Installing Libraries**

In [None]:
!pip install --quiet transformers == 4.5.0
!pip install --quiet sentencepiece == 0.1.95
!pip install --quiet textwrap3 == 0.9.2
!pip install --quiet nltk == 3.2.5
!pip install --quiet git+https://github.com/boudinfl/pke.git@dc4d5f21e0ffe64c4df93c46146d29d1c522476b
!pip install --quiet flashtext == 2.7

# **Setup Transformer**

In [None]:
from transformers import T5ForConditionalGeneration,T5Tokenizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet as wn
from textwrap import wrap
import numpy as np
import torch
import nltk


# Download the NLTK resources 'punkt', 'brown' and 'wordnet'.
for nltkResource in ( 'punkt', 'brown', 'wordnet' ):
    nltk.download( nltkResource )


# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" )

# 't5-base' tokenizer and model used for summarization; the model is moved
# to the selected device right after loading.
summaryTokenizer = T5Tokenizer.from_pretrained( 't5-base' )
summaryModel = T5ForConditionalGeneration.from_pretrained( 't5-base' ).to( device )

# **Summarize Text**

In [None]:
def PostProcesstext (content):
    """Upper-case the first character of every sentence in ``content``.

    The text is split with ``sent_tokenize`` and re-assembled with one leading
    space in front of each sentence, so a non-empty result starts with " "
    and empty input yields "".

    Only the first character of a sentence is changed. ``str.capitalize`` is
    deliberately not used: it also lower-cases the remainder of the sentence,
    which mangles names and acronyms (e.g. "... of Arthur Andersen" would
    become "... of arthur andersen").

    Args:
        content: Text to post-process.

    Returns:
        The re-assembled text with each sentence starting in upper case.
    """
    # A single join avoids rebuilding the result string once per sentence.
    return "".join( " " + sent[ :1 ].upper() + sent[ 1: ] for sent in sent_tokenize( content ) )


def Summerize( text, model, tokenizer ):
  """Summarize ``text`` with a T5 model and return the cleaned-up summary.

  Args:
    text: Source text. Surrounding whitespace is stripped and newlines are
      replaced by spaces before encoding.
    model: Seq2seq model providing ``generate`` and ``device``
      (a ``T5ForConditionalGeneration`` in this notebook).
    tokenizer: Tokenizer matching ``model``.

  Returns:
    The first beam-search result, passed through ``PostProcesstext`` and
    stripped of surrounding whitespace.
  """
  text = text.strip().replace( "\n", " " )
  # T5 selects its task from the text prefix; the summarization prefix is
  # "summarize: ". Any other prefix is just treated as part of the input.
  text = "summarize: " + text

  max_len = 512
  # Input longer than max_len tokens is truncated. A single sequence needs no
  # padding, so the legacy pad_to_max_length flag is dropped. Tensors are
  # moved to the model's own device rather than a module-level global.
  encoding = tokenizer( text, max_length = max_len, truncation = True, return_tensors = "pt" ).to( model.device )

  outs = model.generate( input_ids = encoding[ "input_ids" ],
                         attention_mask = encoding[ "attention_mask" ],
                         early_stopping = True,
                         num_beams = 3,
                         num_return_sequences = 1,
                         no_repeat_ngram_size = 2,
                         min_length = 75,
                         max_length = 300 )

  # num_return_sequences is 1, so only the first sequence is decoded.
  summary = tokenizer.decode( outs[ 0 ], skip_special_tokens = True )
  return PostProcesstext( summary ).strip()


text = """How does a teacher create a test? 
Probably he will look at the material and create questions from it. 
Wirschtl-Learner works on the same principle. 
He looks at the material and creates questions from it. 
Furthermore, with Wirschtl-Learner they can summarize their material and search for keywords in it."""

# Summarize the sample text with the summary model and tokenizer.
summarizedText = Summerize( text, summaryModel, summaryTokenizer )


# Print the input and the summary the same way: heading, the text wrapped to
# 150 columns, then a blank separator.
for heading, body in ( ( "\noriginal Text >>", text ), ( "Summarized Text >>", summarizedText ) ):
  print( heading )
  for wrappedLine in wrap( body, 150 ):
    print( wrappedLine )
  print( "\n" )

# **Question generation with T5**

In [None]:
# Checkpoint shared by the question-generation model and its tokenizer.
questionCheckpoint = 'ramsrigouthamg/t5_squad_v1'

question_tokenizer = T5Tokenizer.from_pretrained( questionCheckpoint )
# Load the model and move it to the selected device in one step.
question_model     = T5ForConditionalGeneration.from_pretrained( questionCheckpoint ).to( device )

In [6]:
def GetQuestion( context, answer, model, tokenizer ):
    """Generate one question about ``context`` whose answer is ``answer``.

    The prompt has the form ``"context: <context> answer: <answer>"`` and is
    truncated to 384 tokens.

    Args:
        context: Passage the question should be about.
        answer: Answer span the generated question should lead to.
        model: Seq2seq model providing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The generated question with any leading "question:" label
        (in any letter case) and surrounding whitespace removed.
    """
    text = "context: {} answer: {}".format( context, answer )
    # A single sequence needs no padding, so the legacy pad_to_max_length flag
    # is dropped. Tensors follow the model's own device so the function does
    # not depend on a module-level `device`.
    encoding = tokenizer( text, max_length = 384, truncation = True, return_tensors = "pt" ).to( model.device )

    outs = model.generate( input_ids = encoding[ "input_ids" ],
                           attention_mask = encoding[ "attention_mask" ],
                           early_stopping = True,
                           num_beams = 5,
                           num_return_sequences = 1,
                           no_repeat_ngram_size = 2,
                           max_length = 72 )

    # num_return_sequences is 1, so only the first sequence is decoded.
    question = tokenizer.decode( outs[ 0 ], skip_special_tokens = True ).strip()

    # The label arrives in lower case ("question: ..."), so an exact match on
    # "Question:" never fires; compare case-insensitively and only at the start.
    prefix = "question:"
    if question.lower().startswith( prefix ):
        question = question[ len( prefix ): ]
    return question.strip()

In [110]:
def GenerateQuestions( context, answerLength, numQuestions ):
    """Generate question/answer pairs from randomly chosen spans of ``context``.

    For every question a random sentence of ``context`` is picked, a random
    run of ``answerLength`` consecutive words from it becomes the answer, and
    ``GetQuestion`` (with the module-level ``question_model`` and
    ``question_tokenizer``) produces the matching question.

    Args:
        context: Source text; it is also passed in full to ``GetQuestion``.
        answerLength: Number of consecutive words per answer. Sentences with
            fewer words contribute the whole sentence instead of failing.
        numQuestions: Number of pairs to generate. Exactly this many are
            returned (the earlier nested loops produced numQuestions ** 2).

    Returns:
        A list of ``{"Question": str, "Answer": str}`` dicts. The list is
        empty when ``context`` contains no words.
    """
    # Sentence-tokenize instead of splitting on ".": a plain split cuts
    # numbers such as "5.19" in half and leaves empty fragments behind.
    # str.split() without arguments drops newlines and repeated spaces.
    candidates = []
    for sentence in sent_tokenize( context ):
        words = sentence.rstrip( ".!?" ).split()
        if words:
            candidates.append( words )

    data = []
    if not candidates:
        return data

    for _ in range( numQuestions ):
        words = candidates[ np.random.randint( 0, len( candidates ) ) ]

        # Clamp the span to the sentence so short sentences cannot raise.
        span = min( max( answerLength, 0 ), len( words ) )
        # randint's upper bound is exclusive; "+ 1" keeps the last valid
        # start index reachable.
        start = np.random.randint( 0, len( words ) - span + 1 )
        answer = " ".join( words[ start : start + span ] )

        ques = GetQuestion( context, answer, question_model, question_tokenizer )
        ques = ques.replace( "question: ", "" )

        # Upper-case only the first letter; str.capitalize would also
        # lower-case names and acronyms inside the answer.
        data.append( { "Question": ques, "Answer": answer[ :1 ].upper() + answer[ 1: ] } )
    return data

def SaveQuestions( questions, fileName ):
    """Write question/answer pairs to ``fileName``, one record per line.

    Each record has the form ``<Question>;<Answer>``. An existing file is
    overwritten.

    Args:
        questions: Iterable of dicts with the keys "Question" and "Answer".
        fileName: Path of the text file to write (UTF-8).
    """
    # The context manager closes the file even if a record is malformed.
    with open( fileName, "w", encoding = "utf-8" ) as file:
        for question in questions:
            # Terminate every record; without the newline all pairs run
            # together on a single line and cannot be separated again.
            file.write( question[ "Question" ] + ";" + question[ "Answer" ] + "\n" )

In [None]:
context = """Accenture emerged on January 1, 2001, through a name change from Andersen Consulting, a management consultancy founded in 1989. 
             The company was originally an affiliate of Arthur Andersen, the accounting firm involved in the Enron bankruptcy. 
             The name change came after the company lost the rights to the Andersen name in 2000 as part of a complete breakup.
             The new name Accenture is a made-up word from Accent on the future.
             After becoming a public company, Accenture has been listed on the NYSE (New York Stock Exchange) under the symbol "ACN" since July 19, 2001. 
             The board of directors includes Wulf von Schimmelmann, the former chairman of Postbank.
             Since September 1, 2009, the Group has been operating as a public limited company under Irish law and at the same time moved its registered office from the Bermuda Islands to Dublin.
             Accenture reported fiscal 2020 earnings of about $5.19 billion on annual revenue of about $44.33 billion, representing 4% year-over-year growth. 
             Accenture's shares were trading at $278.34 apiece on April 1, 2021. 
             Accenture's market capitalization was estimated at $185.13 billion as of April 2021."""

# File the generated question/answer pairs are written to.
questionsFile = "C:/Wurschtl_AI/Questions/questions.txt"

# Generate question/answer pairs from the sample context, show them and
# store them on disk.
answers = GenerateQuestions( context = context, answerLength = 4, numQuestions = 4 )
print( answers )
SaveQuestions( questions = answers, fileName = questionsFile )

# **Get similar words**

In [None]:
# Check the word-vector similarity between searchWord and every word in allWords.
# Two words count as similar when their similarity is above 0.98.

import gensim

searchWord   = "Benzin"
similarWords = []

# Candidate words to compare against; while this list is empty no comparison runs.
allWords = []

# load_word2vec_format belongs to KeyedVectors; the old
# Word2Vec.load_word2vec_format entry point was deprecated in gensim 1.0 and
# no longer works.
word2vec = gensim.models.KeyedVectors.load_word2vec_format( 'C:/Wurschtl_AI/model/GoogleNews-vectors-negative300.bin', binary = True )

# similarity() raises KeyError for words missing from the vocabulary, so both
# the search word and each candidate are checked first.
if searchWord in word2vec:
    for word in allWords:
        if word not in word2vec:
            continue
        sim = word2vec.similarity( searchWord, word )
        if sim > 0.98:
            similarWords.append( word )

# The earlier f-string turned {0} and {1} into the literal numbers 0 and 1
# before .format ran; a plain format string with both arguments prints the
# search word and the matches.
print( "Similar words to {0} are {1}".format( searchWord, similarWords ) )