In [2]:
# First let import the most necessary libs
import pandas as pd
import numpy as np

import openai as openai
import math
import time
import matplotlib.pyplot as plt
import whisper

# Library to import pre-trained model for sentence embeddings
from sentence_transformers import SentenceTransformer

# Calculate similarities between sentences
from sklearn.metrics.pairwise import cosine_similarity

# Visualization library
import seaborn as sns
import matplotlib.pyplot as plt

# package for finding local minimas
from scipy.signal import argrelextrema


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#this code takes in the audio file and writes it to a txt file. 

def GetTranscriptFromAudio(path, audiofilename, type):

    # Setup, define, and run Whisper
    model = whisper.load_model("base")
    input = path + audiofilename + type
    output = audiofilename + ".txt"
    
    with open(path + output, 'w') as file:
     file.write(model.transcribe(input)["text"])


In [4]:
# This function takes the text input from the youtube video and uses GPT3 to turn it into a cohesive 
# series of thoughts. Those can later be turned into a twitter thread


#a bunch of code from This guy -> https://medium.com/@npolovinkin/how-to-chunk-text-into-paragraphs-using-python-8ae66be38ea6
def rev_sigmoid(x:float)->float:
    return (1 / (1 + math.exp(0.5*x)))
    
def activate_similarities(similarities:np.array, p_size=10)->np.array:
    """ Function returns list of weighted sums of activated sentence similarities

    Args:
        similarities (numpy array): it should square matrix where each sentence corresponds to another with cosine similarity
        p_size (int): number of sentences are used to calculate weighted sum 

    Returns:
        list: list of weighted sums
    """
    # To create weights for sigmoid function we first have to create space. P_size will determine number of sentences used and the size of weights vector.
    x = np.linspace(-10,10,p_size)
    # Then we need to apply activation function to the created space
    y = np.vectorize(rev_sigmoid) 
        # Because we only apply activation to p_size number of sentences we have to add zeros to neglect the effect of every additional sentence and to match the length ofvector we will multiply
    activation_weights = np.pad(y(x),(0,similarities.shape[0]-p_size))
    ### 1. Take each diagonal to the right of the main diagonal
    diagonals = [similarities.diagonal(each) for each in range(0,similarities.shape[0])]
    ### 2. Pad each diagonal by zeros at the end. Because each diagonal is different length we should pad it with zeros at the end
    diagonals = [np.pad(each, (0,similarities.shape[0]-len(each))) for each in diagonals]
    ### 3. Stack those diagonals into new matrix
    diagonals = np.stack(diagonals)
    ### 4. Apply activation weights to each row. Multiply similarities with our activation.
    diagonals = diagonals * activation_weights.reshape(-1,1)
    ### 5. Calculate the weighted sum of activated similarities
    activated_similarities = np.sum(diagonals, axis=0)
    return activated_similarities
  

def CreateModularContent(path, transcript, sentencetransformer):
    #reading the desired file
    with open(path+transcript + ".txt", 'r') as file:
     contents = file.read()
    
    contents = contents.replace("?", ".")
    
    #separating the file into an array based on when there are periods. 
    list_of_contents = contents.split(".")
    
    # Get the length of each sentence
    sentence_length = [len(each) for each in list_of_contents]
    
    
    # Determine longest outlier
    long = np.mean(sentence_length) + np.std(sentence_length) *2
    
    # Determine shortest outlier
    short = 20 #np.mean(sentence_length) - np.std(sentence_length) *2
    
    
    # Shorten long sentences
    text = ''
    for each in list_of_contents:
        if len(each) > long:
            # let's replace all the commas with dots
            each.replace(',', '.') 
        else:
            text+= f'{each}.'
    count = 0        
    list_of_contents = text.split('.')
    
    # Now let's concatenate short ones
    text = ''
    
    for each in list_of_contents:
        if len(each) < short:
            text+= f'{each}'
        else:
            text+= f'{each}.'
    
    list_of_contents = text.split(".")
    
    
    embeddings = sentencetransformer.encode(list_of_contents)

        
    # Create similarities matrix
    similarities = cosine_similarity(embeddings)
    
    # Lets apply activated_similarities. For long sentences i reccomend to use 10 or more sentences (not sure what p_size does)
    activated_similarities = activate_similarities(similarities, p_size=similarities.shape[0])

    ### 6. Find relative minima of our vector. For all local minimas and save them to variable with argrelextrema function
    minmimas = argrelextrema(activated_similarities, np.less, order=2) #order parameter controls how frequent should be splits. I would not reccomend changing this parameter.
    # plot the flow of our text with activated similarities

    #Get the order number of the sentences which are in splitting points
    split_points = [each for each in minmimas[0]]
   
   # Create empty string
    text = ''
    for num,each in enumerate(list_of_contents):
        # Check if sentence is a minima (splitting point)
        if num in split_points:
            # If it is than add a dot to the end of the sentence and a paragraph before it.
            text+=f'\n {each}.'
        else:
            # If it is a normal sentence just add a dot to the end and keep adding sentences.
            text+=f'{each}. '
   
    with open(path + transcript + "_modular" + ".txt", 'w') as f:
        f.write(text)

    return transcript + "_modular"   



In [5]:
def aggregate(raw_file, agg):
    thought = raw_file.readline()
    for i in range(agg-1):
        if raw_file.readline() == "": 
            agg += 1
        else:      
            thought += raw_file.readline()
    return thought

def SummarizeThoughts(path, filename, openai, agg, iterations): 
# Imports GPT3 model. Using davinci at the moment for final outputs. Curie for testing. 

    #Wondering if we can retrieve the model earlier on -> so we don't have to do this multiple times. 
    #openai.Model.retrieve("text-curie-001")
    openai.Model.retrieve("text-davinci-002")
    limcurie = 2040
    limdavinci = 2040

    #File management. 
    cleaned_file = open(path + filename + "_" + str(iterations) + ".txt", "w")
    raw_file = open(path+filename + ".txt")
    thought = aggregate(raw_file, agg)
    counter = 0
    
    
    # structures the base prompt for the model
    #TO BE UPDATED. I want to train my own version of this. 
    #base_prompt = "Paragraph:So yeah, do you see in those ecosystems really cool as pop in? Lots of cool projects, many more I forgot a bunch, but yeah, Jocelyn is always curating this cool landscape, so just check it out. I have the Twitter right there. And yeah, so we just heard about it. So sharing scientific data is super important. Why? Because, well, if we share data, we can collaborate much more easily. We can build bigger data sets and bigger data sets means more statistical power, reliable results, right? So that's pretty cool. And it also means more access to the data that, so there's not the same access to cool instruments that help you with data collection across labs. So if you're in an underfunded research institution, you just may not have the ability to collect the same type of data that a well-funded institution may have. So if we all share data, we all have better access to make cool scientific discoveries. So that's pretty cool, right? But also sharing scientific data right now. It's pretty expensive, it's pretty vulnerable because it's stored on centralized databases where we just have to trust that they keep the database running. It's also not rewarded. So currently, what counts in science is having your PDF cited, but it doesn't matter if you make your data accessible, like you just cannot accrue credit to it. Or there's some ways you can, but it's just not really easy. And it's also pretty painful. So there's a couple of repos out there where you can store your data. These are funded by some governmental institutions. There you access not great. And then also, if you want to find the data, you need to know which repo it's stored at. So you need to find the repo. Then you need to find the data. It's all, it's a hassle, so it's not great.\nExample Summary:Sharing scientific data is important as it allows for better collaboration, bigger data sets, reliable results, and better access for researchers in underfunded institutions. However, currently sharing data is expensive, vulnerable, and not rewarded. It is stored on centralized databases which requires that we trust those servers to keep running. Also, there are no incentives for for making the data accessible. Currently, the only way that we can give credit for using someone else's work is citing their PDF. But with PDF citations, it doesn't matter if you make your data accessible. Sharing data right now isn't worth the cost and time for the researcher.\nParagraph:"

    base_prompt = "Paragraph:In particular the last five years has been full time at this organization called the GoFair Foundation. These slides are available. differences that are in here too. Okay, so what is this idea going fair and doing fair? So this presentation will be in two parts and the first part going fair is really about kind of the history of where this idea came from about fair and kind of how it's matured up until today and then in the next section on doing fair, just to kind of give you a snapshot of some state of the art developments right now on the implementation of the fair principles and kind of, you know, the doors that opens up to kind of a new way of looking at computing and a new way of doing information exchange and being able to access larger amounts of data for kind of, you know, distributed learning type analyses.\nSummary:What is going and doing fair? We’ll break this down into two parts - first we’ll explore the history of fair and how it has matured until today, then we’ll explore the implementation of the Fair principles as a novel method of computing, information exchange, and distributed learning analysis.\nParagraph:Okay, so I'm going to start here with a picture from, that I took in 2019 in the gentleman at the podium, his name is George Straun and in 1995, he was the director of something called the NSF Net, the National Science Foundation Network and would George is saying at this, in this presentation here is that in 2019, the internet was now 50 years old. So in particular, what he's really referring to is that this key technology of the modern internet, TCP IP was invented in 1969 and underwent a lot of research and development for 20 years. Then there was another 10 years where TCP IP was used to implement the NSF Net and the goal of the project was to connect, I think, 100 American universities to the national supercomputing centers that had grown up in the United States. And it's kind of strange to think about it now, but just the idea that there were local computing networks and if you wanted to interconnect them, that that was a real engineering problem. It was interoperability of the networks. And so this TCP IP was helping to create what they call the internet, the interoperable network. But it was, you know, at that time, a real engineering problem, right?\nSummary:In 1995, George Straun was the director of the National Science Foundation Network (NSF Net). And the goal of the NSF at that time was to connect 100 American Universities to the national supercomputing centers. Now, while this seems simple now the key technology of the internet, TCP IP, was invented just 26 years earlier in 1969, and finding a way to connect many local networks running on this invention was a huge engineering challenge. They were asking the question, how do we make a global internet? An interoperable network.\nParagraph:"
    while thought:
        # Clean off white space for OpenAI
        thought=thought.strip()
        #if len(thought) > limcurie: 
            #split the thought preferebly. 
                #

        # Prepare full prompt
        p = base_prompt + thought + "\nSummary:"

        # Model parameters were determined through sandbox testing. Temp is fairly high to allow the model
        response = openai.Completion.create(
            #note on this -> I'm suspicious -> why are we defining this twice? Play around with cleaning this.
            #model = "text-curie-001",
            model="text-davinci-002",
            prompt = p,
            max_tokens=400,
            temperature=0.7,
            top_p=1,
            frequency_penalty=0.5,
            presence_penalty=0.5
        ) 

        # answer logging and .txt formatting
        cleaned_file.write(response["choices"][0]["text"] + "\n")

        
        counter+=1
        print(counter)
        # A sleep counter because microsoft keeps limiting my creativity
        if counter%30==0 and counter!=0:
            print("\n\n\nI am so sleepy\n\n\n")
            time.sleep(60)
        
        
        thought = aggregate(raw_file, agg)

    cleaned_file.close()
    raw_file.close()
    
    #iterater
    if iterations > 1: 
        filename = filename + "_" + str(iterations)
        SummarizeThoughts(path, filename, openai, 3, iterations-1)



In [6]:
#This script should take you from an audio output to a TLDR type medium article in the 
#DeSci Foundation voice. 

#define the path you want materials to be saved too
#export OPENAI_API_KEY=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

openai.api_key = "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"
path = "/Users/desot1/Documents/GitHub/DeSci-Experiments/"
model = SentenceTransformer('all-MiniLM-L6-v2')
audiofile = "BertAndErikConversationRecording"
type = ".m4a"

#openai api fine_tunes.create -t <train_file>


#Step one is to transcribe the audio using Whisper. 
GetTranscriptFromAudio(path, audiofile, type)

#Step two is to take the transcript and turn it into paragraphs
modular = CreateModularContent(path, audiofile, model)

#Step three is to take the modular content and summarize each paragraph 
SummarizeThoughts(path, modular, openai, 1, 2)



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30



I am so sleepy



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
