In [40]:
!pip install nltk
!pip install tenacity



In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# English stop words, held in a set so the membership test used when
# filtering tokens is a hash lookup.
english_stop_words = stopwords.words("english")
stop_words = set(english_stop_words)

# Lemmatizer applied to every token that survives the stop-word filter.
lemmatizer = WordNetLemmatizer()

In [3]:
# Load the dataset from an Excel workbook; the path is relative, so the file
# must sit in the notebook's working directory.
df = pd.read_excel('Dataset_BA1B.xlsx')

In [4]:
# List the column names to see which fields the dataset provides.
df.columns

Index(['CORE', 'SECTION', 'SECTION_UNIT', '% of fractures', 'IMAGES',
       'SEGMENTATION', 'TOP_DEPTH', 'UNIT_DESC4', 'VEIN_INTENSITY',
       'ALTERATION', 'REMARKS1', 'REMARKS2', 'REMARKS4', 'REMARKS5',
       'UNIT_DESC5', 'UNIT_DESC3', 'UNIT_TYPE_Dunite', 'UNIT_TYPE_Fault rock',
       'UNIT_TYPE_Gabbro', 'UNIT_TYPE_Harzburgite', 'UNIT_TYPE_Metagabbro',
       'UNIT_TYPE_Other', 'UNIT_CLASS_OPHIO', 'UNIT_CLASS_UND',
       'TEXTURES_Brecciated', 'TEXTURES_Sheared',
       'GRAINSIZE_Cryptocrystalline', 'GRAINSIZE_Fine grained',
       'GRAINSIZE_Medium grained', 'GRAINSIZE_Microcrystalline',
       'GRAINSIZE2_Coarse grained', 'GRAINSIZE2_Cryptocrystalline',
       'GRAINSIZE2_Fine grained', 'GRAINSIZE2_Medium grained',
       'GRAINSIZE2_Pegmatitic'],
      dtype='object')

In [5]:
# Free-text remark columns that are normalised together.
remark_columns = ['REMARKS1', 'REMARKS2', 'REMARKS4', 'REMARKS5']

# Missing remarks become empty strings; cells whose whole value is the literal
# placeholder 'none' (exact match) are blanked as well.
df[remark_columns] = df[remark_columns].fillna('').replace('none', '')

In [6]:
# Concatenate the four remark columns row by row, separated by single spaces.
df['REMARKS_ALL'] = df['REMARKS1'].str.cat(
    [df['REMARKS2'], df['REMARKS4'], df['REMARKS5']],
    sep=' ',
)

In [7]:
# Preview the combined remarks column.
df['REMARKS_ALL']

0      microcrystalline carbonate visible on vein sur...
1      Irregular network of black serp veins (2-5 mm ...
2      Serp mesh texture cross-cut by later, < 0.2 mm...
3        MICROBIO SAMPLE. Serp dunite with a microgab...
4      mesh texture of black veins cross-cut by 0.5 m...
                             ...                        
650      Black serp hzb finishing by a px-poor zone b...
651      Black serp hzb starting with px-poor zone as...
652      Black serp hzb with very altered thin pxnite...
653      Black serp dunite with a offset microgabbroi...
654      Very altered microgabbro with white veins cr...
Name: REMARKS_ALL, Length: 655, dtype: object

In [8]:
# Association measures (e.g. PMI) for scoring two-word collocations.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [9]:
# Association measures for scoring three-word collocations.
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [10]:
# Association measures for scoring four-word collocations.
# NOTE(review): these are meant to be paired with a quadgram finder — confirm
# wherever this object is used.
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()

In [11]:
# Merge every row's remarks into one lower-cased, comma-separated string.
onestr = ','.join(df.REMARKS_ALL.values).lower()

# Tokenise, drop English stop words, then lemmatise whatever is left.
words = [
    lemmatizer.lemmatize(token)
    for token in nltk.tokenize.word_tokenize(onestr)
    if token.casefold() not in stop_words
]

In [12]:
# Count the bigrams (adjacent token pairs) in the cleaned token list.
finder = BigramCollocationFinder.from_words(words)

In [13]:
# Frequency count of each individual token.
frequency_distribution = FreqDist(words)

In [14]:
# Discard bigrams seen fewer than 10 times; the finder is modified in place.
finder.apply_freq_filter(10)

In [15]:
# Top 100 bigrams ranked by pointwise mutual information (PMI).
# `finder` is a bigram finder, so it must be scored with the bigram measures.
# The quadgram PMI assumes 4-grams and adds a constant offset to every score:
# the ranking comes out the same, but the values are not PMI.
finder.nbest(bigram_measures.pmi, 100)

[('smell', 'sulphur'),
 ('shape', 'preferred'),
 ('preferred', 'orientation'),
 ('euhedral', 'calcite'),
 ('high', 'density'),
 ('appearance', 'masked'),
 ('microbio', 'sample'),
 ('crystal', 'shape'),
 ('strong', 'crystal'),
 ('mesh', 'texture'),
 ('angular', 'clast'),
 ('cross', 'cut'),
 ('coalescence', 'multiple'),
 ('forming', 'coalescence'),
 ('clay', 'mineral'),
 ('faint', 'lineation'),
 ('variable', 'thickness'),
 ('cutting', 'sample'),
 ('broken', 'surface'),
 ('masked', 'striation'),
 ('striation', 'caused'),
 ('medium', 'grained'),
 ('deg', 'intersection'),
 ('sigmoidal', 'veinlets'),
 ('lineation', 'slickensides'),
 ('variable', 'amount'),
 ('degree', 'dip'),
 ('core', 'barrel'),
 ('90', 'deg'),
 ('caused', 'core'),
 ('60', 'degree'),
 ('coarse', 'grain'),
 ('harzb', 'w/'),
 ('magmatic', 'intrusion'),
 ('open', 'crack'),
 ('amount', 'px'),
 ('coarse', 'grained'),
 ('alteration', 'halo'),
 ('highly', 'disrupted'),
 ('90', 'degree'),
 ('thin', 'pyroxenite'),
 ('massive', 'harz

In [16]:
# nltk.download('genesis')
# nltk.corpus.genesis.words('english-web.txt')

In [17]:
# nltk.download('punkt')
# nltk.tokenize.word_tokenize(','.join(df.REMARKS_ALL.values))

In [18]:
# nltk.download("stopwords")

In [19]:
# nltk.download('wordnet')

In [20]:
# nltk.download('omw-1.4')

In [21]:
from summa.summarizer import summarize

In [22]:
# summarize(df['REMARKS_ALL'], words=5)

In [23]:
# Put each row's combined remarks on its own line for the summariser.
# Iterating the column directly avoids iterrows() and the positional `[0]`
# lookup on each row Series, which pandas has deprecated.
summa_data = '\n'.join(df['REMARKS_ALL'])

In [24]:
# Summarise all remarks with summa's summarize (imported above), asking for
# an output of about five words. The recorded run returned an empty string.
summarize(summa_data, words=5)

''

In [25]:
# Sanity check of summa's summarize on a short made-up text; split=True asks
# for a list of sentences instead of a single string. The recorded run
# returned an empty list.
summarize("""what is going on i have no idea the quick brown fox jumped over the lazy dog i love star trek
everything is complicated nobody is living everyone is dead i just want to eat beans
eating beans is a complex activity that requires a strong ability to digest legumes"""
          ,split=True)

[]

In [58]:
# with open('api-key.txt') as f:
#     openai_api_key = f.readline()

In [59]:
import openai
import ast

# Read the OpenAI API key from the first line of a local file, which keeps the
# secret out of the notebook. strip() drops the trailing newline (and any
# stray whitespace) that readline() keeps and that would corrupt the key.
with open('api-key.txt') as f:
    openai.api_key = f.readline().strip()

In [62]:
# https://tenacity.readthedocs.io/en/latest/api.html#tenacity.wait.wait_exponential
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def summarize(text, debug=False):
    """Ask the OpenAI chat model for ten keywords summarising ``text``.

    NOTE: this definition rebinds the name ``summarize`` that an earlier cell
    imported from ``summa.summarizer``; every call made after this cell runs
    uses the OpenAI-backed version.

    Any exception raised by the call triggers a retry with randomised
    exponential backoff (waits between 1 and 60 seconds), for at most six
    attempts in total.

    Args:
        text: The text to summarise; it is interpolated into the prompt as-is.
        debug: When truthy, print the full prompt before sending it.

    Returns:
        The response object from ``openai.ChatCompletion.create``. The
        generated text is at ``response['choices'][0]['message']['content']``.

    Raises:
        tenacity.RetryError: If all six attempts fail.
    """
    prompt = f"Please summarize the following text into ten keywords and explain why you picked each key word. The text to summarize is:{text}"

    if debug:
        print(prompt)

    return openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[{'role': 'user', 'content': prompt}],
    )

In [63]:
# Look at the first row's combined remarks.
df['REMARKS_ALL'][0]

'microcrystalline carbonate visible on vein surfaces. clasts are 90% angular and 10% rounded. This indicates that the thickness of alluvium is < few 10s cm and the bedrock is surfacing. angular fragments at 0 to 60 cm with mixed lithologies varying from serpentinised harzburgite to dunite. Serpentinization, oxidation, carbonation in veins'

In [65]:
# Summarise the first row's remarks; debug=True also prints the prompt sent.
summed = summarize(df['REMARKS_ALL'][0], debug=True)

Please summarize the following text into ten keywords and explain why you picked each key word. The text to summarize is:microcrystalline carbonate visible on vein surfaces. clasts are 90% angular and 10% rounded. This indicates that the thickness of alluvium is < few 10s cm and the bedrock is surfacing. angular fragments at 0 to 60 cm with mixed lithologies varying from serpentinised harzburgite to dunite. Serpentinization, oxidation, carbonation in veins


In [66]:
# Show only the generated text of the first choice in the response.
print(summed['choices'][0]['message']['content'])

1. Microcrystalline carbonate - this is the main mineral visible on the vein surfaces.
2. Clasts - these are rock fragments that make up the alluvium, with 90% being angular.
3. Rounded - only 10% of the clasts are rounded, indicating recent weathering.
4. Thickness - the thickness of the alluvium is less than a few tens of centimeters.
5. Bedrock - the bedrock is starting to surface.
6. Angular fragments - these are found at depths of 0-60cm and are composed of various lithologies.
7. Serpentinised harzburgite - a type of rock found in the angular fragments.
8. Dunite - another type of rock found in the angular fragments.
9. Serpentinization - a process that ties into the formation of serpentinised harzburgite.
10. Oxidation and carbonation - processes that affect the formation of the microcrystalline carbonate. 

I picked these keywords because they describe key aspects of the geological features present in the area being discussed. They give an overall picture of the type of rocks a

In [32]:
# summed2 = summarize(df['REMARKS_ALL'][0])
# print(summed2['choices'][0]['message']['content'])

In [33]:
# summed100 = summarize(df['REMARKS_ALL'][100])
# print(summed100['choices'][0]['message']['content'])

In [34]:
# summed200 = summarize(df['REMARKS_ALL'][200])
# print(summed200['choices'][0]['message']['content'])

In [35]:
# summed300 = summarize(df['REMARKS_ALL'][300])
# print(summed300['choices'][0]['message']['content'])

In [36]:
# summed400 = summarize(df['REMARKS_ALL'][400])
# print(summed400['choices'][0]['message']['content'])

In [37]:
# summed500 = summarize(df['REMARKS_ALL'][500])
# print(summed500['choices'][0]['message']['content'])

In [38]:
# summed600 = summarize(df['REMARKS_ALL'][600])
# print(summed600['choices'][0]['message']['content'])

In [39]:
# Summarise every row's combined remarks with summarize(), one API call per row.
# The column is iterated directly instead of via iterrows(), which builds a
# full Series per row just to read one field. This is kept as an explicit loop
# (not a comprehension) so that responses collected before a failure, such as
# a rate-limit error, remain available in text_summary. No intermediate
# `summed` variable is assigned, so the single-row result stored under that
# name earlier in the notebook is not overwritten.
text_summary = []

for remarks in df['REMARKS_ALL']:
    text_summary.append(summarize(remarks))

RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID adab90942be60454ddbce2d38e0c7ce9 in your message.)

In [None]:
# Show the raw response collected for the first row.
print(text_summary[0])