In [1]:
!pip install nltk



In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

# Make sure the NLTK data packages this notebook relies on (stop-word list,
# WordNet data for the lemmatiser, and the punkt tokenizer models) are present.
# Only packages that cannot be found locally are downloaded, so the notebook
# runs on a fresh machine without re-fetching data on every execution.
for _resource_path, _package_name in (
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
    ("corpora/omw-1.4", "omw-1.4"),
    ("tokenizers/punkt", "punkt"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name, quiet=True)

# English stop words, held in a set for fast membership tests when filtering.
stop_words = set(stopwords.words("english"))
# Single lemmatiser instance reused for every token.
lemmatizer = WordNetLemmatizer()

In [3]:
# Load the dataset workbook into a DataFrame. The path is relative, so the
# file must sit in the notebook's working directory.
DATASET_FILE = 'Dataset_BA1B.xlsx'
df = pd.read_excel(DATASET_FILE)

In [4]:
# Show the column labels to see which fields the dataset provides.
df.columns

Index(['CORE', 'SECTION', 'SECTION_UNIT', '% of fractures', 'IMAGES',
       'SEGMENTATION', 'TOP_DEPTH', 'UNIT_DESC4', 'VEIN_INTENSITY',
       'ALTERATION', 'REMARKS1', 'REMARKS2', 'REMARKS4', 'REMARKS5',
       'UNIT_DESC5', 'UNIT_DESC3', 'UNIT_TYPE_Dunite', 'UNIT_TYPE_Fault rock',
       'UNIT_TYPE_Gabbro', 'UNIT_TYPE_Harzburgite', 'UNIT_TYPE_Metagabbro',
       'UNIT_TYPE_Other', 'UNIT_CLASS_OPHIO', 'UNIT_CLASS_UND',
       'TEXTURES_Brecciated', 'TEXTURES_Sheared',
       'GRAINSIZE_Cryptocrystalline', 'GRAINSIZE_Fine grained',
       'GRAINSIZE_Medium grained', 'GRAINSIZE_Microcrystalline',
       'GRAINSIZE2_Coarse grained', 'GRAINSIZE2_Cryptocrystalline',
       'GRAINSIZE2_Fine grained', 'GRAINSIZE2_Medium grained',
       'GRAINSIZE2_Pegmatitic'],
      dtype='object')

In [5]:
# Normalise the free-text remark columns: missing values and cells whose value
# is exactly 'none' both become empty strings, so the columns can be
# concatenated as plain text afterwards.
remark_columns = ['REMARKS1', 'REMARKS2', 'REMARKS4', 'REMARKS5']
df[remark_columns] = df[remark_columns].fillna('').replace('none', '')

In [6]:
# Build one combined remark per row by appending the remaining remark columns
# to REMARKS1, left to right, separated by single spaces.
combined_remarks = df['REMARKS1']
for column in ('REMARKS2', 'REMARKS4', 'REMARKS5'):
    combined_remarks = combined_remarks + ' ' + df[column]
df['REMARKS_ALL'] = combined_remarks

In [7]:
# Preview the combined remark text for each row.
df['REMARKS_ALL']

0      microcrystalline carbonate visible on vein sur...
1      Irregular network of black serp veins (2-5 mm ...
2      Serp mesh texture cross-cut by later, < 0.2 mm...
3        MICROBIO SAMPLE. Serp dunite with a microgab...
4      mesh texture of black veins cross-cut by 0.5 m...
                             ...                        
650      Black serp hzb finishing by a px-poor zone b...
651      Black serp hzb starting with px-poor zone as...
652      Black serp hzb with very altered thin pxnite...
653      Black serp dunite with a offset microgabbroi...
654      Very altered microgabbro with white veins cr...
Name: REMARKS_ALL, Length: 655, dtype: object

In [8]:
# Association-measure helper for scoring two-word collocations.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [9]:
# Association-measure helper for scoring three-word collocations.
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [10]:
# Association-measure helper for scoring four-word collocations (NLTK calls
# these quadgrams).
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()

In [11]:
# Collapse every row's combined remark into one lowercase, comma-separated
# string so the whole column can be tokenised in a single call.
onestr = ','.join(df['REMARKS_ALL'].values).lower()

# Tokenise, skip English stop words, and lemmatise the surviving tokens in a
# single pass over the token stream.
words = [
    lemmatizer.lemmatize(token)
    for token in nltk.tokenize.word_tokenize(onestr)
    if token.casefold() not in stop_words
]

In [12]:
# Collect bigram statistics over the cleaned token sequence. The class is
# referenced through its module, the same way the association measures are.
finder = nltk.collocations.BigramCollocationFinder.from_words(words)

In [13]:
# Count how often each cleaned token occurs.
frequency_distribution = nltk.FreqDist(words)

In [14]:
# Discard bigrams seen fewer than this many times before ranking them.
MIN_BIGRAM_COUNT = 10
finder.apply_freq_filter(MIN_BIGRAM_COUNT)

In [15]:
# Rank the remaining bigrams by pointwise mutual information and show the top
# 100. `finder` is a bigram finder, so the scorer must come from the bigram
# association measures rather than the four-gram ones.
finder.nbest(bigram_measures.pmi, 100)

[('smell', 'sulphur'),
 ('shape', 'preferred'),
 ('preferred', 'orientation'),
 ('euhedral', 'calcite'),
 ('high', 'density'),
 ('appearance', 'masked'),
 ('microbio', 'sample'),
 ('crystal', 'shape'),
 ('strong', 'crystal'),
 ('mesh', 'texture'),
 ('angular', 'clast'),
 ('cross', 'cut'),
 ('coalescence', 'multiple'),
 ('forming', 'coalescence'),
 ('clay', 'mineral'),
 ('faint', 'lineation'),
 ('variable', 'thickness'),
 ('cutting', 'sample'),
 ('broken', 'surface'),
 ('masked', 'striation'),
 ('striation', 'caused'),
 ('medium', 'grained'),
 ('deg', 'intersection'),
 ('sigmoidal', 'veinlets'),
 ('lineation', 'slickensides'),
 ('variable', 'amount'),
 ('degree', 'dip'),
 ('core', 'barrel'),
 ('90', 'deg'),
 ('caused', 'core'),
 ('60', 'degree'),
 ('coarse', 'grain'),
 ('harzb', 'w/'),
 ('magmatic', 'intrusion'),
 ('open', 'crack'),
 ('amount', 'px'),
 ('coarse', 'grained'),
 ('alteration', 'halo'),
 ('highly', 'disrupted'),
 ('90', 'degree'),
 ('thin', 'pyroxenite'),
 ('massive', 'harz

In [16]:
# nltk.download('genesis')
# nltk.corpus.genesis.words('english-web.txt')

In [17]:
# nltk.download('punkt')
# nltk.tokenize.word_tokenize(','.join(df.REMARKS_ALL.values))

In [18]:
# nltk.download("stopwords")

In [19]:
# nltk.download('wordnet')

In [20]:
# nltk.download('omw-1.4')