In [17]:
import pandas
# Read the papers CSV into a DataFrame and preview its first rows
csv_path = 'papers2.csv'
dataset = pandas.read_csv(csv_path)
dataset.head(5)

Unnamed: 0,id,year,title,abstract
0,1,1987,Self-Organization of Associative Database and ...,An efficient method of self-organizing associa...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,A single cell theory for the development of se...


In [19]:
# Word count for each abstract.
# Split on any run of whitespace: a plain .split(" ") produces empty tokens for
# repeated spaces and does not split on newlines or tabs, which skews the count.
# A missing abstract counts as 0 words instead of being counted as the single
# word "nan".
dataset['word_count'] = (
    dataset['abstract'].fillna('').astype(str).str.split().str.len()
)
dataset[['abstract','word_count']].head()

Unnamed: 0,abstract,word_count
0,An efficient method of self-organizing associa...,73
1,A single cell theory for the development of se...,91


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-abstract word counts
dataset['word_count'].describe()

count     2.000000
mean     82.000000
std      12.727922
min      73.000000
25%      77.500000
50%      82.000000
75%      86.500000
max      91.000000
Name: word_count, dtype: float64

In [22]:
# 20 most frequent raw (uncleaned, case-sensitive) tokens across all abstracts.
# Missing abstracts are dropped first: ' '.join() raises TypeError on NaN floats.
abstract_text = ' '.join(dataset['abstract'].dropna().astype(str))
freq = pandas.Series(abstract_text.split()).value_counts().head(20)
freq

of           10
the           7
to            6
and           5
a             5
an            4
that          3
In            3
is            3
in            3
databases     2
has           2
this          2
theory        2
we            2
neural        2
with          2
network       2
visual        2
been          2
dtype: int64

In [24]:
# 20 least frequent raw (uncleaned, case-sensitive) tokens across all abstracts,
# i.e. the tail of the frequency ranking.
# Missing abstracts are dropped first: ' '.join() raises TypeError on NaN floats.
all_tokens = ' '.join(dataset['abstract'].dropna().astype(str)).split()
unFreq = pandas.Series(all_tokens).value_counts().tail(20)
unFreq

architectural    1
algorithm        1
for              1
application      1
quantitative,    1
system           1
reduction        1
systems.         1
IV               1
Cooper           1
manner           1
output.          1
Munrol.          1
An               1
applicable       1
demonstrated.    1
proposed.        1
any              1
applicability    1
handwritten      1
dtype: int64

In [28]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Compare stemming and lemmatisation on a single sample word
stem = PorterStemmer()
lem = WordNetLemmatizer()

word = "inversely"

stemmed_word = stem.stem(word)
print("stemming:", stemmed_word)

# pos="v" asks the lemmatizer to treat the word as a verb
lemmatized_word = lem.lemmatize(word, pos="v")
print("lemmatization:", lemmatized_word)

stemming: invers
lemmatization: inversely


In [29]:
#Libraries for text preprocessing
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

In [31]:
# Extra words to filter out on top of NLTK's English stop words
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]

# Combined stop-word set: NLTK English stop words plus the custom additions above
stop_words = set(stopwords.words("english")) | set(new_words)

In [34]:
# Build the cleaned corpus: one lower-cased, stop-word-free, lemmatised string
# per abstract, in the same row order as `dataset`.

# Created once up front; the original re-instantiated both on every iteration.
ps = PorterStemmer()  # not used in this cell; kept in case other cells reference it
lem = WordNetLemmatizer()

corpus = []
# Iterate over the values instead of dataset['abstract'][i]: that form is a label
# lookup and fails when the index is not 0..n-1. Missing abstracts become '' so
# corpus stays aligned row-for-row with dataset instead of crashing re.sub.
for abstract in dataset['abstract'].fillna('').astype(str):
    # Remove markup tags FIRST. Once punctuation has been stripped there is no
    # '<' or '>' left, so a later tag pass can never match. The tag name must
    # directly follow '<' so that inequalities such as "x < y" are left alone.
    text = re.sub(r"</?[a-zA-Z][^<>]*>", " ", abstract)

    # Replace everything that is not an ASCII letter (punctuation, digits,
    # symbols) with a space
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Split on whitespace (this also collapses runs of spaces), drop stop words
    # and lemmatise what remains
    tokens = [lem.lemmatize(word) for word in text.split() if word not in stop_words]
    corpus.append(" ".join(tokens))

In [36]:
# View one cleaned corpus item (index 1, i.e. the second abstract) to check the preprocessing output
corpus[1]

'single cell theory development selectivity ocular dominance visual cortex presented bienenstock cooper munrol extended network applicable layer visual cortex paper present mean field approximation capture fairly transparent manner qualitative many quantitative result network theory finally consider application theory artificial neural network significant reduction architectural complexity possible'

In [56]:
#Word cloud
from os import path
import pillow
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
% matplotlib inlinewordcloud = WordCloud(background_color='white', stopwords=stop_words, max_words=100, max_font_size=50, random_state=42).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)

ImportError: No module named 'pillow'