In [1]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_punctuation, stem_text

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

import spacy

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rishika\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rishika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rishika\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
words = spacy.load('en_core_web_sm')
dir(words)

['Defaults',
 '_AnyContext',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_components',
 '_config',
 '_disabled',
 '_factory_meta',
 '_get_pipe_index',
 '_link_components',
 '_meta',
 '_multiprocessing_pipe',
 '_optimizer',
 '_path',
 '_pipe_configs',
 '_pipe_meta',
 'add_pipe',
 'analyze_pipes',
 'batch_size',
 'begin_training',
 'component',
 'component_names',
 'components',
 'config',
 'create_optimizer',
 'create_pipe',
 'create_pipe_from_source',
 'default_config',
 'default_error_handler',
 'disable_pipe',
 'disable_pipes',
 'disabled',
 'enable_pipe',
 'evaluate',
 'factories',
 'factory',
 'factory_names',
 'from_bytes',
 '

In [3]:
wordlist = words.vocab
print (len(wordlist))

761


# Working on raw dataset

In [4]:
datasetin = pd.read_csv("./Smaller_blog_texts.csv", encoding="latin1")
df1 = datasetin['text']
df1.head()

0        Info has been found (+/- 100 pages, and 4....
1        These are the team members:   Drewes van d...
2                            testing!!!  testing!!!   
3    Thanks to Yahoo!'s Toolbar I can now 'capture'...
4    I had an interesting conversation with my Dad ...
Name: text, dtype: object

In [5]:
df1.shape

(19905,)

In [6]:
type(df1)

pandas.core.series.Series

In [7]:
#Creating another dataframe/series to add to the existing dataset to increase its size

datasetin1 = pd.read_csv('blogtext.csv', encoding='latin1')
datasetin1 = datasetin1.loc[20000:25000]

In [8]:
df2 = datasetin1['text']
df2.head()

20000                     Well I'm finally back from ca...
20001                     I'm gonna be gone for the wee...
20002                     I am finally done with my cos...
20003                     Many of you may have seen tha...
20004                     Well my Blog has had a face l...
Name: text, dtype: object

In [9]:
df = pd.DataFrame(pd.concat([df1, df2]))
df

Unnamed: 0,text
0,"Info has been found (+/- 100 pages, and 4...."
1,These are the team members: Drewes van d...
2,testing!!! testing!!!
3,Thanks to Yahoo!'s Toolbar I can now 'capture'...
4,I had an interesting conversation with my Dad ...
...,...
24996,1) I need a job to pay the bills. 2) I...
24997,Short post today. I think my calling w...
24998,What drivel. What drivel. I've gotten...
24999,"After some extensive searching, I found..."


In [10]:
duplicate = df[df.duplicated()]
duplicate

Unnamed: 0,text
761,urlLink resume
763,urlLink resume help
2309,hey guys - i had the flu today - threw up ...
3728,Around 1:00 my mom picked me up from work ...
3729,Around 1:00 my mom picked me up from work ...
...,...
21702,
22792,....then congratulations - ho...
22794,'The scholar is not one who s...
22868,urlLink


In [11]:
df.drop_duplicates(keep='first', inplace=True)
df.shape

(24721, 1)

In [12]:
#Checking for any more duplicates (returns an empty series)
duplicate1 = df[df.duplicated()]
duplicate1

Unnamed: 0,text


In [13]:
#Changing the index to normal
df.reset_index(inplace=True)
df

Unnamed: 0,index,text
0,0,"Info has been found (+/- 100 pages, and 4...."
1,1,These are the team members: Drewes van d...
2,2,testing!!! testing!!!
3,3,Thanks to Yahoo!'s Toolbar I can now 'capture'...
4,4,I had an interesting conversation with my Dad ...
...,...,...
24716,24996,1) I need a job to pay the bills. 2) I...
24717,24997,Short post today. I think my calling w...
24718,24998,What drivel. What drivel. I've gotten...
24719,24999,"After some extensive searching, I found..."


In [14]:
df.to_csv('Smaller_blog_texts2.csv', index=False)
df = pd.DataFrame(df)
df

Unnamed: 0,index,text
0,0,"Info has been found (+/- 100 pages, and 4...."
1,1,These are the team members: Drewes van d...
2,2,testing!!! testing!!!
3,3,Thanks to Yahoo!'s Toolbar I can now 'capture'...
4,4,I had an interesting conversation with my Dad ...
...,...,...
24716,24996,1) I need a job to pay the bills. 2) I...
24717,24997,Short post today. I think my calling w...
24718,24998,What drivel. What drivel. I've gotten...
24719,24999,"After some extensive searching, I found..."


# Cleaning previously processed dataset

In [15]:
#Found a list of words online, going to use it to remove all useless words
words = open('Words.txt')
word = []
for line in words:
    word.append(line.strip())
len(word)

113832

In [16]:
word

['aa',
 'aah',
 'aahed',
 'aahing',
 'aahs',
 'aal',
 'aalii',
 'aaliis',
 'aals',
 'aardvark',
 'aardvarks',
 'aardwolf',
 'aardwolves',
 'aas',
 'aasvogel',
 'aasvogels',
 'aba',
 'abaca',
 'abacas',
 'abaci',
 'aback',
 'abacus',
 'abacuses',
 'abaft',
 'abaka',
 'abakas',
 'abalone',
 'abalones',
 'abamp',
 'abampere',
 'abamperes',
 'abamps',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abandonments',
 'abandons',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasement',
 'abasements',
 'abaser',
 'abasers',
 'abases',
 'abash',
 'abashed',
 'abashes',
 'abashing',
 'abasing',
 'abatable',
 'abate',
 'abated',
 'abatement',
 'abatements',
 'abater',
 'abaters',
 'abates',
 'abating',
 'abatis',
 'abatises',
 'abator',
 'abators',
 'abattis',
 'abattises',
 'abattoir',
 'abattoirs',
 'abaxial',
 'abaxile',
 'abbacies',
 'abbacy',
 'abbatial',
 'abbe',
 'abbes',
 'abbess',
 'abbesses',
 'abbey',
 'abbeys',
 'abbot',
 'abbotcies',
 'abbotcy',
 'abbots',
 'abbreviate',
 '

In [17]:
#Created a list of custom stopwords from the one I made here as it wasn't helping directly

stops = open('Stopwords.txt')
stop = []

for line in stops:
    stop.append(line.strip())

len(stop)

349

In [18]:
stop

['a',
 'about',
 'above',
 'absolutely',
 'actually',
 'after',
 'again',
 'against',
 'ah',
 'ain',
 'aint',
 "ain't",
 'all',
 'also',
 'although',
 'always',
 'am',
 'an',
 'and',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anyway',
 'anywho',
 'are',
 'aren',
 "aren't",
 'around',
 'as',
 'at',
 'away',
 'b',
 'back',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'c',
 'can',
 'certainly',
 'come',
 'could',
 'coulda',
 'couldn',
 "couldn't",
 'd',
 'day',
 'definitely',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'e',
 'each',
 'eh',
 'else',
 'even',
 'ever',
 'every',
 'f',
 'feel',
 'few',
 'for',
 'from',
 'further',
 'g',
 'get',
 'give',
 'go',
 'goes',
 'going',
 'gon',
 "gon'",
 'gonna',
 'got',
 'gotta',
 'h',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'hello',
 '

In [19]:
#Trying a different approach to stopword-removal by creating a new wordlist containing all valid words except stopwords

print (len(word), len(stop))
print (len(word) - len(stop))

for i in word:
    a = i.strip()
    if a in stop:
        word.remove(i)
print (len(word))

#(len(word)) > (len(word) - len(stop)) because stop contains some words not in word

113832 349
113483
113596


In [20]:
#Checking for any stopwords still present in word
"then" in word

False

In [21]:
df[['text']] = df[['text']].astype(str)
df

Unnamed: 0,index,text
0,0,"Info has been found (+/- 100 pages, and 4...."
1,1,These are the team members: Drewes van d...
2,2,testing!!! testing!!!
3,3,Thanks to Yahoo!'s Toolbar I can now 'capture'...
4,4,I had an interesting conversation with my Dad ...
...,...,...
24716,24996,1) I need a job to pay the bills. 2) I...
24717,24997,Short post today. I think my calling w...
24718,24998,What drivel. What drivel. I've gotten...
24719,24999,"After some extensive searching, I found..."


In [22]:
clean_text = []

def preprocess(text):   
    #New stopwords function, tokenize and lemmatize (remove punctuation before tokenization)
    text = text.lower()
    text = strip_punctuation(text) 
    text = strip_numeric(text)
    text = word_tokenize(text)
    text = [w for w in text if w in word]    
    #text = [a for a in text if a not in custom_stopwords]    #This removed useless words but not the stopwords from the df
    text = " ".join(text)
    text = stem_text(text)    #porter.stem() didn't work properly
    
    #text = rem_contractions(text)
    #text = rem_slang(text)    #Some problems with these 2 lines
    #text = word_tokenize(text)
    #text = [porter.stem(words) for words in text if words not in custom_stopwords]
    #text = remove_stopwords(text)    #rem_stopwords() didn't work properly

    clean_text.append(text)
    return text

egtext = "Hello, aaaah then this is 1 trial of the above function oh hrfjnfjm!"

print (preprocess(egtext)) #TRIAL SUCCESSFUL (but stopwords retained - FIXED)

# Turns out, the stopwords were being removed, but they were being detected as substrings (like "the" in "theme")
# For this, the only thing is to keep making the list of stopwords more nd more inclusive

trial abov function


In [23]:
df = df['text']

In [24]:
df.shape
df[3]

"Thanks to Yahoo!'s Toolbar I can now 'capture' the URLs of popups...which means now I can show you some cool links to Korean Pop (K-Pop) audio and video without the need to relate instructions like: 'go to the site, click on the pop-audio button then choose...'.  So, without further ado here is the link to 24-hour K-Pop  urlLink audio  and the  urlLink video  streaming.  Enjoy.  "

In [25]:
(i, ) = df.shape
print (i)

cleantext = []

for x in range(i):
    cleantext.append(preprocess(df[x]))    #WORKS BUT TAKES VERY LONG TO RUN (Around 2 hrs)
cleantext[3]

24721


'thank yahoo captur mean show cool link pop pop audio video relat instruct site click pop audio button choos ado link hour pop audio video stream enjoi'

In [26]:
cleantext

['info found page file wait team leader process learn',
 'team member van mail mail mail',
 'test test',
 'thank yahoo captur mean show cool link pop pop audio video relat instruct site click pop audio button choos ado link hour pop audio video stream enjoi',
 'interest convers dad morn talk put monei invari real estat cash cash includ short term invest year save account reason real estat monei monei seen survei real estat rise per year long stretch take account crisi refer crisi bail korea compar corpor bond fell modestli recov local stock market repres version dow index gone appreci abov high point point link real estat sens convers note real big elit real estat investor billion convert properti dad seem flabbergast heck million dollar retir risk real estat south asset exampl north toot horn louder move countri worth cent crisi drop vi vi bad invest fall victim scam latest good morn citi project toast ladi lost everyth comment tear rich person beggar saber rattl north korea weak exch

In [27]:
print (type(cleantext))
ct = pd.DataFrame(cleantext)
ct    #CLEAN DATAFRAME

<class 'list'>


Unnamed: 0,0
0,info found page file wait team leader process ...
1,team member van mail mail mail
2,test test
3,thank yahoo captur mean show cool link pop pop...
4,interest convers dad morn talk put monei invar...
...,...
24716,job pai bill job suck life job interest big pl...
24717,short post todai call combin music engin man e...
24718,drivel drivel gotten anoth anoth real tickl he...
24719,extens search found real deal name car servic ...


In [28]:
ct.loc[ct[0].str.contains(" then ")]    #USELESS WORDS GONE (but not stopwords - now gone)

Unnamed: 0,0


In [29]:
x = (ct.iloc[26, :]).values
x

array(['long thought head last coupl dai least read mathematician plai stock market cool book wish read embark design exam wrote requir exam alreadi drink last night night school night want break find most effect most fun wai place call sausag near hong univers usual refer upon order beer notic interest beer came special pitcher dry ic compart bottom passag allow vapor flow top escap pretti cool idea seen anoth bar anoth teen bar district near town start odd someon bar meet street like pretti low rememb last night came call old worker basic block live octopu place almost interest rememb friend mine former korea sister law friend engag girl often come specif friend thousand peopl friend mani degre separ two friend count plu three groom two friend alreadi know sister law two friend degre bad suppos peopl street wife friend school older kid quit interest stori anoth area yesterdai met gui asset manag firm busi spoke quit good english found note brother law live owner ginseng chicken soup 

In [30]:
ct.to_csv('Blog_Texts_Cleaned2.csv', index=False)

In [31]:
STOP HERE

SyntaxError: invalid syntax (<ipython-input-31-a6774c8535dd>, line 1)

# Code I tested for cleaning but didn't use

df['clean_text'] = df['text'].apply(lambda x: preprocess(x))

i, j = df.shape

for x in range(i):
    df.loc[x, 'clean_text'] = preprocess(df['text'][x])

df[['clean_text']] = df[['clean_text']].astype(str)    
df

df.loc[df['clean_text'].str.contains("aaaaaaaaaaah")]

useless = ['aaaaaaaaaaaah', 'aaaaaaaaaaah', 'aaaaaaaaaaahhhhhhhhhhhhhhhhhhh', 'aaaaaaaaaahhhhhhhhhh', 'aaaaaaahhhhh', 
           'aaaaaaaaaarrrrrrrrrrrrrggggggggggggggghhhhhhhhhh', 'aaaaaaaaah', 'aaaaaaaaahhhhh', 'aaaaaaaaahhhhhhh', 
           'aaaaaaaaahhhhhhhh', 'aaaaaaaaahhhhhhhhhhhhhhh', 'aaaaaaaahhhhh', 'aaaaaaaahhhhhhhhhhhhhhh', 'aaaaaaah', 
           'aaaaaaahhhhhh', 'aaaaaaahhhhhhhhhh', 'aaaaaaahhhhhhhhhhh', 'aaaaaah', 'aaaaaahhhhh', 'aaaaaahhhhhhhh', 
           'aaaaaarrrrrrgggggggghhhhhhhhh', 'aaaaaawwwwww', 'aaaaack', 'aaaaah', 'aaaaahhhh', 'aaaaahhhhh', 'aaaaahhhhhh', 
           'aaaaahhhhhhh', 'aaaaaleluia', 'aaaaand', 'aaaaano', 'aaaaanyway', 'aaaaargh', 'aaaaasmith', 'aaaack',
           'aaaaggghhhhh', 'aaaah', 'aaaahh', 'aaaarrrgghhhh', 'aaaarrrrrrgggghhhhhh', 'aaack', 'aaagh', 'aaah', 'aaahhh', 
           'aaahhhh', 'aaahing','aaargh', 'aaarrggghhh', 'aaarrrggghhhhhhhhgggghhhhhh', 'aaarrrrggggghhhhh', 'aaaw','aagrh',
           'aah', 'aahed', 'aahh', 'aahs', 'aaja', 'aakalain', 'aal', 'aalaga', 'aalangang', 'aaldering', 'aalis', 'aamco', 
           'aanim', 'aao', 'aap', 'aar', 'aaral', 'aargh', 'aarp', 'aarrgghh', 'aarseth', 'aaugh', 'aauuugh', 'aave','aay', 
           'aayos', 'ab','aba', 'abab', 'abad','aawwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww', 'aaaaaahhh']

def rem_ul(text):
    T = text.split()
    T  = [word for word in T if word.lower() not in useless]
    text = ' '.join(T)
    return text

print (rem_ul("aargh How are you?"))

#If this doesn't work, I'll just add such words to the list of custom stopwords
#It did work, but now I'm sticking to the word list I've added into the program

text = "Hello, there! Are you fine?"
for text in df['text']:
    text = rem_ul(text)

df

test = df3['text'].loc[0:3]
test

for i in pd.read_csv('blogtext.csv', chunksize=100):
    print (i)
    print()

dataset1 = pd.read_csv("Smaller_blog_texts2.csv", encoding='latin1')
dataset1.head()

df = dataset1['text']
df

type(df)

df = df.to_frame()
df

df.isnull().values.any() 
df.dropna(inplace=True)

In [None]:
df

In [None]:
word.count("the")

# Preprocessing the dataset (smaller blog texts 1) for doc2vec

In [None]:
import pprint
from collections import defaultdict
from gensim import corpora

In [None]:
dx = pd.read_csv('Smaller_blog_texts1.csv')
dx.head()

In [None]:
dx[['text']] = dx[['text']].astype(str)

In [None]:
clean_text = []
def prep(text):   
    #New stopwords function, tokenize and lemmatize (remove punctuation before tokenization)
    text = text.lower()
    text = sent_tokenize(text)  
    text = [w for w in text if w in word]
    clean_text.append(text)
    return text

dx['clean_text'] = dx['text'].apply(lambda x: prep(x))
dx.head()

In [None]:
dx.iloc[24683, 1]

In [None]:
dx.loc[dx["text"].str.contains(" then")]

In [2]:
print ("HI")

HI
