In [1]:
# import modules & set up logging
from gensim.models import Word2Vec, FastText
import gensim, logging
# Show timestamped INFO-level records from libraries that log through the
# standard `logging` module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
 
import re
import stanza
# Fetch the English tokenizer model, then build a tokenize-only pipeline
# that the corpus iterator uses as `nlp`.
stanza.download(lang='en', processors='tokenize')
nlp = stanza.Pipeline(lang='en', processors='tokenize')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json: 139kB [00:00, 3.59MB/s]                    
2021-07-17 23:36:15,425 : INFO : Downloading these customized packages for language: en (English)...
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-07-17 23:36:15,442 : INFO : File exists: /Users/slavkoz/stanza_resources/en/tokenize/combined.pt.
2021-07-17 23:36:15,470 : INFO : Finished downloading models and saved to /Users/slavkoz/stanza_resources.
2021-07-17 23:36:15,487 : INFO : Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-07-17 23:36:15,498 : INFO : Use device: cpu
2021-07-17 23:36:15,502 : INFO : Loading: tokenize
2021-07-17 23:36:15,522 : INFO : Done loading processors!


In [2]:
class OffensiveSentences(object):
    """Re-iterable corpus that yields one list of lowercase tokens per input line.

    Every call to ``iter()`` reopens the file and streams it line by line, so
    the corpus is never held in memory and can be traversed several times.

    Args:
        path: UTF-8 text file to read; each line is treated as one document.
        tokenizer: Callable mapping a string to an object exposing
            ``.sentences[*].words[*].text``. When ``None`` (the default), the
            module-level ``nlp`` pipeline is looked up at iteration time.
    """

    # A token is kept only if it contains at least one ASCII letter; tokens
    # made solely of digits, punctuation or other symbols are discarded.
    _ASCII_LETTER = re.compile('[a-zA-Z]')

    def __init__(self, path='./full_textOnly_cleaned_dataset.csv', tokenizer=None):
        self.path = path
        self.tokenizer = tokenizer

    @classmethod
    def _has_letters(cls, token):
        """Return True when ``token`` contains at least one ASCII letter."""
        return cls._ASCII_LETTER.search(token) is not None

    def __iter__(self):
        """Yield the filtered token list of each line, in file order."""
        tokenize = nlp if self.tokenizer is None else self.tokenizer
        # `with` guarantees the handle is closed, including when the consumer
        # stops iterating early and the generator is finalized.
        with open(self.path, encoding="utf-8") as corpus:
            for line in corpus:
                # Drop the newline, lowercase, and collapse whitespace runs.
                line = re.sub(r'\s+', ' ', line.replace('\n', '').lower())
                if not line.strip():
                    # Nothing to tokenize; keep the one-item-per-line contract.
                    yield []
                    continue
                yield [
                    word.text
                    for sentence in tokenize(line).sentences
                    for word in sentence.words
                    if self._has_letters(word.text)
                ]
 
# Memory-friendly corpus: lines are read and tokenized lazily, and every new
# iteration over `sentences` reopens the file from the start.
sentences = OffensiveSentences() # a memory-friendly iterator

In [3]:
# Train 50-dimensional Word2Vec embeddings on the streamed corpus, ignoring
# words that occur fewer than two times.
model = Word2Vec(
    sentences=sentences,
    vector_size=50,
    min_count=2,
    workers=4,
)

2021-07-17 23:36:15,807 : INFO : collecting all words and their counts


ORIGINAL: '"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"'


2021-07-17 23:36:16,308 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


PROCESSED: '['explanation', 'why', 'the', 'edits', 'made', 'under', 'my', 'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted', 'they', 'were', "n't", 'vandalisms', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fac', 'and', 'please', 'do', "n't", 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', "i'm", 'retired', 'now']'
ORIGINAL: '"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)"'
PROCESSED: '['d', 'aww', 'he', 'matches', 'this', 'background', 'colour', "i'm", 'seemingly', 'stuck', 'with', 'thanks', 'talk', 'january', 'utc']'
ORIGINAL: '"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."'
PROCESSED: '['hey', 'man', "i'm", 'really', 'not', 'trying', 'to', 'edit', 'wa

KeyboardInterrupt: 

In [None]:
# Persist the trained model, then read it back from disk.
model.save('w2v_50dim_model_v2')
new_model = Word2Vec.load('w2v_50dim_model_v2')

In [None]:
# Analogy query (woman + king - man). In gensim 4 the similarity queries live
# on the KeyedVectors object (`model.wv`), not on the model itself.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

In [None]:
# Odd-one-out query; accessed through `model.wv` as required by gensim 4.
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

In [None]:
# Cosine similarity of two words; accessed through `model.wv` as required by gensim 4.
model.wv.similarity('woman', 'man')

In [None]:
# Raw embedding vector of a word; gensim 4 models are no longer subscriptable,
# so the lookup goes through `model.wv`.
model.wv['computer']

In [None]:
# Ten nearest neighbours of "computer" in the embedding space.
model.wv.most_similar(positive='computer', topn=10)

In [None]:
# Train 50-dimensional FastText embeddings on the same corpus and persist them.
model = FastText(
    sentences=sentences,
    vector_size=50,
    min_count=2,
    workers=4,
)
model.save('fastText_50dim_model_v2')