In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

# Directory holding the sentence files (presumably one sentence per line — confirm).
prefix_sentence = "./data/Sentences/"

# os.listdir returns entries in arbitrary order and also lists sub-directories
# (e.g. Jupyter's .ipynb_checkpoints). Sort for a reproducible corpus order and
# keep regular files only so read_csv is never handed a directory.
files = sorted(
    name
    for name in os.listdir(prefix_sentence)
    if os.path.isfile(os.path.join(prefix_sentence, name))
)

clauses = []
for file in files:
    sentence_file_path = prefix_sentence + file
    # The separator is a string that is not expected to occur in the text, so
    # each line is read as a single column. Multi-character separators are only
    # supported by the python engine; request it explicitly instead of relying
    # on the implicit fallback (which emits a ParserWarning).
    sentences_df = pd.read_csv(
        sentence_file_path,
        sep="dummy_separator",
        header=None,
        engine="python",
    )
    sentences_df.columns = ["sentences"]
    clauses.append(sentences_df["sentences"])

In [2]:
# Series.append was deprecated in pandas 1.4 and removed in 2.0, and appending
# in a loop copies the growing Series on every iteration. Concatenate once.
# The per-file indexes are kept as-is (no ignore_index), as before.
# pd.concat raises on an empty list, so fall back to an empty Series.
sentences = pd.concat(clauses) if clauses else pd.Series(dtype=object)
# The append-based result was unnamed; drop the "sentences" name that concat
# would otherwise carry over from the per-file Series.
sentences.name = None

In [3]:
import contractions
import unicodedata
def to_lower(data: pd.Series):
    """Return a new Series in which every string is converted to lower case."""
    lowered = data.str.lower()
    return lowered

def remove_accented_characters(data: pd.Series):
    """Return a new Series with each string reduced to its ASCII characters.

    Each string is NFKD-normalized, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``.
    """
    def _to_ascii(text):
        decomposed = unicodedata.normalize("NFKD", text)
        ascii_bytes = decomposed.encode("ascii", "ignore")
        return ascii_bytes.decode("utf-8", "ignore")

    return data.apply(_to_ascii)

def remove_html_encodings(data: pd.Series):
    """Return a new Series where each run of digits followed by ';' becomes a space.

    NOTE(review): presumably aimed at numeric HTML entities such as ``&#39;``;
    the pattern does not include the ``&#`` prefix, so that part is left in
    place and any plain "<digits>;" text is replaced as well — confirm intent.
    """
    digits_then_semicolon = r"\d+;"
    return data.str.replace(digits_then_semicolon, " ", regex=True)

def remove_html_tags(data: pd.Series):
    """Return a new Series with attribute-less opening/self-closing tags replaced by a space.

    The pattern matches ``<name>``, ``<name/>`` and ``<name />``; tags with
    attributes and closing tags (``</name>``) are not matched.
    """
    bare_tag = r"<[a-zA-Z]+\s?/?>"
    return data.str.replace(bare_tag, " ", regex=True)

def remove_url(data: pd.Series):
    """Return a new Series with http(s) URLs that have a path replaced by a space.

    A URL is matched as ``http://`` or ``https://``, a host of at least two
    characters from ``[A-Za-z0-9_.-]`` (plus any other ``\\w`` character),
    a ``/``, and one or more path characters from ``\\w - . / = + ?``.
    URLs without a ``/`` after the host are not matched.

    The host used to be written as ``([...]+){2,}``. That nested quantifier
    accepts exactly the same strings as ``[...]{2,}`` but backtracks
    exponentially when the host is not followed by ``/`` (e.g. a bare
    ``http://example.com``), so the flat form is used here.
    """
    url_pattern = r"https?://[\w\-.]{2,}/[\w\-./=+?]+"
    return data.str.replace(url_pattern, " ", regex=True)

def remove_html_and_url(data: pd.Series):
    """Return a new Series with HTML encodings, bare HTML tags and URLs replaced by spaces.

    Applies, in order, the same three substitutions as
    ``remove_html_encodings``, ``remove_html_tags`` and ``remove_url``.

    ``Series.str.replace`` returns a new Series rather than modifying in
    place; the results were previously discarded, so the input came back
    unchanged. Each result is now fed into the next step. The input Series is
    not modified.
    """
    cleaned = data.str.replace(r"\d+;", " ", regex=True)
    cleaned = cleaned.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)
    # Host written as a flat character class: matches the same strings as the
    # nested "(...+){2,}" form without exponential backtracking.
    cleaned = cleaned.str.replace(r"https?://[\w\-.]{2,}/[\w\-./=+?]+", " ", regex=True)
    return cleaned

def remove_extra_spaces(data: pd.Series):
    """Return a new Series with whitespace runs collapsed and the ends trimmed.

    Every run of whitespace becomes a single space, then leading and trailing
    whitespace is stripped.

    The previous pattern ``^\\s*|\\s\\s*`` also matched the empty string at the
    start of every value, so each result gained a leading space, and trailing
    whitespace was kept as a single space.
    """
    collapsed = data.str.replace(r"\s+", " ", regex=True)
    return collapsed.str.strip()
                     
def remove_non_alpha_characters(data: pd.Series):
    """Return a new Series keeping only ASCII letters, digits and whitespace.

    Runs of underscores, backslashes, and any other single character that is
    not an ASCII letter, digit or whitespace are each replaced by one space.
    """
    unwanted = r"_+|\\|[^a-zA-Z0-9\s]"
    return data.str.replace(unwanted, " ", regex=True)

def fix_contractions(data: pd.Series):
    """Return a new Series with ``contractions.fix`` applied to every word.

    Each string is split on whitespace, every word is passed through
    ``contractions.fix`` individually, and the results are re-joined with
    single spaces.
    """
    def _expand_words(txt: str):
        expanded = map(contractions.fix, txt.split())
        return " ".join(expanded)

    return data.apply(_expand_words)

def remove_special_words(data: pd.Series):
    """Return a new Series with standalone ``-xxx-`` marker tokens replaced by a space.

    Targets three-letter tokens wrapped in hyphens such as ``-lrb-`` and
    ``-rrb-``, which appear in the sentence data.

    The previous pattern used the negated class ``[^a-zA-Z]{3}``, which only
    matches three NON-letters between hyphens and therefore never removed
    these tokens. The lookarounds require that the token is not glued to a
    word character, so hyphenated words such as "state-of-the-art" are kept.
    """
    marker_token = r"(?<!\w)-[a-zA-Z]{3}-(?!\w)"
    return data.str.replace(marker_token, " ", regex=True)
                     
# Cleaning steps keyed by a column label. Each step takes a pd.Series of
# strings and returns a new Series; steps run in the order listed.
data_cleaning_pipeline = {
    "sentences": [
        to_lower,
        remove_special_words,
        remove_accented_characters,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = sentences.copy()

for col, pipeline in data_cleaning_pipeline.items():
    # `col` is not used: `cleaned_data` is a single Series, so every step is
    # applied to the whole Series rather than to a named column.
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        cleaned_data = func(cleaned_data)
        print(f"Ended: {func.__name__}")
    # The result of the last step stays bound to `cleaned_data`.

Starting: to_lower
Ended: to_lower
Starting: remove_special_words
Ended: remove_special_words
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [4]:
# Display the concatenated sentences. This is the un-cleaned Series; the
# output of the cleaning pipeline is held in `cleaned_data`.
sentences

0      thanks for sending us good vibes by using the ...
1      you may be surprised , but we will refer to al...
2      the terms of use -lrb- or , the `` terms '' -r...
3      the language of the terms will seem legal -lrb...
4      when you use our services , in addition to enj...
                             ...                        
142    the failure of onavo to enforce any right or p...
143    the section headings in the agreement are incl...
144    `` including '' , whether capitalized or not ,...
145    this agreement may not be assigned by you with...
146                    last updated : december 20 , 2013
Length: 9414, dtype: object

In [5]:
# Convert the Series of sentence strings to a plain list for the corpus
# iterator and the embedding step. list() gives the same result as .tolist()
# for a Series of Python strings, and also works when this cell is re-run
# after `sentences` has already become a list.
# NOTE(review): the list is built from the raw `sentences`, not from
# `cleaned_data` — confirm this is intended.
sentences = list(sentences)

In [6]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """Iterable over the module-level ``sentences``, yielding lists of str tokens.

    Every call to ``__iter__`` starts a fresh pass over ``sentences``.
    """

    def __iter__(self):
        # One document per entry of `sentences`; each is tokenized with
        # gensim's simple_preprocess.
        yield from map(utils.simple_preprocess, sentences)

In [7]:
import gensim.models

# Train a Word2Vec model on the tokenized corpus; only `sentences` is passed,
# so every other hyperparameter is the gensim default.
# NOTE(review): no `seed`/`workers` are set, so presumably the vectors differ
# between runs — confirm whether reproducibility is needed.
sentences_corpus = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences_corpus)

In [8]:
import numpy as np
def get_embeddings():
    """Build one averaged word vector per sentence in the global ``sentences``.

    Each sentence is split on single spaces and every piece is looked up in
    ``model.wv``; pieces that raise ``KeyError`` are skipped.

    Returns:
        A list of ``(sentence, vector)`` tuples, where ``vector`` is the
        element-wise mean of the word vectors found. Sentences for which no
        word vector was found are left out.
    """
    def _lookup(token):
        # None marks a token the model has no vector for.
        try:
            return model.wv[token]
        except KeyError:
            return None

    results = []
    for text in sentences:
        candidates = (_lookup(token) for token in text.split(' '))
        found = [vec for vec in candidates if vec is not None]
        if found:
            results.append((text, np.mean(found, axis=0)))
    return results

In [9]:
# List of (sentence, mean word vector) tuples for the corpus.
embeddings = get_embeddings()

In [10]:
# Number of sentences that produced an embedding; sentences with no word
# found in the model are dropped by get_embeddings.
len(embeddings)

9407

In [11]:
# Inspect the first (sentence, vector) pair.
embeddings[0]

('thanks for sending us good vibes by using the various services available within viber !',
 array([-0.02452753,  0.20171963,  0.4135883 , -0.02158867, -0.01773143,
        -0.70680076, -0.28980365,  0.44629934, -0.45942363, -0.32478878,
         0.04734469, -0.00172885, -0.20537488,  0.10438768,  0.28599688,
        -0.17200147,  0.405115  , -0.05832829,  0.18213576, -0.6927808 ,
         0.26119787,  0.02679167,  0.30342698, -0.54284126,  0.17974353,
         0.11607844, -0.29893813,  0.2861075 , -0.3202194 ,  0.1516255 ,
        -0.01933137,  0.01164661, -0.1968434 , -0.4447647 ,  0.02836283,
         0.06157237,  0.12053318, -0.37673917, -0.05979868, -0.48261645,
        -0.2769499 , -0.02490114, -0.13169497, -0.00436073,  0.47312808,
        -0.31674972, -0.14017837, -0.36911643,  0.31032014,  0.1369156 ,
         0.3360498 , -0.00653877,  0.0910335 ,  0.01460219, -0.0712401 ,
        -0.2305571 ,  0.16154325, -0.17911232, -0.25482813,  0.3073847 ,
        -0.10477858,  0.05301158

In [12]:
import pickle

# Persist the (sentence, vector) list to word2vec.pkl in the working directory.
with open('word2vec.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

In [13]:
# Read back the file written by the previous cell.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files from a trusted source.
with open('word2vec.pkl', 'rb') as f:
    training_data = pickle.load(f)

In [14]:
# Displays the entire loaded list; slice it (e.g. training_data[:1]) for a
# shorter preview.
training_data

[('thanks for sending us good vibes by using the various services available within viber !',
  array([-0.02452753,  0.20171963,  0.4135883 , -0.02158867, -0.01773143,
         -0.70680076, -0.28980365,  0.44629934, -0.45942363, -0.32478878,
          0.04734469, -0.00172885, -0.20537488,  0.10438768,  0.28599688,
         -0.17200147,  0.405115  , -0.05832829,  0.18213576, -0.6927808 ,
          0.26119787,  0.02679167,  0.30342698, -0.54284126,  0.17974353,
          0.11607844, -0.29893813,  0.2861075 , -0.3202194 ,  0.1516255 ,
         -0.01933137,  0.01164661, -0.1968434 , -0.4447647 ,  0.02836283,
          0.06157237,  0.12053318, -0.37673917, -0.05979868, -0.48261645,
         -0.2769499 , -0.02490114, -0.13169497, -0.00436073,  0.47312808,
         -0.31674972, -0.14017837, -0.36911643,  0.31032014,  0.1369156 ,
          0.3360498 , -0.00653877,  0.0910335 ,  0.01460219, -0.0712401 ,
         -0.2305571 ,  0.16154325, -0.17911232, -0.25482813,  0.3073847 ,
         -0.1047785