### Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tweet export, keeping only the 'text' and 'id' columns.
df = pd.read_csv("tweets.csv", usecols=['text', 'id'])

In [3]:
# Preview the first rows to confirm the 'id' and 'text' columns loaded.
df.head()

Unnamed: 0,id,text
0,784609194234306560,Here is my statement.pic.twitter.com/WAZiGoQqMQ
1,785608815962099712,Is this really America? Terrible!pic.twitter.c...
2,784840992734064640,The media and establishment want me out of the...
3,784767399442653184,Certainly has been an interesting 24 hours!
4,785561269571026944,Debate polls look great - thank you!\n#MAGA #A...


### Create model

In [4]:
from gensim.models import FastText
import gensim
import preprocessor as p

In [5]:
# Collect each distinct tweet body once, in order of first appearance,
# and wrap the result in a single-column frame (column label 0).
texts = list(df.text.unique())

df_text = pd.DataFrame(texts)

In [6]:
# Tokenise every tweet the same way get_vectorized_sentence does at query
# time (p.clean + word_tokenize). gensim expects an iterable of token lists;
# handing it raw strings makes it treat each *character* as a word.
from nltk.tokenize import word_tokenize

tokenized_tweets = [
    word_tokenize(p.clean(tweet))
    for tweet in df_text[0].to_list()
    if isinstance(tweet, str)  # unique() keeps NaN for missing text; skip it
]

# workers must be a positive thread count: with workers=-1 gensim starts no
# training threads, so the vectors are never updated from their random init.
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimension
# (renamed to `vector_size` in gensim 4) — kept to match the installed version.
fastTextModel = FastText(tokenized_tweets,
                         size=150,
                         workers=4)

### Preprocess

In [7]:
from nltk.tokenize import word_tokenize

In [8]:
def get_fast_text_vector(sentence):
    """Return the FastText embedding for a single lookup key.

    Args:
        sentence: The key to look up. Despite the parameter name, the caller
            in this notebook (get_vectorized_sentence) passes one token at a
            time.

    Returns:
        Whatever ``fastTextModel.wv[sentence]`` yields for that key.
    """
    # Index through the keyed vectors (`.wv`): indexing the model object
    # itself is deprecated in gensim 3.x and no longer supported in gensim 4.
    return fastTextModel.wv[sentence]

### Vectorized

In [10]:
def get_vectorized_sentence(sentence):
    """Embed a piece of text as the mean of its token vectors.

    The text is cleaned with ``p.clean`` and split with ``word_tokenize``;
    each token is looked up through ``get_fast_text_vector`` and the resulting
    vectors are averaged element-wise.

    Args:
        sentence: Raw text to embed.

    Returns:
        The element-wise mean of the token vectors. When no token produces a
        vector (e.g. the cleaned text is empty), a zero vector of length
        ``fastTextModel.wv.vector_size`` is returned instead of the scalar NaN
        that ``np.mean`` gives for an empty list, so downstream distance
        computations keep working.
    """
    token_vectors = []
    for word in word_tokenize(p.clean(sentence)):
        try:
            token_vectors.append(get_fast_text_vector(word))
        except KeyError:
            # NOTE(review): some gensim versions raise KeyError when a token
            # has no known n-grams; skip such tokens rather than abort.
            continue

    if not token_vectors:
        return np.zeros(fastTextModel.wv.vector_size, dtype=np.float32)
    return np.mean(token_vectors, axis=0)

In [None]:
# Compute one sentence vector per tweet and store it in a new column.
df['vectorized_sentence'] = df['text'].apply(get_vectorized_sentence)

### Distance

In [9]:
from sklearn.metrics.pairwise import euclidean_distances

In [11]:
def least_distance(predicted, context_sentence_vector, top_n=20):
    """Return the rows whose sentence vectors are closest to ``predicted``.

    Args:
        predicted: Query vector (1-D array-like).
        context_sentence_vector: DataFrame with the columns ``'id'``,
            ``'text'`` and ``'vectorized_sentence'``.
        top_n: Maximum number of matches to return (default 20).

    Returns:
        A list of at most ``top_n`` dicts with the keys ``'id'``, ``'text'``
        and ``'score'``, sorted by ascending ``'score'``. ``'score'`` is the
        Euclidean distance to ``predicted`` as a plain ``float``. Rows whose
        stored vector has a different shape from the query or contains NaN
        are skipped.
    """
    query = np.asarray(predicted, dtype=float)

    scored = []
    # Iterate the three columns in lockstep; this avoids building a Series
    # per row as iterrows() does.
    for row_id, row_text, row_vector in zip(
        context_sentence_vector['id'],
        context_sentence_vector['text'],
        context_sentence_vector['vectorized_sentence'],
    ):
        candidate = np.asarray(row_vector, dtype=float)
        # A scalar NaN or wrong-length vector cannot be compared meaningfully.
        if candidate.shape != query.shape or np.isnan(candidate).any():
            continue
        scored.append({
            'id': row_id,
            'text': row_text,
            'score': float(np.linalg.norm(query - candidate)),
        })

    scored.sort(key=lambda match: match['score'])
    return scored[:top_n]

In [None]:
# Sanity check: query with the text of the first tweet shown in the preview.
least_distance(get_vectorized_sentence("Here is my statement.pic.twitter.com/WAZiGoQqMQ"), df)

### Save

#### Dataframe

In [15]:
# Persist only the id/text columns (the vectors are not written here, despite
# the file name); index=False leaves the row index out of the CSV.
df[['id', 'text']].to_csv('tweet_vectorized.csv', index=False)

In [None]:
# Pull the per-tweet vectors out of the frame as a NumPy array for saving.
# NOTE(review): each cell holds its own array, so this is presumably a 1-D
# object array rather than a true 2-D matrix — confirm before relying on shape.
test = df['vectorized_sentence'].to_numpy()

#### Numpy 2D array

In [None]:
# Write the extracted vectors to disk in .npy format.
np.save('vectorized_sentence.npy', test)

In [12]:
# Reload the saved vectors.
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load .npy files from a trusted source.
test2 = np.load('vectorized_sentence.npy', allow_pickle=True)

#### Save FastText model

In [18]:
def compressed_pickle(title, data):
    """Pickle ``data`` into a bz2-compressed file named ``title + '.pbz2'``.

    Args:
        title: Output path without the extension; ``'.pbz2'`` is appended.
        data: Any picklable object.
    """
    target_path = title + '.pbz2'
    with bz2.open(target_path, 'wb') as archive:
        pickle.dump(data, archive)

In [22]:
import bz2
import pickle

In [23]:
# Persist the FastText model as 'test.pbz2' (compressed_pickle appends the
# '.pbz2' suffix to the given title).
compressed_pickle('test', fastTextModel)

#### Load FastText model

In [None]:
def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle file.

    Args:
        file: Path to the compressed pickle (e.g. a ``.pbz2`` file).

    Returns:
        The unpickled object.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only call this on files from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with bz2.BZ2File(file, 'rb') as compressed:
        return pickle.load(compressed)

In [None]:
# Restore the object previously written to 'test.pbz2'.
data = decompress_pickle('test.pbz2') 