In [5]:
import pandas as pd
import numpy as np
import re
import string
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pickle
#import preprocessor as p
#from sklearn.metrics.pairwise import euclidean_distances
#from nltk.stem import WordNetLemmatizer
#from nltk.tokenize import word_tokenize

In [6]:
df = pd.read_csv("tweets.csv", usecols=['text', 'id'])
df.head()

Unnamed: 0,id,text
0,784609194234306560,Here is my statement.pic.twitter.com/WAZiGoQqMQ
1,785608815962099712,Is this really America? Terrible!pic.twitter.c...
2,784840992734064640,The media and establishment want me out of the...
3,784767399442653184,Certainly has been an interesting 24 hours!
4,785561269571026944,Debate polls look great - thank you!\n#MAGA #A...


In [7]:
def preprocess_tweet(text):
    # Check characters to see if they are in punctuation
    nopunc = [char for char in text if char not in string.punctuation]
    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    # convert text to lower-case
    nopunc = nopunc.lower()
    # remove pic 
    nopunc = re.sub(r'pic.twitter.com/[\w]*',"", nopunc)
    # remove URLs
    nopunc = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', nopunc)
    nopunc = re.sub(r'http\S+', '', nopunc)
    # remove special caracters
    nopunc = re.sub(r'\W+', ' ', nopunc)
    # remove usernames
    nopunc = re.sub('@[^\s]+', '', nopunc)
    # remove the # in #hashtag
    nopunc = re.sub(r'#([^\s]+)', r'\1', nopunc)
    #remove punctuation
    nopunc = re.sub(r'[^\w\s]','',nopunc)
    # remove words with only one character
    nopunc =  re.sub(r"\b[a-zA-Z]\b", "", nopunc)
    # remove repeated characters
    nopunc = word_tokenize(nopunc)
    # remove stopwords from final word list
    return [word for word in nopunc if word not in stopwords.words('english')]

In [8]:
df_text = df.copy()
df_text["clean_text"] = df_text["text"].apply(lambda x: preprocess_tweet([x]))
df_text.head()

Unnamed: 0,id,text,clean_text
0,784609194234306560,Here is my statement.pic.twitter.com/WAZiGoQqMQ,[statement]
1,785608815962099712,Is this really America? Terrible!pic.twitter.c...,"[really, america, terrible]"
2,784840992734064640,The media and establishment want me out of the...,"[media, establishment, want, race, badly, neve..."
3,784767399442653184,Certainly has been an interesting 24 hours!,"[certainly, interesting, 24, hours]"
4,785561269571026944,Debate polls look great - thank you!\n#MAGA #A...,"[debate, polls, look, great, thank, maga, amer..."


In [24]:
def clean_words(df):
    df_text = df.copy()
    df_text["clean_text"] = df_text["text"].apply(lambda x: preprocess_tweet([x]))
    return df_text

In [23]:
def tag_words(df):
    tag_words = [TaggedDocument(d, [i]) for i, d in enumerate(df.clean_text)]
    return tag_words

In [9]:
tag_words = [TaggedDocument(d, [i]) for i, d in enumerate(df_text.clean_text)]
tag_words

[TaggedDocument(words=['statement'], tags=[0]),
 TaggedDocument(words=['really', 'america', 'terrible'], tags=[1]),
 TaggedDocument(words=['media', 'establishment', 'want', 'race', 'badly', 'never', 'drop', 'race', 'never', 'let', 'supporters', 'maga'], tags=[2]),
 TaggedDocument(words=['certainly', 'interesting', '24', 'hours'], tags=[3]),
 TaggedDocument(words=['debate', 'polls', 'look', 'great', 'thank', 'maga', 'americafirst'], tags=[4]),
 TaggedDocument(words=['saying', 'clinton', 'campaign', 'anti', 'catholic', 'bigotry'], tags=[5]),
 TaggedDocument(words=['thank', 'maga', 'americafirst'], tags=[6]),
 TaggedDocument(words=['cincinnati', 'ohio', 'tomorrow', 'night', '7', '30pm', 'join', 'ohiovotesearly', 'votetrumppence16', 'tickets'], tags=[7]),
 TaggedDocument(words=['little', 'pick', 'dishonest', 'media', 'incredible', 'information', 'provided', 'wikileaks', 'dishonest', 'rigged', 'system'], tags=[8]),
 TaggedDocument(words=['thank', 'florida', 'movement', 'never', 'seen', 'nev

In [10]:
modeldoc2vec = Doc2Vec(tag_words, vector_size=20, window=5, min_count=1, workers=-1, epochs = 20)

In [11]:
len(modeldoc2vec.wv.vocab)

17524

In [12]:
pickle.dump(modeldoc2vec, open('Doc2VecModelFinal.pkl','wb'))

In [13]:
Doc2VecModelFinal = pickle.load(open('Doc2VecModelFinal.pkl','rb'))

In [14]:
len(Doc2VecModelFinal.wv.vocab)

17524

In [15]:
txt_example = ["Here is my statement"]
example_result_model = Doc2VecModelFinal.docvecs.most_similar(positive=[Doc2VecModelFinal.infer_vector(txt_example)], topn=20)
example_result_model

[(4557, 0.7610223293304443),
 (2548, 0.6945040225982666),
 (11531, 0.6928262710571289),
 (14732, 0.6885021924972534),
 (1086, 0.6813532114028931),
 (9943, 0.6791173815727234),
 (7254, 0.6784337162971497),
 (746, 0.6766431331634521),
 (4638, 0.6680054664611816),
 (3816, 0.6561073064804077),
 (2787, 0.6557133793830872),
 (6053, 0.6498664617538452),
 (14802, 0.6451649069786072),
 (2774, 0.6428327560424805),
 (1242, 0.642427384853363),
 (5237, 0.6380745768547058),
 (14322, 0.6366419196128845),
 (12037, 0.6304845213890076),
 (15671, 0.6246137022972107),
 (5926, 0.6220338940620422)]

In [21]:
example_of_string = "Is this really America"

In [20]:
def get_similarities(model, sentence):
    example_result_model = model.docvecs.most_similar(positive=[model.infer_vector([sentence])], topn=20)
    return example_result_model

In [22]:
get_similarities(Doc2VecModelFinal,example_of_string)

[(11095, 0.7411857843399048),
 (6285, 0.7329152822494507),
 (3625, 0.7183679342269897),
 (12613, 0.7090334296226501),
 (16201, 0.7071962356567383),
 (5221, 0.6938114166259766),
 (12648, 0.6913438439369202),
 (6150, 0.6909806728363037),
 (5558, 0.6654672622680664),
 (1633, 0.6626005172729492),
 (9702, 0.661744236946106),
 (475, 0.6605814695358276),
 (9217, 0.6595841646194458),
 (13942, 0.6578094363212585),
 (14652, 0.6563979387283325),
 (5029, 0.6511921286582947),
 (3993, 0.6398176550865173),
 (1749, 0.637171745300293),
 (1205, 0.6357545256614685),
 (15638, 0.6327559947967529)]

In [18]:
def get_tweet(resultsimilarity):
    tweet_similar = []
    for elem in resultsimilarity:
        tweet_similar.append({'id': elem[0],'tweets': df.text[elem[0]],'score': elem[1]})
    #print(tweet_similar)
    return tweet_similar

In [19]:
get_tweet(example_result_model)

[{'id': 4557,
  'tweets': '"@Boarcane: DonaldTrump We need you to run and WIN! A true patriot with only the country\'s best interest at heart. http://twitter.com/Boarcane/status/578358741441052673/photo/1pic.twitter.com/6LIcHFMx2G\xa0',
  'score': 0.7610223293304443},
 {'id': 2548,
  'tweets': 'Ben Carson was speaking in general terms as to what he would do if confronted with a gunman, and was not criticizing the victims. Not fair!',
  'score': 0.6945040225982666},
 {'id': 11531,
  'tweets': '.@DennisRodman re @Omarosa is right, she’s becoming predictable.',
  'score': 0.6928262710571289},
 {'id': 14732,
  'tweets': 'A friend is one who has the same enemies as you have. -- Abraham Lincoln',
  'score': 0.6885021924972534},
 {'id': 1086,
  'tweets': 'MAKE AMERICA GREAT AGAIN!',
  'score': 0.6813532114028931},
 {'id': 9943,
  'tweets': "The ties, shirts and suits at Macy's are doing fantastically well-check out the new designs and low prices-nothing better!",
  'score': 0.6791173815727234

In [27]:
def get_tweet_only(resultsimilarity):
    only_tweet_similar = []
    for elem in resultsimilarity:
        only_tweet_similar.append({'tweets':df.text[elem[0]]})
    #print(tweet_similar)
    return only_tweet_similar

In [28]:
get_tweet_only(example_result_model)

[{'tweets': '"@Boarcane: DonaldTrump We need you to run and WIN! A true patriot with only the country\'s best interest at heart. http://twitter.com/Boarcane/status/578358741441052673/photo/1pic.twitter.com/6LIcHFMx2G\xa0'},
 {'tweets': 'Ben Carson was speaking in general terms as to what he would do if confronted with a gunman, and was not criticizing the victims. Not fair!'},
 {'tweets': '.@DennisRodman re @Omarosa is right, she’s becoming predictable.'},
 {'tweets': 'A friend is one who has the same enemies as you have. -- Abraham Lincoln'},
 {'tweets': 'MAKE AMERICA GREAT AGAIN!'},
 {'tweets': "The ties, shirts and suits at Macy's are doing fantastically well-check out the new designs and low prices-nothing better!"},
 {'tweets': '"@TedatACA: I concur with @realDonaldTrump on Pinehurst. IMO: Those blasts from sandy areas looked like IEDs exploding In Kandahar Province.'},
 {'tweets': 'It is only the people that were never asked to be VP that tell the press that they will not take th