In [1]:
##### Purpose of this notebook: prepare the tweet data and train the Gensim-based Paragraph2Vec / Doc2Vec models

In [None]:
import re

import pandas as pd
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [2]:
# Load the public tweet dump.
# NOTE(review): the Sentiment140 file of this name ships WITHOUT a header row
# (fields: polarity, id, date, query, user, text). Without header=None pandas
# would consume the first tweet as column names and silently drop it — confirm
# that the local copy has no header line.
F = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
)

In [3]:
# Keep only the two useful columns (positions 4 and 5) and give them readable names.
FilteredData_Main = F.iloc[:, 4:6].copy()
FilteredData_Main.columns = ["UserName", "Tweet"]

# Retain only the rows of users who posted more than 100 tweets.
main_tweets_per_user = FilteredData_Main["UserName"].value_counts()
main_is_active_user = FilteredData_Main["UserName"].map(main_tweets_per_user) > 100
FilteredData_Main = FilteredData_Main[main_is_active_user]

In [7]:
# Build the corpus used to train the Gensim model for the User-Doc approach.
# User-Doc works on individual tweets, so users are restricted by tweet count:
# only users with more than 10 and fewer than 20 tweets are kept.
CorpusForUserDoc = F.iloc[:, 4:6].copy()
CorpusForUserDoc.columns = ["UserName", "Tweet"]

# A user's rows are kept or dropped together, so one combined mask selects the
# same rows as applying the two bounds one after the other.
userdoc_tweets_per_user = CorpusForUserDoc["UserName"].map(
    CorpusForUserDoc["UserName"].value_counts()
)
CorpusForUserDoc = CorpusForUserDoc[
    (userdoc_tweets_per_user < 20) & (userdoc_tweets_per_user > 10)
]

In [8]:
# One row per author with the list of that author's distinct tweets.
# A single groupby pass replaces the per-author scan of the whole frame, and
# drop_duplicates keeps tweets in first-seen order, so the result is the same
# on every run (a set() would shuffle the tweets between kernel sessions).
userdoc_unique_tweets = CorpusForUserDoc.drop_duplicates(subset=["UserName", "Tweet"])
userDocCorp_df = (
    userdoc_unique_tweets
    .groupby("UserName", sort=False)["Tweet"]   # sort=False: authors in first-appearance order
    .agg(list)
    .rename("alltweets")
    .rename_axis("author")
    .reset_index()
)

In [7]:

def gethashtags(tweetlist):
    """Return the distinct hashtags (without the leading '#') found in the tweets.

    Each element of ``tweetlist`` is converted with ``str()`` before matching.
    The result is a list of unique tags in no particular order.
    """
    unique_tags = {
        tag
        for entry in tweetlist
        for tag in re.findall(r"#(\w+)", str(entry))
    }
    return list(unique_tags)
def getmentionedUsers(tweetlist):
    """Return the distinct user names mentioned with '@' in the tweets.

    Each element of ``tweetlist`` is converted with ``str()`` before matching.
    The names are returned without the leading '@', in no particular order.
    """
    mentioned = set()
    for entry in tweetlist:
        mentioned.update(re.findall(r"@(\w+)", str(entry)))
    return list(mentioned)

In [8]:
def preprocessRemoveURL(tweetlist):
    """Return a copy of the tweets with http(s) URLs removed.

    Parameters
    ----------
    tweetlist : iterable
        Tweets to clean; every element is converted with ``str()`` first.

    Returns
    -------
    list of str
        One cleaned string per input element, in the same order.

    Notes
    -----
    Only the URL itself (plus one whitespace character directly after it) is
    removed. The previous implementation cut the tweet at the first occurrence
    of "http", which also discarded all text following the link. For tweets
    whose link is the last token the result is unchanged.
    """
    # Raw string avoids the invalid-escape warning for '\S' / '\s'.
    url_pattern = re.compile(r'https?://\S+\s?', flags=re.IGNORECASE)
    return [url_pattern.sub('', str(tweet)) for tweet in tweetlist]

def removeUserMentioned(tweetlist):
    """Return the tweets with every '@...' token removed.

    Each element is converted with ``str()``; an '@' and all non-whitespace
    characters following it are deleted. Surrounding whitespace is kept.
    """
    mention_pattern = re.compile(r'@[^\s]+')
    return [mention_pattern.sub('', str(entry)) for entry in tweetlist]

In [17]:
# Preprocess the User-Doc corpus: extract hashtags and mentioned users from the
# raw tweets, and build 'selectedtweets' by stripping URLs and then @mentions.
userdoc_raw_tweets = userDocCorp_df['alltweets']
userDocCorp_df['hashtags'] = [gethashtags(tweets) for tweets in userdoc_raw_tweets]
userDocCorp_df['selectedtweets'] = [preprocessRemoveURL(tweets) for tweets in userdoc_raw_tweets]
userDocCorp_df['MentionedUsers'] = [getmentionedUsers(tweets) for tweets in userdoc_raw_tweets]
userDocCorp_df['selectedtweets'] = [
    removeUserMentioned(tweets) for tweets in userDocCorp_df['selectedtweets']
]

In [18]:
# Prepare the corpus for the Post-Doc approach (Gensim Doc2Vec model).
# The tweets go through the same cleaning helpers as the User-Doc data
# (URL removal, then @mention removal) instead of a hand-copied regex, and the
# column is read directly rather than row by row with iterrows().
corpus_totrainmodel = removeUserMentioned(
    preprocessRemoveURL(FilteredData_Main["Tweet"].tolist())
)
# De-duplicate while keeping first-seen order. The position of a document in
# `corpus` becomes its Doc2Vec tag, so the order must be the same on every run;
# list(set(...)) would reshuffle it between kernel sessions.
corpus = list(dict.fromkeys(corpus_totrainmodel))

In [19]:
# Wrap every corpus document as a TaggedDocument: lower-cased word tokens,
# tagged with the document's position in `corpus` (as a string).
tagged_data = []
for doc_index, document in enumerate(corpus):
    doc_tokens = word_tokenize(document.lower())
    tagged_data.append(TaggedDocument(words=doc_tokens, tags=[str(doc_index)]))

In [21]:
# Hyper-parameters of the Post-Doc Doc2Vec model.
max_epoch = 100
vec_size = 300  # also evaluated against 200, 400 and 500; performed best with 300 dimensions
alpha = 0.025

model = Doc2Vec(
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=1,
    dm=1,
)

In [22]:
# Build the model vocabulary from the tagged Post-Doc documents; this must run before training.
model.build_vocab(tagged_data)

In [23]:
class _PostDocEpochMonitor(CallbackAny2Vec):
    """Print a progress line per epoch and checkpoint the model periodically.

    Parameters
    ----------
    checkpoint_every : int
        A checkpoint is written after every epoch whose zero-based index is a
        multiple of this value (so epoch 0 is always checkpointed).
    """

    def __init__(self, checkpoint_every):
        self.checkpoint_every = checkpoint_every
        self.epoch = 0  # zero-based index of the epoch currently running

    def on_epoch_begin(self, model):
        print('iteration{0}'.format(self.epoch))

    def on_epoch_end(self, model):
        if (self.epoch % self.checkpoint_every) == 0:
            model.save("d2v_" + str(self.epoch) + "T.model")
            print("Model Saved")
        self.epoch += 1


# Train with ONE call and let gensim handle the epochs and the learning-rate
# decay (linear from model.alpha down to model.min_alpha over all epochs).
# Calling train() inside a Python loop with epochs=model.epochs ran
# max_epoch * model.epochs passes, and editing model.alpha / model.min_alpha by
# hand replaced the decay with an ad-hoc schedule and left those altered values
# stored in the saved model, where infer_vector() uses them as defaults.
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=max_epoch,
    callbacks=[_PostDocEpochMonitor(checkpoint_every=50)],
)
model.save("d2vFinal.model")      # Saving the model for later usage
print("Model Saved")

iteration0
Model Saved
iteration1
iteration2
iteration3
iteration4
iteration5
iteration6
iteration7
iteration8
iteration9
iteration10
iteration11
iteration12
iteration13
iteration14
iteration15
iteration16
iteration17
iteration18
iteration19
iteration20
iteration21
iteration22
iteration23
iteration24
iteration25
iteration26
iteration27
iteration28
iteration29
iteration30
iteration31
iteration32
iteration33
iteration34
iteration35
iteration36
iteration37
iteration38
iteration39
iteration40
iteration41
iteration42
iteration43
iteration44
iteration45
iteration46
iteration47
iteration48
iteration49
iteration50
Model Saved
iteration51
iteration52
iteration53
iteration54
iteration55
iteration56
iteration57
iteration58
iteration59
iteration60
iteration61
iteration62
iteration63
iteration64
iteration65
iteration66
iteration67
iteration68
iteration69
iteration70
iteration71
iteration72
iteration73
iteration74
iteration75
iteration76
iteration77
iteration78
iteration79
iteration80
iteration81
it

In [32]:
# Load the trained model for the Post-Doc method from disk.
post_doc_model_file = "d2vFinal.model"
model = Doc2Vec.load(post_doc_model_file)

In [34]:
# Build one test document by joining the cleaned tweets of the user at position 1.
# NOTE(review): the original read from `user_tweet_df`, which is never defined in
# this notebook and fails on a fresh kernel. `userDocCorp_df` is the only frame
# here that has a 'selectedtweets' column — confirm this is the intended source.
test_example = ' '.join(userDocCorp_df['selectedtweets'].iloc[1])

In [36]:
# Tokenize the test document the same way as the training corpus: lower-case, then word_tokenize.
test_example_lower = test_example.lower()
test_data = word_tokenize(test_example_lower)

In [37]:
# Infer an embedding for the unseen test document with the loaded Post-Doc model.
# NOTE(review): inference is an iterative approximation, so repeated calls may
# return slightly different vectors — confirm whether more epochs are needed.
test_vector = model.infer_vector(test_data)

In [25]:
# Prepare the corpus for the User-Doc approach.
# 'selectedtweets' holds, per user, a LIST of tweets that already had URLs and
# @mentions removed. User-Doc embeds individual tweets, so the per-user lists
# are flattened into one list of tweets. (Calling str() on the list, as before,
# turned each user into a single document made of the list's Python repr,
# including brackets, quotes and commas.)
corpus_totrainmodel = [
    str(tweet)
    for user_tweets in userDocCorp_df["selectedtweets"]
    for tweet in user_tweets
]
# De-duplicate while keeping first-seen order so that the enumerate()-based
# document tags are the same on every run.
corpus = list(dict.fromkeys(corpus_totrainmodel))

In [28]:
# Wrap every User-Doc corpus document as a TaggedDocument: lower-cased word
# tokens, tagged with the document's position in `corpus` (as a string).
tagged_data = []
for doc_index, document in enumerate(corpus):
    doc_tokens = word_tokenize(document.lower())
    tagged_data.append(TaggedDocument(words=doc_tokens, tags=[str(doc_index)]))

In [29]:
# User2Doc sub-model description
# The goal is to generate a matrix in which each row is the representation of
# one user; this matrix is passed as input to an AutoEncoder to reduce its
# dimensionality.
# For each user 20 tweets are considered, and each tweet is passed through the
# Gensim model to generate a 50-dimensional vector.
# These 50-dimensional vectors are then concatenated to form a 1000-dimensional
# vector for each user.
max_epoch = 100
vec_size = 50
alpha = 0.025

model = Doc2Vec(
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=1,
    dm=1,
)

In [32]:
# Build the model vocabulary from the tagged User-Doc documents; this must run before training.
model.build_vocab(tagged_data)

In [33]:
# Train the tweet2vec model that produces a 50-dimensional embedding per tweet.
class _Tweet2VecEpochMonitor(CallbackAny2Vec):
    """Print a progress line per epoch and checkpoint the model periodically.

    Parameters
    ----------
    checkpoint_every : int
        A checkpoint is written after every epoch whose zero-based index is a
        multiple of this value (so epoch 0 is always checkpointed).
    """

    def __init__(self, checkpoint_every):
        self.checkpoint_every = checkpoint_every
        self.epoch = 0  # zero-based index of the epoch currently running

    def on_epoch_begin(self, model):
        print('iteration{0}'.format(self.epoch))

    def on_epoch_end(self, model):
        if (self.epoch % self.checkpoint_every) == 0:
            model.save("Tweet2Vec" + str(self.epoch) + ".model")
            print("Model Saved for" + str(self.epoch) + " epochs")
        self.epoch += 1


# Train with ONE call and let gensim handle the epochs and the learning-rate
# decay (linear from model.alpha down to model.min_alpha over all epochs).
# Calling train() inside a Python loop with epochs=model.epochs ran
# max_epoch * model.epochs passes, and editing model.alpha / model.min_alpha by
# hand replaced the decay with an ad-hoc schedule and left those altered values
# stored in the saved model.
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=max_epoch,
    callbacks=[_Tweet2VecEpochMonitor(checkpoint_every=25)],
)
model.save("Tweet2Vec_Final.model")
print("Final Model Saved")

iteration0
Model Saved for0 epochs
iteration1
iteration2
iteration3
iteration4
iteration5
iteration6
iteration7
iteration8
iteration9
iteration10
iteration11
iteration12
iteration13
iteration14
iteration15
iteration16
iteration17
iteration18
iteration19
iteration20
iteration21
iteration22
iteration23
iteration24
iteration25
Model Saved for25 epochs
iteration26
iteration27
iteration28
iteration29
iteration30
iteration31
iteration32
iteration33
iteration34
iteration35
iteration36
iteration37
iteration38
iteration39
iteration40
iteration41
iteration42
iteration43
iteration44
iteration45
iteration46
iteration47
iteration48
iteration49
iteration50
Model Saved for50 epochs
iteration51
iteration52
iteration53
iteration54
iteration55
iteration56
iteration57
iteration58
iteration59
iteration60
iteration61
iteration62
iteration63
iteration64
iteration65
iteration66
iteration67
iteration68
iteration69
iteration70
iteration71
iteration72
iteration73
iteration74
iteration75
Model Saved for75 epochs