In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import itertools

import re
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer 

from scipy.sparse import csr_matrix

from sklearn.decomposition import LatentDirichletAllocation as LDA

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
class Tweets_LDA():
    
    '''
    Class gets as input raw-data from twitter in csv format.
    There realized methods for data preprocessing and perfoming LDA on the preprocessed data.
    As output - words that are indicators of particular topic for tweets.
    '''
    
    def __init__(self,):
        self.path_to_file = ''
        self.tweets_frame = pd.DataFrame()
        
    def load_data(self,path_to_file):
        self.path_to_file = path_to_file
        self.tweets_frame = pd.read_csv(self.path_to_file)
        
        
    def data_preprocessing(self,date='',channels_not_to_consider=[]):
        '''
        date format example: 'Sat Jun 01 00:00:02'
        '''
        WPT = nltk.WordPunctTokenizer()
        lemmanizer = WordNetLemmatizer()
        stop_word_list = nltk.corpus.stopwords.words('english')
        
        if date!='':
            self.tweets_frame = self.tweets_frame[self.tweets_frame['created_at'].map(lambda x: x[:len(date)])==date]
        if len(channels_not_to_consider)>0:
            self.tweets_frame = self.tweets_frame[self.tweets_frame['screen_name'].isin(channels_not_to_consider)==False]
            self.tweets_frame = self.tweets_frame[self.tweets_frame['screen_name'].map(lambda x: x.startswith('tmj_'))==False]
        
        self.tweets_frame['tweets_processed'] =\
                self.tweets_frame['tweet'].map(lambda x: [lemmanizer.lemmatize(y) for y in re.sub("[\d+0-9.…#!'\"_?,;/:()’%*🤯“”&🧨$🧨🧡]", "", x.lower()).split() 
                                         if y not in stop_word_list 
                                         and y not in ['・・・','','-']
                                         and not y.startswith('http')
                                         and not y.startswith('@')])
        
        self.tweets_frame['tweets_processed'] =\
                self.tweets_frame['tweets_processed'].map(lambda x: [y for y in x if len(y)>2])
        
        self.tweets_frame = self.tweets_frame[self.tweets_frame['tweets_processed'].map(len)>0]
        
    def LDA(self,n_topics):
        
        def create_co_occurences_matrix(allowed_words, documents):
            word_to_id = dict(zip(allowed_words, range(len(allowed_words))))
            documents_as_ids = [np.sort([word_to_id[w] for w in doc if w in word_to_id]).astype('uint32') for doc in documents]
            row_ind, col_ind = zip(*itertools.chain(*[[(i, w) for w in doc] for i, doc in enumerate(documents_as_ids)]))
            data = np.ones(len(row_ind), dtype='uint32')  # use unsigned int for better memory utilization
            max_word_id = max(itertools.chain(*documents_as_ids)) + 1
            docs_words_matrix = csr_matrix((data, (row_ind, col_ind)), shape=(len(documents_as_ids), max_word_id))  # efficient arithmetic operations with CSR * CSR
            words_cooc_matrix = docs_words_matrix.T * docs_words_matrix  # multiplying docs_words_matrix with its transpose matrix would generate the co-occurences matrix
            words_cooc_matrix.setdiag(0)
            return words_cooc_matrix, word_to_id 
        
        def print_topics(model, count_vectorizer, n_top_words,words):
            #words = cross_tab.columns
            for topic_idx, topic in enumerate(model.components_):
                print("\nTopic #%d:" % topic_idx)
                print(" ".join([words[i]+':'+str(round(topic[i]/topic.max(),4))
                                for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
        merged = list(itertools.chain.from_iterable(self.tweets_frame['tweets_processed'].values))
        merged_cnts = np.unique(merged,return_counts=True)
        words = merged_cnts[0][merged_cnts[1]>2]
        a, b = create_co_occurences_matrix(words,self.tweets_frame['tweets_processed'].values)
        lda_ = LDA(n_components=n_topics)
        lda_.fit(a)
        print_topics(lda_,' ',10,words)
        
    

In [20]:
# tweets_frame = pd.read_csv('../data/tweets/new_york_training_tweets_15_06.csv')

In [21]:
a = Tweets_LDA()

In [22]:
a.load_data('../data/tweets/new_york_training_tweets_15_06.csv')

In [23]:
len(a.tweets_frame)

332548

In [27]:
channels_not_to_consider = [#traffic
        '511NY', '511NYC', 'TotalTrafficNYC', '511nyNJ', '511NYMidHudson',
       'Reported_NYC', '511ny456', '511nyAlbany', '511nyACE', '511ny123', 
        '511nyBDFV', '511nyWNY', '511nyRochester',
       '511nyLongIsland', 'AllKindsWeather', 
       '511ny7',
    #jobs
    'tmj_nyc_jobs', 'tmj_nyc_adv', 'CalvaryHospJobs',
       'tmj_RAM_cstsrv', 'tmj_nyc_mgmt', 'tmj_roc_cler',
       'tmj_NAS_edu', 'tmj_nyc_it', 'tmj_NYC_schn', 'tmj_NJN_cstsrv',
       'tmj_nwk_retail', 'tmj_nyc_nursing', 'tmj_nyc_sales', 'tmj_nyc_cler',
       'USSJobs', 'tmj_nwk_sales', 'tmj_nyc_legal', 'tmj_nwk_eng',
       'tmj_nyc_retail', 'tmj_nyc_transp', 'WGPNursingJobs', 'tmj_nwk_socsci',
       'tmj_NYC_adm', 'nwkmeddevice', 'tmj_nyc_edu', 'tmj_nwk_secure',
       'nwknursing',
    'tmj_nyc_finance', 'tmj_NYC_secure', 'tmj_nyc_acct', 'tmj_nwk_cler',
       'tmj_nyc_banking', 'tmj_ny_hrta', 'tmj_nwk_mgmt', 'tmj_nwk_auto',
       'tmj_nyc_cstsrv', 'tmj_nyc_health', 'nwkhealth', 'tmj_nwk_cstsrv',
       'tmj_nyc_manuf', 'tmj_cte_nursing', 'tmj_nyc_eng', 'tmj_nyc_itpm1',
       'tmj_nyc_hr', 'tmj_NY_sales', 'CVSHealthJobs', 'tmj_NAS_mgmt',
       'CompassJobBoard', 'tmj_nyc_hrta', 'tmj_NAS_nursing', 'BostonMarketJob',
       'tmj_nyc_labor', 'MetsAvenue', 'tmj_nwk_schn', 'tmj_RAM_nursing',
       'tmj_nwk_acct', 'tmj_nwk_jobs', 'tmj_nyc_art', 'tmj_nwk_labor',
       'tmj_roc_eng', 'tmj_NAS_transp', 'CA_ROC_Jobs2', 'tmj_nyc_cosmo',
       'tmj_RAM_edu', 'tmj_NAS_health', 'tmj_nwk_facmgmt', 'tmj_NAS_facmgmt',
       'GodivaJobs', 'tmj_RAM_acct', 'tmj_roc_health', 'tmj_nyc_itdb',
       'tmj_nwk_transp', 'tmj_nwk_edu', 'tmj_RAM_retail', 'tmj_RAM_mgmt',
       'tmj_NAS_socsci', 'tmj_nwk_prod', 'tmj_nyc_realest', 'tmj_NJ_facmgmt', 'tmj_njn_retail',
       'tmj_roc_nursing', 'tmj_nwk_finance', 'Fly_Sistah', 'tmj_NYC_skltrd',
       'tmj_nya_nursing', 'tmj_nwk_web', 'tmj_roc_cstsrv', 'tmj_nys_jobs',
       'tmj_njc_hrta', 'tmj_NAS_retail', 'tmj_roc_hrta',
       'ChurchCathy', 'tmj_NAS_secure', 'tmj_RAM_art', 'tmj_NAS_labor',
       'tmj_NAS_physici', 'tmj_nwk_skltrd', 'tmj_roc_sales', 'tmj_nwk_purch',
       'tmj_NYS_NURSING', 'tmj_nwk_physici', 'tmj_njn_hrta', 'Mezikenyc',
       'JCI_Jobs', 'tmj_NAS_acct', 'tmj_NYC_gensci', 'tmj_nya_eng',
       'tmj_nwk_nonprft', 'tmj_roc_manuf', 'nwkitsupport', 'tmj_NY_LABOR',
       'tmj_ny_mgmt', 'tmj_njn_health',
       'tmj_nj_hrta', 'tmj_NAS_cstsrv', 'tmj_nwk_it', 'tmj_nya_transp',
       'tmj_ct_nursing', 'tmj_NJ_sales', 'tmj_nya_acct',
       'nwkmanuf', 'tmj_nys_cstsrv', 'tmj_njn_nursing',
       'tmj_njn_mgmt', 'cbwaszak', 'tmj_NAS_cler', 'tmj_RAM_auto',
       'tmj_nwk_art'
    'WWEWomenMatter',
    #photos
    'ThomGambino', 'Xsanthemum', 'francesco212', 'Empressjurnee',
       'andrerivera801', 'janice830', 'Ingridebap', 'StevieSoFetch_',
       'EstebanDaHost', 'graceyhanderson', 'bccdny', 'brian_wood_'
       ]

In [29]:
a.data_preprocessing('Sat Jun 01', channels_not_to_consider)

In [32]:
a.tweets_frame['created_at'].value_counts()

Sat Jun 01 16:55:57 +0000 2019    8
Sat Jun 01 21:39:15 +0000 2019    7
Sat Jun 01 02:56:51 +0000 2019    6
Sat Jun 01 03:24:33 +0000 2019    6
Sat Jun 01 17:06:55 +0000 2019    6
Sat Jun 01 19:19:02 +0000 2019    6
Sat Jun 01 16:16:54 +0000 2019    6
Sat Jun 01 18:40:10 +0000 2019    6
Sat Jun 01 14:43:27 +0000 2019    6
Sat Jun 01 23:37:29 +0000 2019    6
Sat Jun 01 00:50:53 +0000 2019    6
Sat Jun 01 21:01:13 +0000 2019    6
Sat Jun 01 00:28:40 +0000 2019    6
Sat Jun 01 20:49:51 +0000 2019    6
Sat Jun 01 02:13:29 +0000 2019    6
Sat Jun 01 16:51:04 +0000 2019    6
Sat Jun 01 15:17:23 +0000 2019    5
Sat Jun 01 19:46:56 +0000 2019    5
Sat Jun 01 13:57:51 +0000 2019    5
Sat Jun 01 02:52:13 +0000 2019    5
Sat Jun 01 00:49:17 +0000 2019    5
Sat Jun 01 13:18:48 +0000 2019    5
Sat Jun 01 18:55:23 +0000 2019    5
Sat Jun 01 22:17:14 +0000 2019    5
Sat Jun 01 21:16:11 +0000 2019    5
Sat Jun 01 23:44:39 +0000 2019    5
Sat Jun 01 16:26:32 +0000 2019    5
Sat Jun 01 17:59:11 +0000 20

In [33]:
len(a.tweets_frame)

53875

In [31]:
a.LDA(20)


Topic #0:
link:1.0 fashion:0.9745 new:0.9715 newyork:0.797 bio:0.7146 amp:0.5999 t-shirt:0.5848 latest:0.5782 show:0.5707 classic:0.5641

Topic #1:
one:1.0 time:0.9434 like:0.9012 ive:0.728 get:0.6884 year:0.688 last:0.6632 day:0.6307 good:0.5629 today:0.5565

Topic #2:
bronx:1.0 like:0.9875 avenue:0.987 one:0.6976 get:0.6975 nxttakeover:0.693 long:0.655 class:0.5378 tonight:0.5173 girl:0.514

Topic #3:
like:1.0 dont:0.4947 get:0.3808 people:0.3473 look:0.3252 lol:0.3077 know:0.2855 think:0.2772 even:0.2631 would:0.2464

Topic #4:
america:1.0 bill:0.8836 hbo:0.8532 tit:0.8227 sayin:0.8166 claiming:0.8043 unofficial:0.7954 realtime:0.783 readthatagain:0.7629 👬👫👭:0.7629

Topic #5:
happy:1.0 pride:0.8132 month:0.8072 day:0.6471 today:0.5114 amp:0.448 love:0.4464 june:0.3929 great:0.3724 thank:0.35

Topic #6:
like:1.0 dont:0.8217 get:0.72 know:0.5359 shit:0.4898 love:0.4162 one:0.4079 time:0.3957 really:0.3898 people:0.3745

Topic #7:
case:1.0 freeship:0.9669 liverpool:0.7813 champion:0.7

In [None]:
#https://www.google.com/search?q=liverpool+%D0%BC%D0%B0%D1%82%D1%87&rlz=1C1CHZL_enUA833UA833&oq=liverpool+%D0%BC%D0%B0%D1%82%D1%87+&aqs=chrome..69i57j0l5.8428j0j7&sourceid=chrome&ie=UTF-8

-------------------------------------

In [13]:
alternative_date = Tweets_LDA()

In [14]:
alternative_date.load_data('get-tweets-by-geolocation/data/new_york_training_tweets_15_06.csv')

In [15]:
len(alternative_date.tweets_frame)

332548

In [16]:
alternative_date.tweets_frame['created_at'].map(lambda x: x[:10]).value_counts()

Thu Jun 06    67546
Sat Jun 01    59087
Fri May 31    46951
Wed Jun 05    38703
Wed Jun 12    29989
Sun Jun 02    29291
Thu Jun 13    27292
Fri Jun 07    16485
Tue Jun 04    14549
Thu May 30     2655
Name: created_at, dtype: int64

In [79]:
alternative_date.data_preprocessing('Thu Jun 13',channels_not_to_consider)

In [81]:
len(alternative_date.tweets_frame)

24971

In [82]:
alternative_date.LDA(6)


Topic #0:
trump:1.0 would:0.6534 get:0.6515 people:0.594 know:0.5672 dont:0.5633 one:0.5334 amp:0.5299 like:0.4981 say:0.4798

Topic #1:
amp:1.0 printing:0.6949 free:0.6494 card:0.582 business:0.5169 full:0.4942 new:0.4853 print:0.4441 price:0.4436 flyer:0.3729

Topic #2:
size:1.0 birthday:0.8868 rare:0.8036 white:0.7967 bid:0.7441 party:0.7313 welcome:0.7195 nike:0.711 hurry-up:0.7064 preowned:0.6657

Topic #3:
construction:1.0 town:0.9676 bloomfield:0.8781 direction:0.7982 line:0.7671 pulse:0.6917 year:0.4173 street:0.3945 center:0.3859 road:0.3562

Topic #4:
mph:1.0 wind:0.885 humidity:0.7158 -gt:0.6552 weather:0.5592 pressure:0.5485 sky:0.5244 cloud:0.507 clear:0.5029 current:0.4776

Topic #5:
cup:1.0 blue:0.8811 stanley:0.6443 boston:0.5735 stanleycup:0.5678 game:0.5438 win:0.524 team:0.464 louis:0.372 year:0.3688

Topic #6:
time:1.0 amp:0.5801 one:0.4295 real:0.3862 would:0.3442 create:0.2982 year:0.2931 people:0.2919 wednesday:0.2906 need:0.2825

Topic #7:
game:1.0 love:0.5552 

In [83]:
alternative_date.LDA(6)


Topic #0:
like:1.0 dont:0.6956 get:0.6158 one:0.6083 love:0.4831 know:0.4762 time:0.4445 people:0.4058 good:0.3962 see:0.3933

Topic #1:
new:1.0 york:0.6643 nyc:0.2856 day:0.2594 amp:0.2339 night:0.1725 time:0.1715 city:0.1676 great:0.1512 brooklyn:0.1476

Topic #2:
white:1.0 case:0.9274 size:0.9074 freeship:0.7902 rare:0.7414 bid:0.654 nike:0.6265 hurry-up:0.6208 preowned:0.585 amp:0.5696

Topic #3:
birthday:1.0 mph:0.8464 party:0.7657 welcome:0.7554 wind:0.726 humidity:0.6257 bash:0.6252 elbaeverlasting:0.5923 moneymachinewednesdays:0.5923 -gt:0.5707

Topic #4:
blue:1.0 cup:0.9341 game:0.8701 tonight:0.6438 stanleycup:0.6419 stanley:0.592 today:0.5614 boston:0.5469 win:0.5324 team:0.5231

Topic #5:
like:1.0 amp:0.9911 get:0.8736 dont:0.841 trump:0.7667 people:0.6034 know:0.5431 would:0.5285 time:0.528 want:0.5156


In [None]:
#https://ru.wikipedia.org/wiki/%D0%9F%D0%BB%D0%B5%D0%B9-%D0%BE%D1%84%D1%84_%D0%9A%D1%83%D0%B1%D0%BA%D0%B0_%D0%A1%D1%82%D1%8D%D0%BD%D0%BB%D0%B8_2019

#### Channels filtration

In [59]:
a.tweets_frame[a.tweets_frame['tweets_processed'].map(lambda x: 'traffic' in x 
                                                      or 'lane' in x 
                                                      or 'incident' in x 
                                                      or 'blocked' in x)]\
                                                    ['screen_name'].value_counts().index[:30]

Index(['richmintz', 'SeannyFK', 'JoshyTweetz', 'Lilsunshinegurl', 'hBencee',
       'MichaelRMyers5', 'scumbagking__', 'BrownBagCycling', 'dchambersDPM',
       'Ernzcognito', 'TheOnlyMikeQ', 'oheydiids', 'AshleyKrista', 'valoria_z',
       'Eli_Rivs', 'VHLiv', 'pair_up_', 'jaydestro', 'so_many_amys', 'bjota13',
       'Graphix_Divine7', 'johnnybebad6661', 'hannahcomedian',
       'JoeyIannitelli', 'RichardPriem', '511nyAdirondack', 'TotalTrafficPHL',
       'LivingLegend_23', 'TKYSK8R', 'GormoExJourno'],
      dtype='object')

In [62]:
a.tweets_frame[a.tweets_frame['tweets_processed'].map(lambda x: 'job' in x 
                                                      or 'hiring' in x 
                                                      or 'link' in x
                                                     or 'apply' in x)]\
                                                    ['screen_name'].value_counts().index[:10]

Index(['DianaLaRosa3', 'AppelHowie', 'ArrestALSNow', 'tmj_njs_hrta', 'Iam_ALW',
       'tmj_nyc_pharm', 'tmj_RAM_itpm', 'tmj_cte_sales', 'NYPDPSA8',
       'tmj_nwk_defben'],
      dtype='object')

In [63]:
a.tweets_frame[a.tweets_frame['tweets_processed'].map(lambda x: 'photo' in x 
                                                      or 'new' in x 
                                                      or 'york' in x
                                                     or 'amp' in x)]\
                                                    ['screen_name'].value_counts().index[:12]

Index(['Guy173', 'Aussiethunda', 'DonMcKenzie', 'cpklapper', 'sdmack',
       'andresflava', 'JustJoeyLopez', 'emoleechen', 'oalgarin',
       'CharlesJHernan2', 'HippieHooper', 'tariqalhadi'],
      dtype='object')

In [64]:
a.tweets_frame[a.tweets_frame['tweets_processed'].map(lambda x: 'like' in x )]\
                                                    ['screen_name'].value_counts().index[:5]

Index(['WWEWomenMatter', 'almighty_red', 'riordainn', 'hammertime1009',
       'kareemthagreat'],
      dtype='object')