In [2]:
import multiprocessing as mp
import os
import re
import string

import en_core_web_sm
import numpy as np
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
#from normalise import normalise

# Load the `en_core_web_sm` spaCy pipeline once at module level;
# TextPreprocessor._preprocess_text calls this shared `nlp` object on every text.
nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn style transformer that cleans short texts.

    Every text goes through the following steps, in order:
        1. Removal of ``@``-prefixed runs and ``http(s)://`` links
        2. Whitespace collapsing and lowercasing
        3. Emoji removal
        4. Tokenization with the module-level spaCy pipeline ``nlp``
        5. Punctuation removal
        6. Stop words removal
        7. Lemmatization (lemmas of one or two characters are dropped)

    Parameters
    ----------
    variety : str
        Format of date (AmE - american type, BrE - british format). Only
        used by ``_normalize``, which the pipeline currently does not call.
    user_abbrevs : dict or None
        User abbreviation mappings (from the normalise package). ``None``
        (the default) means "no abbreviations" and avoids sharing one mutable
        default dict between all instances.
    n_jobs : int
        Parallel jobs to run: ``<= -1`` uses one worker per CPU core,
        ``0`` or ``1`` processes everything in the current process, and any
        other positive value is capped at the number of CPU cores.

    NOTE(review): the parallel path pickles ``self`` for the worker
    processes. When this class is defined interactively (e.g. in a notebook
    cell) the workers may be unable to import it, depending on the
    multiprocessing start method -- confirm, or move the class to a module.
    """

    # Characters that make a token count as punctuation in ``_remove_punct``.
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self,
                 variety="BrE",
                 user_abbrevs=None,
                 n_jobs=1):
        # Parameters are stored untouched so that BaseEstimator.get_params()
        # and sklearn.base.clone() keep working.
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs
        self.emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           # NOTE(review): the next range runs from U+24C2 to
                           # U+1F251 and therefore matches every code point in
                           # between as well (e.g. CJK ideographs at
                           # U+4E00-U+9FFF), not only emoji. Left unchanged to
                           # preserve the existing output.
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, *_):
        """Preprocess every text in ``X``.

        Parameters
        ----------
        X : pandas.Series of str
            Raw texts; the input is not modified. Missing values (None/NaN)
            become empty strings.

        Returns
        -------
        pandas.Series
            Cleaned texts, with the same index and order as ``X``.
        """
        X_copy = X.copy()
        n_items = len(X_copy)

        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            partitions = 1
        else:
            partitions = min(self.n_jobs, cores)
        # Never create more chunks than there are texts.
        partitions = min(partitions, n_items)

        # A single partition gains nothing from a process pool, so skip the
        # spawn/pickle overhead and work in the current process.
        if partitions <= 1:
            return X_copy.apply(self._preprocess_text)

        # Split by position with iloc rather than handing the pandas object
        # to np.array_split, so each chunk is guaranteed to stay a Series.
        index_groups = np.array_split(np.arange(n_items), partitions)
        data_split = [X_copy.iloc[idx] for idx in index_groups]

        # One worker per partition; the context manager releases the pool
        # even when a worker raises or the run is interrupted.
        with mp.Pool(partitions) as pool:
            processed = pool.map(self._preprocess_part, data_split)

        return pd.concat(processed)

    def _preprocess_part(self, part):
        """Preprocess one chunk (a Series) inside a worker process."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Run the full cleaning pipeline on one text and return a string."""
        # Missing values (None / float NaN) would make re.sub raise a
        # TypeError; treat them as empty text instead.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        #normalized_text = self._normalize(text)
        removed_add = self._remove_add(text)
        removed_spaces = self._remove_spaces(removed_add)
        removed_emoji = self._remove_emoji(removed_spaces)
        doc = nlp(removed_emoji)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        """Best-effort normalisation via the ``normalise`` package.

        Currently not called by ``_preprocess_text``. ``normalise`` has to be
        imported by the surrounding module; if the name is missing or the
        call fails, the text is returned unchanged.
        """
        # some issues in normalise package
        try:
            return ' '.join(normalise(text,
                                      variety=self.variety,
                                      user_abbrevs=self.user_abbrevs or {},
                                      verbose=False))
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # KeyboardInterrupt / SystemExit still propagate.
            return text

    def _remove_add(self, text):
        """Strip every run that starts with ``@`` or ``http(s)://``."""
        return re.sub(r"(?:\@|https?\://)\S+", "", text)

    def _remove_spaces(self, text):
        """Collapse all whitespace runs to single spaces and lowercase."""
        return ' '.join(text.split()).lower()

    def _remove_emoji(self, text):
        """Delete every character matched by ``self.emoji_pattern``."""
        return self.emoji_pattern.sub(r'', text)

    def _remove_punct(self, doc):
        """Drop tokens made up solely of ``string.punctuation`` characters.

        A plain ``t.text not in string.punctuation`` is a substring test, so
        tokens such as ``...`` or ``?!`` would slip through it; checking every
        character removes them too.
        """
        punct = self._PUNCT_CHARS
        return [t for t in doc if not all(ch in punct for ch in t.text)]

    def _remove_stop_words(self, doc):
        """Drop tokens that spaCy flags as stop words."""
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        """Join the lemmas that are longer than two characters with spaces."""
        return ' '.join([t.lemma_ for t in doc if len(t.lemma_) > 2])

In [None]:
# Inspect the characters that TextPreprocessor._remove_punct compares tokens against.
string.punctuation

In [3]:
# Directory with the raw tweet dump. Set the TWEETS_RAW_DIR environment
# variable to point at your own copy; the fallback is the original local path.
path = os.environ.get('TWEETS_RAW_DIR',
                      r'/Users/andrei/PycharmProjects/TWITTER_1/raw_data')


# The file is tab-separated with '\n' as the row terminator.
data = pd.read_csv(os.path.join(path, 'tweets_only.csv'),
                   sep='\t', lineterminator='\n')
data.head()

Unnamed: 0,id,username,tweet
0,1.349144e+18,qod_gme,@MaryTravels78 If it’s any consolation ... it ...
1,1.349144e+18,trey50daniel,@jennkruza @RodAlzmann About to be the stimmy ...
2,1.349143e+18,marytravels78,"@QOD_GME Of not, there should be..."
3,1.34914e+18,dantzfrye,I start my day off everyday with 0$ But I won’...
4,1.349139e+18,tmyrbrgh,@investing_city About to to be $GME Gamestop


In [81]:
#tweets = data[['id','date_time','user_id','username','tweet']].copy(deep=True)

In [4]:
# n_jobs=-1 asks the transformer to split the work across all CPU cores.
processor = TextPreprocessor(n_jobs = -1)

In [5]:
import time

# Time the preprocessing of the whole 'tweet' column. perf_counter() is the
# clock meant for measuring intervals; time.time() follows the wall clock and
# can jump if the system time is adjusted.
start = time.perf_counter()

data['proceed_tweet'] = processor.transform(data['tweet'])

# Elapsed seconds; kept in `a` because a later cell prints it again.
a = time.perf_counter() - start

# The message is Russian for "Time spent on preprocessing the tweets:".
print('Время затраченное на предобработку твитов:', a)

KeyboardInterrupt: 

In [None]:
# Show the elapsed preprocessing time again; `a` is set by the timing cell above.
# The message is Russian for "Time spent on preprocessing the tweets:".
print('Время затраченное на предобработку твитов:', a)

In [None]:
# Spot-check 10 randomly chosen rows (unseeded, so the rows differ between runs).
data.sample(10)

In [None]:
# Write the whole frame as a tab-separated file, without the index column.
data.to_csv('data/ONLY_TWEETS_wo_emoji.csv', sep='\t', index=False)

In [121]:
# Save a random subset of at most 10,000 rows (the file name suggests a test
# set). The size is capped at len(data) because DataFrame.sample raises when
# asked for more rows than exist without replacement, and random_state makes
# the subset reproducible across runs.
data.sample(min(10000, len(data)), random_state=42).to_csv(
    'data_main/TEST_TWEETS_wo_emoji.csv', sep='\t', index=False)