In [34]:
import numpy as np
import pandas as pd
import re
from string import punctuation
from time import process_time
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata
import preprocessor as p

In [35]:
class Preprocess_Data():
    """Cleaning and normalisation pipeline for raw tweet text.

    Intended flow: ``read_data`` -> ``clean_data`` -> ``preprocess_data``.
    ``clean_data`` removes tweet artefacts, symbols, punctuation, accents and
    digits; ``preprocess_data`` lower-cases, drops stopwords and lemmatizes.
    """

    # ----------------------------------------- Constructor -----------------------------------------

    def __init__(self):
        """Set up the punctuation set, lemmatizer, tweet-cleaner options and stopwords."""
        self.punctuation = set(punctuation)
        self.lemmatizer = WordNetLemmatizer()
        # Module-level configuration of the tweet cleaner: these option flags
        # select what p.clean() handles (URLs, emojis, mentions, reserved
        # words, smileys).
        p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)

        # Negations and question words are deliberately kept in the text, so
        # they are taken out of the stopword collection.
        kept_words = {'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', 'what', 'which', 'who',
                      'whom', 'why', 'how', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
                      "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
                      "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                      "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
                      "wouldn't", 'don', "don't"}

        # Kept as a set (not a list) so the per-token membership test in
        # preprocess_data is O(1).
        self.stopword_list = set(stopwords.words('english')) - kept_words

    # ----------------------------------------- Read Data -----------------------------------------

    def read_data(self, path):
        """Read the tweet CSV at ``path``.

        Only the ``user_screen_name``, ``text`` and ``hashtags`` columns are
        loaded.

        Returns:
            pandas.DataFrame with those three columns.
        """
        return pd.read_csv(path, usecols=['user_screen_name', 'text', 'hashtags'])

    # ----------------------------------------- Clean Data -----------------------------------------

    def clean_data(self, tweets):
        """Clean every tweet in ``tweets``.

        Args:
            tweets: iterable of raw tweet strings. Missing values (``None`` or
                float NaN, as produced by empty CSV cells) are cleaned to ``''``
                so the output stays positionally aligned with the input.

        Returns:
            list of cleaned strings, one per input element, in the same order.
        """
        return [self._clean_text(text) for text in tweets]

    def _clean_text(self, text):
        """Clean a single tweet and return the normalised string."""
        # Missing CSV cells arrive as None / NaN (NaN is the only value that is
        # not equal to itself); other non-strings are stringified.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        if not isinstance(text, str):
            text = str(text)

        # Clean tweet (token kinds configured in __init__)
        text = p.clean(text)

        # Remove literal "\xNN" escape sequences left in the text. Only the
        # escape itself is removed; the previous pattern also discarded
        # everything that followed it on the line.
        text = re.sub(r'\\x[0-9A-Fa-f]{2}', '', text)
        # Literal "\n" / "\t" escape sequences become spaces
        text = re.sub(r'\\[nt]', ' ', text)

        # Remove accents / fold to ASCII. This runs BEFORE punctuation removal
        # because NFKD can expand a non-ASCII character into ASCII punctuation
        # (e.g. the ellipsis character becomes "..."), which must still be
        # stripped below.
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

        # Remove special characters
        text = re.sub("[@#$%^&*)(}{|/><=+=_:\"\\\\]+", " ", text)
        # Remove punctuation marks
        text = "".join(ch for ch in text if ch not in self.punctuation)

        # Splitting Hashtag words: "FarmersProtest" -> "Farmers Protest"
        text = " ".join(part for part in re.split('([A-Z][a-z]+)', text) if part)

        # Remove numbers, then collapse whitespace last so that removed digits
        # cannot leave doubled, leading or trailing spaces behind.
        text = re.sub('[0-9]+', '', text)
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    # ----------------------------------------- Preprocess Data -----------------------------------------

    def preprocess_data(self, tweets):
        """Lower-case, remove stopwords from and lemmatize each cleaned tweet.

        Args:
            tweets: iterable of cleaned tweet strings.

        Returns:
            list of space-joined lemmatized tokens, one string per input tweet.
        """
        preprocessed_tweets = []
        for text in tweets:
            # Remove stopwords
            tokens = [tok for tok in text.lower().split() if tok not in self.stopword_list]
            # Text Lemmatization
            preprocessed_tweets.append(" ".join(self._lemmatize(tok) for tok in tokens))
        return preprocessed_tweets

    def _lemmatize(self, word):
        """Lemmatize ``word`` as a noun, then as a verb, then as an adjective."""
        for pos in ("n", "v", "a"):
            word = self.lemmatizer.lemmatize(word, pos=pos)
        return word

In [36]:
# Build the preprocessing helper; its constructor sets up the stopword collection,
# the lemmatizer and the tweet-cleaner options used by the cells below.
pre = Preprocess_Data()

In [37]:
# CSV with the raw tweets to load.
input_path = './raw_tweets.csv'
# NOTE(review): output_path is not referenced by any cell visible here — confirm
# whether the final export was meant to use it.
output_path = './processed_tweets.csv'

# Load the user_screen_name, text and hashtags columns and preview the first rows.
data = pre.read_data(input_path)
data.head()

Unnamed: 0,user_screen_name,text,hashtags
0,Bhupinder295,RT @Tractor2twitr_P: RO systems which wr once ...,"['FarmersProtest', 'ZiraSanjhaMorcha']"
1,Cultural_Pendu,RT @_MohitGahlot_: Those who talk of doubling ...,[]
2,Jag22946452,RT @Tractor2twitr_P: RO systems which wr once ...,"['FarmersProtest', 'ZiraSanjhaMorcha']"
3,Aman_Kaur45,RT @_MohitGahlot_: Those who talk of doubling ...,[]
4,Sandhu_Deep1,RT @_MohitGahlot_: Those who talk of doubling ...,[]


In [38]:
# Extract the tweet bodies as a plain Python list and peek at the first two.
raw_tweets = data['text'].values.tolist()
raw_tweets[:2]

['RT @Tractor2twitr_P: RO systems which wr once part of #FarmersProtest r being installed in #ZiraSanjhaMorcha .\n\nToxic pollution by Malbros…',
 'RT @_MohitGahlot_: Those who talk of doubling the income, are not even able to give their respect fund to the farmers...\n\nApart from the po…']

In [39]:
# Strip tweet artefacts, special characters, punctuation, accents and digits
# from every raw tweet, then show the first two results.
cleaned_tweets = pre.clean_data(raw_tweets)
cleaned_tweets[:2]

['RO systems which wr once part of Farmers Protest r being installed in Zira Sanjha Morcha Toxic pollution by Malbros',
 'Those who talk of doubling the income are not even able to give their respect fund to the farmers Apart from the po']

In [40]:
# Lower-case, remove stopwords and lemmatize the cleaned tweets, then show the first two.
preprocess_tweets = pre.preprocess_data(cleaned_tweets)
preprocess_tweets[:2]

['ro system which wr part farmer protest r instal zira sanjha morcha toxic pollution malbros',
 'who talk double income not even able give respect fund farmer apart po']

In [41]:
# Pair each preprocessed tweet with the hashtags of its source row.
# The tweet column defines the index; hashtags are aligned onto it by label.
f1 = (
    pd.Series(preprocess_tweets, name='tweet')
    .to_frame()
    .assign(hashtags=data['hashtags'])
)

f1.head()

Unnamed: 0,tweet,hashtags
0,ro system which wr part farmer protest r insta...,"['FarmersProtest', 'ZiraSanjhaMorcha']"
1,who talk double income not even able give resp...,[]
2,ro system which wr part farmer protest r insta...,"['FarmersProtest', 'ZiraSanjhaMorcha']"
3,who talk double income not even able give resp...,[]
4,who talk double income not even able give resp...,[]


In [42]:
# Drop rows whose tweet became empty after preprocessing. A single boolean
# mask replaces the iterrows()/drop() loop, which copied the whole frame once
# per removed row.
f1 = f1.loc[f1['tweet'] != '']

In [43]:
# Preview the frame after empty tweets were removed (the CSV export to
# './original/test.csv' is done in a later cell, after de-duplication).
f1.head()

Unnamed: 0,tweet,hashtags
0,ro system which wr part farmer protest r insta...,"['FarmersProtest', 'ZiraSanjhaMorcha']"
1,who talk double income not even able give resp...,[]
2,ro system which wr part farmer protest r insta...,"['FarmersProtest', 'ZiraSanjhaMorcha']"
3,who talk double income not even able give resp...,[]
4,who talk double income not even able give resp...,[]


In [45]:
# Keep the first row seen for each distinct hashtags value and renumber the
# rows from 0. reset_index(drop=True) discards the old index directly, which
# replaces the in-place reset followed by dropping the generated 'index' column.
x = f1.drop_duplicates(subset=['hashtags']).reset_index(drop=True)
# NOTE: the ./original directory must already exist; to_csv does not create it.
x.to_csv('./original/test.csv')
x.shape

(180, 2)