In [79]:
import numpy as np
import pandas as pd
import re
from string import punctuation
from time import process_time
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata
import preprocessor as p

In [80]:
class Preprocess_Data():
    
    # ----------------------------------------- Constructor -----------------------------------------
    
    def __init__(self):
        self.punctuation = set(punctuation)
        self.lemmatizer = WordNetLemmatizer()
        p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
        self.stopword_list = set(stopwords.words('english'))
        unwanted_stopwords = {'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', 'what', 'which', 'who',
                              'whom', 'why', 'how', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
                              "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
                              "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                              "shouldn't", 'wasn',"wasn't",'weren', "weren't", 'won', "won't", 'wouldn',
                              "wouldn't", 'don', "don't"}

        self.stopword_list = [x for x in self.stopword_list if x not in unwanted_stopwords]
       
    
    # ----------------------------------------- Read Data -----------------------------------------
    
    def read_data(self, path):
        df = pd.read_csv(path, usecols=['user_id', 'tweet', 'hashtags'])
        return df
    
    
    # ----------------------------------------- Clean Data -----------------------------------------
    
    def clean_data(self, tweets):
        cleaned_tweets = []
        for text in tweets:
            # Clean tweet
            text = p.clean(text)
            # Remove special characters
            text = re.sub(r'(\\x(.)*)', '',text)
            text = re.sub(r'\\n|\\t|\\n\\n', ' ', text)
            text = re.sub(r"b'RT|b'|b RT|b\"RT|b", "", text)
            text = re.sub("[@#$%^&*)(}{|/><=+=_:\"\\\\]+"," ",text).strip() 
            #Remove punctuation marks
            text = "".join(x for x in text if x not in self.punctuation)
            # Remove accented words
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            # Splitting Hashtag words
            text = " ".join([x for x in re.split('([A-Z][a-z]+)', text) if x])
            # Remove long spaces
            pattern = r'^\s*|\s\s*'
            text = re.sub(pattern, ' ', text).strip()
            # Remove numbers
            text = re.sub('[0-9]+', '', text)
            
            cleaned_tweets.append(text)
        
        return cleaned_tweets
    
    
    # ----------------------------------------- Preprocess Data -----------------------------------------
    
    def preprocess_data(self, tweets):
        preprocessed_tweets = []
        for text in tweets:
            
            # Remove stopwords
            text = " ".join(x for x in text.lower().split() if x not in self.stopword_list)
            
            # Text Lemmatization
            lemmatized_words = []
            for word in text.split():
                word1 = self.lemmatizer.lemmatize(word, pos="n")
                word2 = self.lemmatizer.lemmatize(word1, pos="v")
                word3 = self.lemmatizer.lemmatize(word2, pos=("a"))
                lemmatized_words.append(word3)
            text = " ".join(x for x in lemmatized_words)
            
            preprocessed_tweets.append(text)
            
        return preprocessed_tweets

In [81]:
pre = Preprocess_Data()

In [82]:
input_path = './dataset/raw_dataset/khalistan_main.csv'
output_path = './processed_tweets.csv'

data = pre.read_data(input_path)
data.head()

Unnamed: 0,user_id,tweet,hashtags
0,b'ChopraDilpreet',b'This #PIL filed against #Twitter for promoti...,"[{'text': 'PIL', 'indices': [5, 9]}, {'text': ..."
1,b'jmez1010',"b""RT @Jasleen_Kaur11: \xf0\x9f\xa4\xa3\xf0\x9f...","[{'text': 'Kashmir_With_India', 'indices': [85..."
2,b'jmez1010',"b""RT @DarrenVirk: #Pakistan around partition a...","[{'text': 'Pakistan', 'indices': [16, 25]}, {'..."
3,b'jmez1010',b'RT @Harbaks21769227: #Pannun has no shame le...,"[{'text': 'Pannun', 'indices': [21, 28]}, {'te..."
4,b'jmez1010',"b""RT @Jasleen_Kaur11: Pannun, a Pakistani pupp...",[]


In [83]:
raw_tweets = data.tweet.values.tolist()
raw_tweets[:2]

["b'This #PIL filed against #Twitter for promoting #Khalistan is a good initiative. #SikhCommunity is just trying to sa\\xe2\\x80\\xa6 https://t.co/w8OrEmslpu'",
 'b"RT @Jasleen_Kaur11: \\xf0\\x9f\\xa4\\xa3\\xf0\\x9f\\x98\\x82Now Coming this is a next plan Pakistan  puppet pannun team\'s. #Kashmir_With_India #ShameOnSFJ #ShameOnPannun #Khalis\\xe2\\x80\\xa6"']

In [84]:
cleaned_tweets = pre.clean_data(raw_tweets)
cleaned_tweets[:2]

['This PIL filed against Twitter for promoting Khalistan is a good initiative Sikh Community is just trying to sa',
 '']

In [85]:
preprocess_tweets = pre.preprocess_data(cleaned_tweets)
preprocess_tweets[:2]

['pil file twitter promote khalistan good initiative sikh community try sa',
 '']

In [86]:
f2 = pd.DataFrame(columns = ['tweet','hashtags', 'label'])

f2['tweet'] = pd.Series(preprocess_tweets)
f2['hashtags'] = data.tweet.apply(lambda x: re.findall(r"#(\w+)", x))
f2['label'] = 0;

f2.head()

Unnamed: 0,tweet,hashtags,label
0,pil file twitter promote khalistan good initia...,"[PIL, Twitter, Khalistan, SikhCommunity]",0
1,,"[Kashmir_With_India, ShameOnSFJ, ShameOnPannun...",0
2,pakistan around partition weve never see efore...,"[Pakistan, Gurdwara]",0
3,pannun no shame leave,"[Pannun, Khalistan]",0
4,pannun pakistani puppet who work pakistan see ...,[Shame],0


In [87]:
for index, row in f2.iterrows():
    # Check if the 'B' column is empty
    if (row['tweet'] == ''):
        # Drop the row
        f2 = f2.drop(index)

In [88]:
# f2.to_csv('./original/train.csv')
f2.head(10)

Unnamed: 0,tweet,hashtags,label
0,pil file twitter promote khalistan good initia...,"[PIL, Twitter, Khalistan, SikhCommunity]",0
2,pakistan around partition weve never see efore...,"[Pakistan, Gurdwara]",0
3,pannun no shame leave,"[Pannun, Khalistan]",0
4,pannun pakistani puppet who work pakistan see ...,[Shame],0
5,pil file twitter promote khalistan good initia...,"[PIL, Twitter, Khalistan, SikhCommunity]",0
7,pakistan around partition weve never see efore...,"[Pakistan, Gurdwara]",0
8,pannun no shame leave,"[Pannun, Khalistan]",0
9,pannun pakistani puppet who work pakistan see ...,[Shame],0
10,no khalistan khai garage dear sikh quick reali...,"[Khalistan, Khai]",0
11,pannun not listen real sikh imaginary world do...,"[Pannun, RealSikhs, SikhCommunity]",0


In [89]:
x = f2.drop_duplicates(subset=['tweet'])
x.reset_index(inplace=True)
x = x.drop('index', axis=1)
x.to_csv('./original/train.csv')
x.shape

(1172, 3)