### Libraries

In [97]:
import os
import pandas as pd
import re
pd.options.display.float_format = '{:.2f}'.format
import unicodedata
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [12]:
os.chdir(r"D:")
data = pd.read_csv('iPhone.csv',parse_dates = True)

In [15]:
data.head()

Unnamed: 0,tweet_id,is_retweet,likes,replies,retweeter_userid,retweets,text,timestamp,timestamp_epochs,user_id
0,1001,0,0,0,,0,iDrop News is giving away a free iPhone 11 in ...,10/30/2019 23:59,10/30/2019 23:59,1376342479.0
1,1002,0,0,0,,0,I entered @skinit's iPhone 11 Giveaway for a c...,10/30/2019 23:59,10/30/2019 23:59,118994120.0
2,1003,0,0,0,,0,Enter for Your Chance to Win an iPhone 11 256G...,10/30/2019 23:59,10/30/2019 23:59,1.05668e+18
3,1004,0,56,1,,2,Okay the new iPhone 11 Pro is _really_ good at...,10/30/2019 23:59,10/30/2019 23:59,17113293.0
4,1005,0,1,1,,0,You know it brooo @iPhone11,10/30/2019 23:59,10/30/2019 23:59,1.02368e+18


### Data assessment

In [16]:
# converting datetime column to datetime datatype
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [17]:
# record assessment before removing duplicates
print(data['user_id'].nunique())
print(data['tweet_id'].nunique())
print(data['text'].nunique())

10807
33115
14242


In [29]:
# removing duplicate tweets from dataset
new_data=data.drop_duplicates(subset=['text'],keep='first')
print(new_data['user_id'].nunique())
print(new_data['tweet_id'].nunique())
print(new_data['text'].nunique())
print(new_data.shape)

10169
14242
14242
(14242, 10)


### Cleaning with Regex

In [32]:
# Before cleaning
new_data['text'].iloc[51]

'iPhone 11 Pro Max Case Flower Ultra Hybrid Clear Slim Cover for Apple\n\nbuy from\nhttps://ebay.com.au/itm/202781348904\xa0…\n#iPhone11 #iPhone11Pro #iPhone11ProMax #iphonecase #iphone11case #iphone11procase #iphone11promaxcase #iphone #iphoneflowercase #iphonecases #floracase #iphonelacecasepic.twitter.com/tt19D6jFqO'

In [86]:
def preprocessor(text):
    '''
    definition to remove : HTML markups,
    http urls,special characters & emoticons
    and converting into a lowercase
    
    '''
    
    #substitute HTML markups
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    
    #lowecase and join emoticons
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    
    #remove https://url
    text = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', text)
    
    #remove line breaks
    text = re.sub("\n", "", text)
    
    #non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+',' ', text)
    
    return text

In [81]:
# Clean text
preprocessor(new_data['text'].iloc[51])

'iphone 11 pro max case flower ultra hybrid clear slim cover for apple buy from https ebay com au itm 202781348904 iphone11 iphone11pro iphone11promax iphonecase iphone11case iphone11procase iphone11promaxcase iphone iphoneflowercase iphonecases floracase iphonelacecasepic twitter com tt19d6jfqo'

In [87]:
new_data['text'] = new_data['text'].apply(preprocessor)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [83]:
new_data.head(10)

Unnamed: 0,tweet_id,is_retweet,likes,replies,retweeter_userid,retweets,text,timestamp,timestamp_epochs,user_id
0,1001,0,0,0,,0,idrop news is giving away a free iphone 11 in ...,2019-10-30 23:59:00,10/30/2019 23:59,1376342479.0
1,1002,0,0,0,,0,i entered skinit s iphone 11 giveaway for a ch...,2019-10-30 23:59:00,10/30/2019 23:59,118994120.0
2,1003,0,0,0,,0,enter for your chance to win an iphone 11 256g...,2019-10-30 23:59:00,10/30/2019 23:59,1.05668e+18
3,1004,0,56,1,,2,okay the new iphone 11 pro is _really_ good at...,2019-10-30 23:59:00,10/30/2019 23:59,17113293.0
4,1005,0,1,1,,0,you know it brooo iphone11,2019-10-30 23:59:00,10/30/2019 23:59,1.02368e+18
5,1006,0,82,3,,13,nyc iphone 11 pro max dope pic twitter com zwv...,2019-10-30 23:59:00,10/30/2019 23:59,40538510.0
6,1007,0,1,0,,0,all i want for christmas is iphone 11 haha,2019-10-30 23:58:00,10/30/2019 23:58,1.00136e+18
7,1008,0,0,0,,0,i would love to win or own an iphone 11 or any...,2019-10-30 23:57:00,10/30/2019 23:57,1.14675e+18
8,1009,0,0,0,,0,could the apple card be the key to iphone 11 s...,2019-10-30 23:57:00,10/30/2019 23:57,29269626.0
9,1010,0,0,0,,0,no offense but tener un iphone 11 actually suc...,2019-10-30 23:57:00,10/30/2019 23:57,3266529871.0


In [84]:
# removing duplicate tweets from dataset
new_data=data.drop_duplicates(subset=['text'],keep='first')
print(new_data['user_id'].nunique())
print(new_data['tweet_id'].nunique())
print(new_data['text'].nunique())
print(new_data.shape)

10169
14242
14242
(14242, 10)


### Tokenization -- Stemming -- Stopword removal

In [95]:
# NLTK 
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Thatoi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [92]:
# definition to tokenize sentences into tokens
def tokenizer_porter (text):
    return [porter.stem(word) for word in text.split()]

In [93]:
new_data['token_text']=new_data['text'].apply(tokenizer_porter)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [113]:
# stop words in english to be removed
stop = stopwords.words('english')

In [150]:
# definition to remove stop words
def remove_stopwords(text):
    list_words= []
    for w in text: 
        if w not in stop:
            list_words.append(w)
    return (list_words)

In [151]:
for i in new_data.index:
    new_data.at[i,'processed_word']= remove_stopwords(new_data.at[i,'token_text'])

In [152]:
new_data[['processed_word','token_text']]

Unnamed: 0,processed_word,token_text
0,"[idrop, news, give, away, free, iphon, 11, feb...","[idrop, news, is, give, away, a, free, iphon, ..."
1,"[enter, skinit, iphon, 11, giveaway, chanc, wi...","[i, enter, skinit, s, iphon, 11, giveaway, for..."
2,"[enter, chanc, win, iphon, 11, 256gb, airpod, ...","[enter, for, your, chanc, to, win, an, iphon, ..."
3,"[okay, new, iphon, 11, pro, _really_, good, vi...","[okay, the, new, iphon, 11, pro, is, _really_,..."
4,"[know, brooo, iphone11]","[you, know, it, brooo, iphone11]"
5,"[nyc, iphon, 11, pro, max, dope, pic, twitter,...","[nyc, iphon, 11, pro, max, dope, pic, twitter,..."
6,"[want, christma, iphon, 11, haha]","[all, i, want, for, christma, is, iphon, 11, h..."
7,"[would, love, win, iphon, 11, ani, iphon, mode...","[i, would, love, to, win, or, own, an, iphon, ..."
8,"[could, appl, card, key, iphon, 11, sale, appl...","[could, the, appl, card, be, the, key, to, iph..."
9,"[offens, tener, un, iphon, 11, actual, suck, h...","[no, offens, but, tener, un, iphon, 11, actual..."
