# 1. Importing libraries

In [11]:
import pandas as pd

from textblob import Word

import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


# 2. Importing dataset

In [12]:
col_list = ["date", "tweet"]

df = pd.read_csv("dataset/basic-cleaning.csv", usecols=col_list)


In [13]:
# Print dataframe
df.style.set_properties(subset=['tweet'], **{'width': '800px'}).hide_index()


date,tweet
2019-04-01,Honestly same rn. His behavior is excused so much when it’s unacceptable I want pizza but it’s so late idk what to do.
2019-04-02,"FAIR The boob aches. Just hanging from my chest makes them hurt so baddddd Do you ever think of something really mean you did a long time ago and it physically pains you? Actual footage of me. This is so cute! Congrats on the new tat! Help, I’m depressed and all I’ve wanted to eat for weeks is mac n cheese. I’m a Libra and that’s why I wanna get it out. Heck yes you are!!! ❤️ Smort *laughs in Pittsburgh* Pissy emailing is so fun. I have been the got it person tho lol YASSSS ❤️🧡💛💚💙💜💖 Both are so cute!!!"
2019-04-03,Ok! Had to try :)) Are there any left?!?
2019-04-04,"I love that Zu is such an integral part of my being, like I’m a real ass mom. Nah you’re just too close to see how great you are Foods, Rihanna, and an incredible shoe collection Yeah lol gotta just own it at that point DnD, k-pop, cute face Bohemian style, record store, AMAZING selfie game. I was running late this morning, but you betcher ass I still stopped to pet a teeny tiny Pomeranian puppy on my way to the bus stop. PRIORITIES. There’s a song that helps lol I only remember the first 150 bc I am weak. Lol they are all the tiny battle monsters ClefairyArticuno TogepiHaunterElectabuzzRaichu IvysaurNidorinaExeggcute"
2019-04-05,"This is what gets me out of bed in the morning tbh Lol it is certainly a lot. Lucy lawless, Rachel weiscz, endless stream of stabbing incredible women Captain Marvel, awesome fashion, happy selfies Kim K reaction pics, reading people within an inch of their life, and Kanye I could run a sweet art business from home js ❤️🧡💛💚💙💜 Also iPhone has a parasite🦠 emoji??? Hahaha like a parasite... wait no"
2019-04-06,"Ppl like this get so defensive when you tell them to just watch the movie. It’s like I know the same amount as you. I want one of those hot toys dolls real bad rn and idk why. Send help, I’m gonna retail therapy myself to death."
2019-04-08,
2019-04-09,NO WAY I just watched the Hulu special on Nxium the other day!!!! Basically if I die DRAG HER AT MY FUNERAL This bus driver is hitting so many curbs and has already slammed the brakes so hard I bashed my knee on the seat. Kinda sad we didn’t get to see Joy Behar commit assault and battery on live tv
2019-04-10,
2019-04-11,I had a gross dog story to tell and I asked my coworker if he wanted to hear something gross bc i was thinking about it NOT THINKING THAT I JUST CAME OUT OF THE BATHROOM AND IT WOULD BE WEIRD and now I’m mortified and want to die.


# 3. NLTK preprocessing

In [14]:
# drop NaN

df.dropna(subset = ["tweet"], inplace=True)
df

Unnamed: 0,date,tweet
0,2019-04-01,Honestly same rn. His behavior is excused so m...
1,2019-04-02,FAIR The boob aches. Just hanging from my ch...
2,2019-04-03,Ok! Had to try :)) Are there any left?!?
3,2019-04-04,I love that Zu is such an integral part of my ...
4,2019-04-05,This is what gets me out of bed in the mornin...
...,...,...
240,2020-03-04,P: how do you feel?Me: Fast Car by Tracy Chapm...
241,2020-03-05,Um-the cutest?!?!
244,2020-03-09,
246,2020-03-12,Top tier taste right there.


In [15]:

# 3.1 Removing stopwords
stop = stopwords.words('english')
df['tweet'] = df['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# 3.2 Stemming
st = PorterStemmer()
df['tweet'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

# 3.5 Lemmatization
df['tweet'] = df['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    
# 3.6 Count the tweet words in each row 

df['word_count'] = df['tweet'].apply(lambda x: len(str(x).split(" ")))

# 3.7 Tokenization
df['tokens'] = df['tweet'].apply(lambda x: str(x).split(" "))

# 3.8 Count the tokenized words in each tweet
df['tokens_count'] = df['tokens'].apply(lambda x: len(str(x).split(" ")))



In [16]:
# write to file after final preprocessing 
#df.to_csv("dataset/nltk-preprocessing.csv", mode='w', index=None)
df.to_csv("dataset/final.csv", mode='w', index=None)

In [17]:
# Print
df.style.set_properties(subset=['tweet','tokens'], **{'width': '800px'}).hide_index()


date,tweet,word_count,tokens,tokens_count
2019-04-01,Honestly rn. His behavior excused much it’s unacceptable I want pizza it’s late idk do.,15,"['Honestly', 'rn.', 'His', 'behavior', 'excused', 'much', 'it’s', 'unacceptable', 'I', 'want', 'pizza', 'it’s', 'late', 'idk', 'do.']",15
2019-04-02,"FAIR The boob aches. Just hanging chest make hurt baddddd Do ever think something really mean long time ago physically pain you? Actual footage me. This cute! Congrats new tat! Help, I’m depressed I’ve wanted eat week mac n cheese. I’m Libra that’s I wanna get out. Heck yes are!!! ❤️ Smort *laughs Pittsburgh* Pissy emailing fun. I got person tho lol YASSSS ❤️🧡💛💚💙💜💖 Both cute!!!",66,"['FAIR', 'The', 'boob', 'aches.', 'Just', 'hanging', 'chest', 'make', 'hurt', 'baddddd', 'Do', 'ever', 'think', 'something', 'really', 'mean', 'long', 'time', 'ago', 'physically', 'pain', 'you?', 'Actual', 'footage', 'me.', 'This', 'cute!', 'Congrats', 'new', 'tat!', 'Help,', 'I’m', 'depressed', 'I’ve', 'wanted', 'eat', 'week', 'mac', 'n', 'cheese.', 'I’m', 'Libra', 'that’s', 'I', 'wanna', 'get', 'out.', 'Heck', 'yes', 'are!!!', '❤️', 'Smort', '*laughs', 'Pittsburgh*', 'Pissy', 'emailing', 'fun.', 'I', 'got', 'person', 'tho', 'lol', 'YASSSS', '❤️🧡💛💚💙💜💖', 'Both', 'cute!!!']",66
2019-04-03,Ok! Had try :)) Are left?!?,6,"['Ok!', 'Had', 'try', ':))', 'Are', 'left?!?']",6
2019-04-04,"I love Zu integral part being, like I’m real as mom. Nah you’re close see great Foods, Rihanna, incredible shoe collection Yeah lol gotta point DnD, k-pop, cute face Bohemian style, record store, AMAZING selfie game. I running late morning, betcher as I still stopped pet teeny tiny Pomeranian puppy way bus stop. PRIORITIES. There’s song help lol I remember first 150 bc I weak. Lol tiny battle monster ClefairyArticuno TogepiHaunterElectabuzzRaichu IvysaurNidorinaExeggcute",72,"['I', 'love', 'Zu', 'integral', 'part', 'being,', 'like', 'I’m', 'real', 'as', 'mom.', 'Nah', 'you’re', 'close', 'see', 'great', 'Foods,', 'Rihanna,', 'incredible', 'shoe', 'collection', 'Yeah', 'lol', 'gotta', 'point', 'DnD,', 'k-pop,', 'cute', 'face', 'Bohemian', 'style,', 'record', 'store,', 'AMAZING', 'selfie', 'game.', 'I', 'running', 'late', 'morning,', 'betcher', 'as', 'I', 'still', 'stopped', 'pet', 'teeny', 'tiny', 'Pomeranian', 'puppy', 'way', 'bus', 'stop.', 'PRIORITIES.', 'There’s', 'song', 'help', 'lol', 'I', 'remember', 'first', '150', 'bc', 'I', 'weak.', 'Lol', 'tiny', 'battle', 'monster', 'ClefairyArticuno', 'TogepiHaunterElectabuzzRaichu', 'IvysaurNidorinaExeggcute']",72
2019-04-05,"This get bed morning tbh Lol certainly lot. Lucy lawless, Rachel weiscz, endless stream stabbing incredible woman Captain Marvel, awesome fashion, happy selfies Kim K reaction pics, reading people within inch life, Kanye I could run sweet art business home j ❤️🧡💛💚💙💜 Also iPhone parasite🦠 emoji??? Hahaha like parasite... wait",50,"['This', 'get', 'bed', 'morning', 'tbh', 'Lol', 'certainly', 'lot.', 'Lucy', 'lawless,', 'Rachel', 'weiscz,', 'endless', 'stream', 'stabbing', 'incredible', 'woman', 'Captain', 'Marvel,', 'awesome', 'fashion,', 'happy', 'selfies', 'Kim', 'K', 'reaction', 'pics,', 'reading', 'people', 'within', 'inch', 'life,', 'Kanye', 'I', 'could', 'run', 'sweet', 'art', 'business', 'home', 'j', '❤️🧡💛💚💙💜', 'Also', 'iPhone', 'parasite🦠', 'emoji???', 'Hahaha', 'like', 'parasite...', 'wait']",50
2019-04-06,"Ppl like get defensive tell watch movie. It’s like I know amount you. I want one hot toy doll real bad rn idk why. Send help, I’m gonna retail therapy death.",31,"['Ppl', 'like', 'get', 'defensive', 'tell', 'watch', 'movie.', 'It’s', 'like', 'I', 'know', 'amount', 'you.', 'I', 'want', 'one', 'hot', 'toy', 'doll', 'real', 'bad', 'rn', 'idk', 'why.', 'Send', 'help,', 'I’m', 'gonna', 'retail', 'therapy', 'death.']",31
2019-04-08,,1,[''],1
2019-04-09,NO WAY I watched Hulu special Nxium day!!!! Basically I die DRAG HER AT MY FUNERAL This bus driver hitting many curb already slammed brake hard I bashed knee seat. Kinda sad didn’t get see Joy Behar commit assault battery live tv,42,"['NO', 'WAY', 'I', 'watched', 'Hulu', 'special', 'Nxium', 'day!!!!', 'Basically', 'I', 'die', 'DRAG', 'HER', 'AT', 'MY', 'FUNERAL', 'This', 'bus', 'driver', 'hitting', 'many', 'curb', 'already', 'slammed', 'brake', 'hard', 'I', 'bashed', 'knee', 'seat.', 'Kinda', 'sad', 'didn’t', 'get', 'see', 'Joy', 'Behar', 'commit', 'assault', 'battery', 'live', 'tv']",42
2019-04-10,,1,[''],1
2019-04-11,I gross dog story tell I asked coworker wanted hear something gross bc thinking NOT THINKING THAT I JUST CAME OUT OF THE BATHROOM AND IT WOULD BE WEIRD I’m mortified want die.,33,"['I', 'gross', 'dog', 'story', 'tell', 'I', 'asked', 'coworker', 'wanted', 'hear', 'something', 'gross', 'bc', 'thinking', 'NOT', 'THINKING', 'THAT', 'I', 'JUST', 'CAME', 'OUT', 'OF', 'THE', 'BATHROOM', 'AND', 'IT', 'WOULD', 'BE', 'WEIRD', 'I’m', 'mortified', 'want', 'die.']",33
