In [2]:
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

path = '/Users/thaveesha/Developer/nltk_data'

# nltk.download('stopwords', download_dir=path)
# nltk.download('wordnet', download_dir=path)
# nltk.download('punkt', download_dir=path)

In [3]:
df = pd.read_csv('../datasets/kaggle_datasets/depression-tweets.csv', encoding='latin-1')
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [4]:
df.columns

Index(['target', 'ids', 'date', 'flag', 'user', 'text'], dtype='object')

In [5]:
df.dtypes

target     int64
ids        int64
date      object
flag      object
user      object
text      object
dtype: object

In [6]:
df.shape

(1599999, 6)

In [7]:
df.target.unique()

array([0, 4])

In [8]:
df = df.drop(['ids', 'date', 'flag', 'user'], axis = 'columns')
df.head()

Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [9]:
# swap the position of the columns
df = df.loc[:, ['text', 'target']]
df.head()

Unnamed: 0,text,target
0,is upset that he can't update his Facebook by ...,0
1,@Kenichan I dived many times for the ball. Man...,0
2,my whole body feels itchy and like its on fire,0
3,"@nationwideclass no, it's not behaving at all....",0
4,@Kwesidei not the whole crew,0


In [10]:
df['target'] = df['target'].replace(4, 1)
df.tail()

Unnamed: 0,text,target
1599994,Just woke up. Having no school is the best fee...,1
1599995,TheWDB.com - Very cool to hear old Walt interv...,1
1599996,Are you ready for your MoJo Makeover? Ask me f...,1
1599997,Happy 38th Birthday to my boo of alll time!!! ...,1
1599998,happy #charitytuesday @theNSPCC @SparksCharity...,1


In [11]:
df.nunique()

text      1581465
target          2
dtype: int64

In [12]:
df['target'].unique()

array([0, 1])

In [13]:
def remove_punctuation(text):
    # check if the input is a string
    if isinstance(text, str):
        # create a translation table with punctuation marks mapped to None
        translator = str.maketrans('', '', string.punctuation)
        # remove punctuation using the translation table
        return text.translate(translator)
    else:
        # if the input is not a string, return the input unchanged
        return text

# apply the remove_punctuation function to the 'A' column
df['text'] = df['text'].apply(lambda x: remove_punctuation(x))
df.head()

Unnamed: 0,text,target
0,is upset that he cant update his Facebook by t...,0
1,Kenichan I dived many times for the ball Manag...,0
2,my whole body feels itchy and like its on fire,0
3,nationwideclass no its not behaving at all im ...,0
4,Kwesidei not the whole crew,0


In [14]:
nltk.data.path.append('/Users/thaveesha/Developer/nltk_data')

# Tokenize sentences into words
df['text'] = df['text'].apply(lambda x: word_tokenize(x.lower()))

# Remove stop words
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])

df

Unnamed: 0,text,target
0,"[upset, cant, update, facebook, texting, might...",0
1,"[kenichan, dived, many, times, ball, managed, ...",0
2,"[whole, body, feels, itchy, like, fire]",0
3,"[nationwideclass, behaving, im, mad, cant, see]",0
4,"[kwesidei, whole, crew]",0
...,...,...
1599994,"[woke, school, best, feeling, ever]",1
1599995,"[thewdbcom, cool, hear, old, walt, interviews,...",1
1599996,"[ready, mojo, makeover, ask, details]",1
1599997,"[happy, 38th, birthday, boo, alll, time, tupac...",1


In [15]:
df.to_csv('../datasets/cleaned_datasets/cleaned_twitter_dataset_without_lemmatization.csv', header=True, encoding='utf-8', index=False)
df = pd.read_csv('../datasets/cleaned_datasets/cleaned_twitter_dataset_without_lemmatization.csv')
df

Unnamed: 0,text,target
0,"['upset', 'cant', 'update', 'facebook', 'texti...",0
1,"['kenichan', 'dived', 'many', 'times', 'ball',...",0
2,"['whole', 'body', 'feels', 'itchy', 'like', 'f...",0
3,"['nationwideclass', 'behaving', 'im', 'mad', '...",0
4,"['kwesidei', 'whole', 'crew']",0
...,...,...
1599994,"['woke', 'school', 'best', 'feeling', 'ever']",1
1599995,"['thewdbcom', 'cool', 'hear', 'old', 'walt', '...",1
1599996,"['ready', 'mojo', 'makeover', 'ask', 'details']",1
1599997,"['happy', '38th', 'birthday', 'boo', 'alll', '...",1


In [None]:
# Apply Lemmatization
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

df.tail()

In [15]:
df.to_csv('../datasets/cleaned_datasets/cleaned_twitter_dataset.csv', header=True, encoding='utf-8', index=False)
df = pd.read_csv('../datasets/cleaned_datasets/cleaned_twitter_dataset.csv')
df

Unnamed: 0,text,target
0,"['upset', 'cant', 'update', 'facebook', 'texti...",0
1,"['kenichan', 'dived', 'many', 'time', 'ball', ...",0
2,"['whole', 'body', 'feel', 'itchy', 'like', 'fi...",0
3,"['nationwideclass', 'behaving', 'im', 'mad', '...",0
4,"['kwesidei', 'whole', 'crew']",0
...,...,...
1599994,"['woke', 'school', 'best', 'feeling', 'ever']",1
1599995,"['thewdbcom', 'cool', 'hear', 'old', 'walt', '...",1
1599996,"['ready', 'mojo', 'makeover', 'ask', 'detail']",1
1599997,"['happy', '38th', 'birthday', 'boo', 'alll', '...",1
