In [1]:
import pandas as pd 
import nltk
import re
import string
from nltk.corpus import stopwords
import datetime
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("twitter_sentiments.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
df.shape

(31962, 3)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [5]:
df["label"].unique()

array([0, 1], dtype=int64)

# Normalize

In [6]:
df["tweet"] = df["tweet"].str.lower()
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


# Remove HTML Tags

In [8]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r"", text)

df['tweet'] = df['tweet'].apply(remove_html_tags)

# Remove url

In [9]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['tweet'] = df['tweet'].apply(remove_url)

# Remove punctuation

In [10]:
exclude = string.punctuation
def remove_punct(text):
    return text.translate(str.maketrans("", "", exclude))

df['tweet'] = df['tweet'].apply(remove_punct)

In [11]:
df[["tweet"]]

Unnamed: 0,tweet
0,user when a father is dysfunctional and is so...
1,user user thanks for lyft credit i cant use ca...
2,bihday your majesty
3,model i love u take with u all the time in u...
4,factsguide society now motivation
...,...
31957,ate user isz that youuuðððððð...
31958,to see nina turner on the airwaves trying to...
31959,listening to sad songs on a monday morning otw...
31960,user sikh temple vandalised in in calgary wso ...


# Emojy handling

In [12]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df["tweet"] = df["tweet"].apply(remove_emoji)

In [13]:
df[["tweet"]]

Unnamed: 0,tweet
0,user when a father is dysfunctional and is so...
1,user user thanks for lyft credit i cant use ca...
2,bihday your majesty
3,model i love u take with u all the time in u...
4,factsguide society now motivation
...,...
31957,ate user isz that youuuðððððð...
31958,to see nina turner on the airwaves trying to...
31959,listening to sad songs on a monday morning otw...
31960,user sikh temple vandalised in in calgary wso ...


# Remove Stop words

In [14]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

# Capture start time
start_time = datetime.datetime.now()

# remove stop words
df["tweet"] = df["tweet"].apply(remove_stopwords)

# Capture end time
end_time = datetime.datetime.now()

time_diff = (end_time - start_time).total_seconds() / 60.0

print("Time difference in minutes:", time_diff)

Time difference in minutes: 1.4849942666666667


# Stemmer

In [15]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

# Capture start time
start_time = datetime.datetime.now()

# Do some processing here...
df["tweet"] = df["tweet"].apply(stem_words)

# Capture end time
end_time = datetime.datetime.now()

time_diff = (end_time - start_time).total_seconds() / 60.0

print("Time difference in minutes:", time_diff)


Time difference in minutes: 0.07805933333333333


# Tokenization

In [16]:
# Capture start time
start_time = datetime.datetime.now()

# Do some processing here...
df["tweet_tokens"] = df["tweet"].apply( word_tokenize )

# Capture end time
end_time = datetime.datetime.now()

time_diff = (end_time - start_time).total_seconds() / 60.0

print("Time difference in minutes:", time_diff)


Time difference in minutes: 0.04097858333333334


In [17]:
df.head()

Unnamed: 0,id,label,tweet,tweet_tokens
0,1,0,user father dysfunct selfish drag kid dysfunct...,"[user, father, dysfunct, selfish, drag, kid, d..."
1,2,0,user user thank lyft credit cant use caus dont...,"[user, user, thank, lyft, credit, cant, use, c..."
2,3,0,bihday majesti,"[bihday, majesti]"
3,4,0,model love u take u time urð± ðððð...,"[model, love, u, take, u, time, urð±, ðð..."
4,5,0,factsguid societi motiv,"[factsguid, societi, motiv]"


# Lemmatization

In [18]:
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]   

# Capture start time
start_time = datetime.datetime.now()

# Do some processing here...
df["lemma"] = df["tweet_tokens"].apply(lemmatize_tokens)

# Capture end time
end_time = datetime.datetime.now()

time_diff = (end_time - start_time).total_seconds() / 60.0

print("Time difference in minutes:", time_diff)

Time difference in minutes: 0.041943966666666666


In [19]:
df.head()

Unnamed: 0,id,label,tweet,tweet_tokens,lemma
0,1,0,user father dysfunct selfish drag kid dysfunct...,"[user, father, dysfunct, selfish, drag, kid, d...","[user, father, dysfunct, selfish, drag, kid, d..."
1,2,0,user user thank lyft credit cant use caus dont...,"[user, user, thank, lyft, credit, cant, use, c...","[user, user, thank, lyft, credit, cant, use, c..."
2,3,0,bihday majesti,"[bihday, majesti]","[bihday, majesti]"
3,4,0,model love u take u time urð± ðððð...,"[model, love, u, take, u, time, urð±, ðð...","[model, love, u, take, u, time, urð±, ðð..."
4,5,0,factsguid societi motiv,"[factsguid, societi, motiv]","[factsguid, societi, motiv]"


In [20]:
def join_lemma(tokens):
    return " ".join(tokens)
df["tweet"] = df["lemma"].apply(join_lemma)

In [21]:
df.head()

Unnamed: 0,id,label,tweet,tweet_tokens,lemma
0,1,0,user father dysfunct selfish drag kid dysfunct...,"[user, father, dysfunct, selfish, drag, kid, d...","[user, father, dysfunct, selfish, drag, kid, d..."
1,2,0,user user thank lyft credit cant use caus dont...,"[user, user, thank, lyft, credit, cant, use, c...","[user, user, thank, lyft, credit, cant, use, c..."
2,3,0,bihday majesti,"[bihday, majesti]","[bihday, majesti]"
3,4,0,model love u take u time urð± ðððð...,"[model, love, u, take, u, time, urð±, ðð...","[model, love, u, take, u, time, urð±, ðð..."
4,5,0,factsguid societi motiv,"[factsguid, societi, motiv]","[factsguid, societi, motiv]"


# Bag of words

In [22]:
cv = CountVectorizer()

In [23]:
bow = cv.fit_transform(df['tweet'])

In [24]:
#vocabulary
print(cv.vocabulary_)

{'user': 34604, 'father': 11799, 'dysfunct': 10399, 'selfish': 29125, 'drag': 10107, 'kid': 18377, 'run': 28332, 'thank': 32559, 'lyft': 20331, 'credit': 8312, 'cant': 6312, 'use': 34597, 'caus': 6584, 'dont': 9958, 'offer': 23992, 'wheelchair': 35875, 'van': 34715, 'pdx': 25106, 'disapoint': 9611, 'getthank': 13612, 'bihday': 4650, 'majesti': 20526, 'model': 21782, 'love': 19949, 'take': 32068, 'time': 33158, 'urð': 34576, 'factsguid': 11576, 'societi': 30341, 'motiv': 22083, '22': 642, 'huge': 16069, 'fan': 11685, 'fare': 11717, 'big': 4590, 'talk': 32098, 'leav': 19085, 'chao': 6801, 'pay': 25045, 'disput': 9700, 'get': 13583, 'allshowandnogo': 2234, 'camp': 6230, 'tomorrow': 33390, 'dannyâ': 8820, 'next': 23163, 'school': 28844, 'year': 36837, 'examsð': 11342, 'think': 32946, 'exam': 11338, 'hate': 15082, 'imagin': 16553, 'actorslif': 1659, 'revolutionschool': 27819, 'girl': 13728, 'land': 18841, 'allin': 2204, 'cav': 6589, 'champion': 6764, 'cleveland': 7338, 'clevelandcavali': 73

# N-Gram

In [25]:
cv = CountVectorizer(ngram_range=(2,2))

In [26]:
bow = cv.fit_transform(df['tweet'])

In [27]:
#vocabulary
print(cv.vocabulary_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

