In [23]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [24]:
# Read the training and test tweet CSVs (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_E6oV3lV.csv', 'test_tweets_anuFYb8.csv')
)

In [25]:
# Preview the first rows of the training frame (columns: id, label, tweet).
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


Combining test and train data

In [26]:
# Stack train and test into one frame so the cleaning steps run once over both.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True rebuilds a fresh 0..n-1 index.
totalData = pd.concat([train, test], ignore_index=True)

In [27]:
def remove_pattern(input_txt, pattern):
    """Delete every match of a regular expression from a string.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` removed.
    """
    # Substitute the pattern itself rather than re-using each matched text as a
    # new regex: matched text can contain metacharacters ("(", "+", ...) that
    # raise or match the wrong thing, and a short match such as "@user" would
    # also strip the prefix of "@username", leaving "name" behind.
    return re.sub(pattern, '', input_txt)

Removing Twitter handles (i.e. @user mentions) from totalData

In [28]:
# Strip every "@handle" from the raw tweets into a new clean_tweet column.
# The pattern is a raw string: "\w" in a plain string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
totalData['clean_tweet'] = np.vectorize(remove_pattern)(totalData['tweet'], r"@[\w]*")

In [29]:
# Compare tweet with clean_tweet to confirm the @handles were removed.
totalData.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


Removing numbers, punctuation and special characters (everything except letters and '#')

In [30]:
# Replace every character that is not a letter or '#' with a space.
# regex=True is required: since pandas 2.0 Series.str.replace defaults to
# regex=False, which would search for the literal text "[^a-zA-Z#]" and
# leave the tweets unchanged.
totalData['clean_tweet'] = totalData['clean_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [31]:
# Inspect clean_tweet after punctuation and digits were replaced by spaces.
totalData.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide society now #motivation


Removing short words (3 characters or fewer)

In [32]:
def _keep_long_words(text):
    """Return text keeping only whitespace-separated words longer than 3 characters."""
    return ' '.join(word for word in text.split() if len(word) > 3)


# Drop short words (length <= 3) from every cleaned tweet.
totalData['clean_tweet'] = totalData['clean_tweet'].map(_keep_long_words)

In [33]:
# Inspect clean_tweet after the short words were dropped.
totalData.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause they offer wheelchai...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model love take with time
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation


Tokenization

In [34]:
# Split each cleaned tweet on whitespace into a list of tokens.
tokenized_tweet = totalData['clean_tweet'].map(str.split)
tokenized_tweet.head()

0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, cause, they, offer, wh...
2                              [bihday, your, majesty]
3                     [#model, love, take, with, time]
4                   [factsguide, society, #motivation]
Name: clean_tweet, dtype: object

Stemming
i.e. stripping suffixes such as "ing" and "ly" from words, since the core meaning does not change

In [35]:
# Import only the name that is used; a star import pulls every public name of
# the module into the notebook namespace and hides where PorterStemmer comes from.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Stem every token of every tweet (e.g. "dysfunctional" -> "dysfunct").
tokenized_tweet = tokenized_tweet.apply(lambda tokens: [stemmer.stem(token) for token in tokens])
tokenized_tweet.head()

0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, caus, they, offer, whee...
2                              [bihday, your, majesti]
3                     [#model, love, take, with, time]
4                         [factsguid, societi, #motiv]
Name: clean_tweet, dtype: object

Stitch them back together

In [36]:
# Join each token list back into a single space-separated string.
# An element-wise apply replaces the Python loop over range(len(...)), which
# wrote through label-based `series[i] = ...` and so relied on the index being
# exactly 0..n-1.
tokenized_tweet = tokenized_tweet.apply(' '.join)

totalData['clean_tweet'] = tokenized_tweet
tokenized_tweet.head()

0    when father dysfunct selfish drag kid into dys...
1    thank #lyft credit caus they offer wheelchair ...
2                                  bihday your majesti
3                           #model love take with time
4                             factsguid societi #motiv
Name: clean_tweet, dtype: object

Now that we have completed the data cleaning process, let's begin data visualization

Draw a word cloud of word frequencies in the combined dataset

In [37]:
# Concatenate every cleaned tweet into one corpus string for the cloud.
all_words = ' '.join(totalData['clean_tweet'])
from wordcloud import WordCloud
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

ImportError: DLL load failed: The specified module could not be found.

Try this with positive tweets (label 0)

In [None]:
# Corpus built from the cleaned tweets whose label is 0.
positive_words = ' '.join(totalData.loc[totalData['label'] == 0, 'clean_tweet'])
from wordcloud import WordCloud
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(positive_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

Try this with negative/racist tweets (label 1)

In [None]:
# Corpus built from the cleaned tweets whose label is 1.
negative_words = ' '.join(totalData.loc[totalData['label'] == 1, 'clean_tweet'])
from wordcloud import WordCloud
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(negative_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
from nltk.corpus import stopwords

# Build a set of NLTK's English stop words for fast membership tests.
stop_words = {word for word in stopwords.words('english')}

In [None]:
# Re-tokenize the stitched tweets and drop every token found in stop_words.
# NOTE(review): clean_tweet holds Porter-stemmed tokens at this point, so stop
# words whose stem differs from the dictionary form may survive this filter;
# consider filtering before stemming.
tokenized_tweet = totalData['clean_tweet'].map(str.split)
filtered_sentence = tokenized_tweet.map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

filtered_sentence.head()

In [None]:
# Join each filtered token list back into a single space-separated string.
# An element-wise apply replaces the Python loop over range(len(...)), which
# wrote through label-based `series[i] = ...` and so relied on the index being
# exactly 0..n-1.
filtered_sentence = filtered_sentence.apply(' '.join)

totalData['clean_tweet'] = filtered_sentence
totalData.head()