<a href="https://colab.research.google.com/github/Vkm3221/Assignments-Association-Rules/blob/main/project_on_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import time
import string
import warnings
import spacy
from tqdm.notebook import tqdm_notebook

# for all NLP related operations on text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.classify import NaiveBayesClassifier
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

In [None]:
twitter=pd.read_csv("/content/tweet.csv")
data=twitter.copy()

In [None]:
twitter.head()

Unnamed: 0,tweets,class
0,Be aware dirty step to get money #staylight ...,figurative
1,#sarcasm for #people who don't understand #diy...,figurative
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative
3,@wilw Why do I get the feeling you like games?...,figurative
4,-@TeacherArthurG @rweingarten You probably jus...,figurative


In [None]:
twitter.dtypes

tweets    object
class     object
dtype: object

In [None]:
#check the null values
twitter.isnull().any()

tweets    False
class     False
dtype: bool

In [None]:
twitter.shape

(81408, 2)

In [None]:
#Description of dataset
twitter.describe()

Unnamed: 0,tweets,class
count,81408,81408
unique,67997,4
top,$$$=&gt;&gt; #peace #love #freedom\n#fight for...,figurative
freq,14,21238


# Text Pre-processing 

In [None]:
def remove_pattern(text, pattern_regex):
    r = re.findall(pattern_regex, text)
    for i in r:
        text = re.sub(i, '', text)
    
    return text

In [None]:
# We are keeping cleaned tweets in a new column called 'new_tweets'
twitter['new_Tweets'] = np.vectorize(remove_pattern)(twitter['tweets'], "@[\w]*")
twitter.head(10)

Unnamed: 0,tweets,class,new_Tweets
0,Be aware dirty step to get money #staylight ...,figurative,Be aware dirty step to get money #staylight ...
1,#sarcasm for #people who don't understand #diy...,figurative,#sarcasm for #people who don't understand #diy...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,#DailyMail readers being sensible as always ...
3,@wilw Why do I get the feeling you like games?...,figurative,Why do I get the feeling you like games? #sar...
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,- You probably just missed the text. #sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,Tune in to Nigezie and be treated to Rachel Pl...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,What iz thiz?!?!? A friggin DC love fest??!?!?...
7,"man, i wish i could sexually harass an intoxic...",figurative,"man, i wish i could sexually harass an intoxic..."
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when I get to listen to you ...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,Aamir calls #BajrangiBhaijaan as Salman's best...


## Removing links (http | https)

In [None]:
cleaned_tweets = []

for index, row in twitter.iterrows():
    # Here we are filtering out all the words that contains link
    words_without_links = [word for word in row.new_Tweets.split() if 'http' not in word]
    cleaned_tweets.append(' '.join(words_without_links))

twitter['new_Tweets'] = cleaned_tweets
twitter.head(10)

Unnamed: 0,tweets,class,new_Tweets
0,Be aware dirty step to get money #staylight ...,figurative,Be aware dirty step to get money #staylight #s...
1,#sarcasm for #people who don't understand #diy...,figurative,#sarcasm for #people who don't understand #diy...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,#DailyMail readers being sensible as always #s...
3,@wilw Why do I get the feeling you like games?...,figurative,Why do I get the feeling you like games? #sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,- You probably just missed the text. #sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,Tune in to Nigezie and be treated to Rachel Pl...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,What iz thiz?!?!? A friggin DC love fest??!?!?...
7,"man, i wish i could sexually harass an intoxic...",figurative,"man, i wish i could sexually harass an intoxic..."
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when I get to listen to you a...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,Aamir calls #BajrangiBhaijaan as Salman's best...


Removing tweets with empty text

In [None]:
tweets = twitter[twitter['new_Tweets']!='']
tweets.head(10)

Unnamed: 0,tweets,class,new_Tweets
0,Be aware dirty step to get money #staylight ...,figurative,Be aware dirty step to get money #staylight #s...
1,#sarcasm for #people who don't understand #diy...,figurative,#sarcasm for #people who don't understand #diy...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,#DailyMail readers being sensible as always #s...
3,@wilw Why do I get the feeling you like games?...,figurative,Why do I get the feeling you like games? #sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,- You probably just missed the text. #sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,Tune in to Nigezie and be treated to Rachel Pl...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,What iz thiz?!?!? A friggin DC love fest??!?!?...
7,"man, i wish i could sexually harass an intoxic...",figurative,"man, i wish i could sexually harass an intoxic..."
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when I get to listen to you a...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,Aamir calls #BajrangiBhaijaan as Salman's best...


Dropping duplicate rows

In [None]:
tweets.drop_duplicates(subset=['new_Tweets'], keep=False)
tweets.head(10)

Unnamed: 0,tweets,class,new_Tweets
0,Be aware dirty step to get money #staylight ...,figurative,Be aware dirty step to get money #staylight #s...
1,#sarcasm for #people who don't understand #diy...,figurative,#sarcasm for #people who don't understand #diy...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,#DailyMail readers being sensible as always #s...
3,@wilw Why do I get the feeling you like games?...,figurative,Why do I get the feeling you like games? #sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,- You probably just missed the text. #sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,Tune in to Nigezie and be treated to Rachel Pl...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,What iz thiz?!?!? A friggin DC love fest??!?!?...
7,"man, i wish i could sexually harass an intoxic...",figurative,"man, i wish i could sexually harass an intoxic..."
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when I get to listen to you a...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,Aamir calls #BajrangiBhaijaan as Salman's best...


Resetting index

It seems that our index needs to be reset, since after removal of some rows, some index values are missing, which may cause problem in future operations.

In [None]:
tweets = tweets.reset_index(drop=True)
tweets.head(10)

Unnamed: 0,tweets,class,new_Tweets
0,Be aware dirty step to get money #staylight ...,figurative,Be aware dirty step to get money #staylight #s...
1,#sarcasm for #people who don't understand #diy...,figurative,#sarcasm for #people who don't understand #diy...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,#DailyMail readers being sensible as always #s...
3,@wilw Why do I get the feeling you like games?...,figurative,Why do I get the feeling you like games? #sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,- You probably just missed the text. #sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,Tune in to Nigezie and be treated to Rachel Pl...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,What iz thiz?!?!? A friggin DC love fest??!?!?...
7,"man, i wish i could sexually harass an intoxic...",figurative,"man, i wish i could sexually harass an intoxic..."
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when I get to listen to you a...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,Aamir calls #BajrangiBhaijaan as Salman's best...


In [None]:
import string

Removing Punctuations, Numbers and Special characters



In [None]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation.replace('#', '')), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [None]:
tweets['new_Tweets'] = tweets['new_Tweets'].apply(lambda x: clean_text(x))
tweets.head(10)

Unnamed: 0,tweets,class,new_Tweets
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight sta...
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,dailymail readers being sensible as always sho...
3,@wilw Why do I get the feeling you like games?...,figurative,why do i get the feeling you like games sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,you probably just missed the text sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,tune in to nigezie and be treated to rachel pl...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,what iz thiz a friggin dc love fest sarcasm mo...
7,"man, i wish i could sexually harass an intoxic...",figurative,man i wish i could sexually harass an intoxica...
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when i get to listen to you a...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,aamir calls bajrangibhaijaan as salmans best m...


In [None]:
tweets['new_Tweets'] = tweets['new_Tweets'].apply(lambda x: clean_text(x))
tweets.head(10)

Unnamed: 0,tweets,class,new_Tweets
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight sta...
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,dailymail readers being sensible as always sho...
3,@wilw Why do I get the feeling you like games?...,figurative,why do i get the feeling you like games sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,you probably just missed the text sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,tune in to nigezie and be treated to rachel pl...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,what iz thiz a friggin dc love fest sarcasm mo...
7,"man, i wish i could sexually harass an intoxic...",figurative,man i wish i could sexually harass an intoxica...
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when i get to listen to you a...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,aamir calls bajrangibhaijaan as salmans best m...


Function to remove emoji

In [None]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [None]:
tweets['new_Tweets']=tweets['new_Tweets'].apply(lambda x: remove_emoji(x))
tweets.head(10)

Unnamed: 0,tweets,class,new_Tweets
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight sta...
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,dailymail readers being sensible as always sho...
3,@wilw Why do I get the feeling you like games?...,figurative,why do i get the feeling you like games sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,you probably just missed the text sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,tune in to nigezie and be treated to rachel pl...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,what iz thiz a friggin dc love fest sarcasm mo...
7,"man, i wish i could sexually harass an intoxic...",figurative,man i wish i could sexually harass an intoxica...
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when i get to listen to you a...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,aamir calls bajrangibhaijaan as salmans best m...


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
tweets['tokenized_tweets'] = tweets['new_Tweets'].apply(lambda x: nltk.word_tokenize(x))
tweets.head(10)

Unnamed: 0,tweets,class,new_Tweets,tokenized_tweets
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight sta...,"[be, aware, dirty, step, to, get, money, stayl..."
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...,"[sarcasm, for, people, who, dont, understand, ..."
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,dailymail readers being sensible as always sho...,"[dailymail, readers, being, sensible, as, alwa..."
3,@wilw Why do I get the feeling you like games?...,figurative,why do i get the feeling you like games sarcasm,"[why, do, i, get, the, feeling, you, like, gam..."
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,you probably just missed the text sarcastic,"[you, probably, just, missed, the, text, sarca..."
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,tune in to nigezie and be treated to rachel pl...,"[tune, in, to, nigezie, and, be, treated, to, ..."
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,what iz thiz a friggin dc love fest sarcasm mo...,"[what, iz, thiz, a, friggin, dc, love, fest, s..."
7,"man, i wish i could sexually harass an intoxic...",figurative,man i wish i could sexually harass an intoxica...,"[man, i, wish, i, could, sexually, harass, an,..."
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when i get to listen to you a...,"[for, the, car, ride, when, i, get, to, listen..."
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,aamir calls bajrangibhaijaan as salmans best m...,"[aamir, calls, bajrangibhaijaan, as, salmans, ..."


In [None]:
tweets.drop(['tokenized_tweets'],axis=1,inplace=True)

Removing Stop words

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
my_stop_words=stopwords.words('english')
#sw_list = ['i’m','\x92','rt','ye','yeah','haha','Yes','U0001F923','I','U0001F3B6','U0001F5A4']
#my_stop_words.extend(sw_list)
stopwords_set = set(my_stop_words)
cleaned_tweets = []

for index, row in tweets.iterrows():
    
    # filerting out all the stopwords 
    words_without_stopwords = [word for word in row.new_Tweets.split() if not word in stopwords_set and '#' not in word.lower()]
    
    # finally creating tweets list of tuples containing stopwords(list) and sentimentType 
    cleaned_tweets.append(' '.join(words_without_stopwords))
    
tweets['absolute_new_tweets'] = cleaned_tweets
tweets.head(10)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,tweets,class,new_Tweets,absolute_new_tweets
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight sta...,aware dirty step get money staylight staywhite...
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...,sarcasm people dont understand diy artattack
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,dailymail readers being sensible as always sho...,dailymail readers sensible always shocker sarc...
3,@wilw Why do I get the feeling you like games?...,figurative,why do i get the feeling you like games sarcasm,get feeling like games sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,you probably just missed the text sarcastic,probably missed text sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,tune in to nigezie and be treated to rachel pl...,tune nigezie treated rachel plattens fight son...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,what iz thiz a friggin dc love fest sarcasm mo...,iz thiz friggin dc love fest sarcasm moviefights
7,"man, i wish i could sexually harass an intoxic...",figurative,man i wish i could sexually harass an intoxica...,man wish could sexually harass intoxicated min...
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when i get to listen to you a...,car ride get listen jess whole time yeah woo c...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,aamir calls bajrangibhaijaan as salmans best m...,aamir calls bajrangibhaijaan salmans best movi...


Tokenize 'absolute_new_tweets'

In [None]:
tokenized_tweet = tweets['absolute_new_tweets'].apply(lambda x: x.split())
tokenized_tweet.head(10)

0    [aware, dirty, step, get, money, staylight, st...
1    [sarcasm, people, dont, understand, diy, artat...
2    [dailymail, readers, sensible, always, shocker...
3                 [get, feeling, like, games, sarcasm]
4                  [probably, missed, text, sarcastic]
5    [tune, nigezie, treated, rachel, plattens, fig...
6    [iz, thiz, friggin, dc, love, fest, sarcasm, m...
7    [man, wish, could, sexually, harass, intoxicat...
8    [car, ride, get, listen, jess, whole, time, ye...
9    [aamir, calls, bajrangibhaijaan, salmans, best...
Name: absolute_new_tweets, dtype: object

Converting words to Lemma

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
word_lemmatizer = WordNetLemmatizer()


tokenized_tweet = tokenized_tweet.apply(lambda x: [word_lemmatizer.lemmatize(i) for i in x])
tokenized_tweet.head(10)

0    [aware, dirty, step, get, money, staylight, st...
1    [sarcasm, people, dont, understand, diy, artat...
2    [dailymail, reader, sensible, always, shocker,...
3                  [get, feeling, like, game, sarcasm]
4                  [probably, missed, text, sarcastic]
5    [tune, nigezie, treated, rachel, plattens, fig...
6    [iz, thiz, friggin, dc, love, fest, sarcasm, m...
7    [man, wish, could, sexually, harass, intoxicat...
8    [car, ride, get, listen, jess, whole, time, ye...
9    [aamir, call, bajrangibhaijaan, salmans, best,...
Name: absolute_new_tweets, dtype: object

Joining all tokens into sentences

In [None]:
for i, tokens in enumerate(tokenized_tweet):
    tokenized_tweet[i] = ' '.join(tokens)

tweets['absolute_new_tweets'] = tokenized_tweet
tweets.head(10)

Unnamed: 0,tweets,class,new_Tweets,absolute_new_tweets
0,Be aware dirty step to get money #staylight ...,figurative,be aware dirty step to get money staylight sta...,aware dirty step get money staylight staywhite...
1,#sarcasm for #people who don't understand #diy...,figurative,sarcasm for people who dont understand diy art...,sarcasm people dont understand diy artattack
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative,dailymail readers being sensible as always sho...,dailymail reader sensible always shocker sarca...
3,@wilw Why do I get the feeling you like games?...,figurative,why do i get the feeling you like games sarcasm,get feeling like game sarcasm
4,-@TeacherArthurG @rweingarten You probably jus...,figurative,you probably just missed the text sarcastic,probably missed text sarcastic
5,Tune in to Nigezie and be treated to Rachel Pl...,figurative,tune in to nigezie and be treated to rachel pl...,tune nigezie treated rachel plattens fight son...
6,What iz thiz?!?!? A friggin DC love fest??!?!?...,figurative,what iz thiz a friggin dc love fest sarcasm mo...,iz thiz friggin dc love fest sarcasm moviefights
7,"man, i wish i could sexually harass an intoxic...",figurative,man i wish i could sexually harass an intoxica...,man wish could sexually harass intoxicated min...
8,@raaachf for the car ride when I get to listen...,figurative,for the car ride when i get to listen to you a...,car ride get listen jess whole time yeah woo c...
9,Aamir calls #BajrangiBhaijaan as Salman's best...,figurative,aamir calls bajrangibhaijaan as salmans best m...,aamir call bajrangibhaijaan salmans best movie...


In [None]:
df=tweets[['absolute_new_tweets','class']]

In [None]:
df

Unnamed: 0,absolute_new_tweets,class
0,aware dirty step get money staylight staywhite...,figurative
1,sarcasm people dont understand diy artattack,figurative
2,dailymail reader sensible always shocker sarca...,figurative
3,get feeling like game sarcasm,figurative
4,probably missed text sarcastic,figurative
...,...,...
81398,photo image via heart childhood cool funny sar...,sarcasm
81399,never knewi better put universe lolmaybe there...,sarcasm
81400,hey wanted say thanks puberty letting apart it...,sarcasm
81401,im sure coverage like fox news special “the hi...,sarcasm


In [None]:
df = df.rename(columns = {"absolute_new_tweets": "tweets"})

In [None]:
df

Unnamed: 0,tweets,class
0,aware dirty step get money staylight staywhite...,figurative
1,sarcasm people dont understand diy artattack,figurative
2,dailymail reader sensible always shocker sarca...,figurative
3,get feeling like game sarcasm,figurative
4,probably missed text sarcastic,figurative
...,...,...
81398,photo image via heart childhood cool funny sar...,sarcasm
81399,never knewi better put universe lolmaybe there...,sarcasm
81400,hey wanted say thanks puberty letting apart it...,sarcasm
81401,im sure coverage like fox news special “the hi...,sarcasm


In [None]:
df['class'].unique()

array(['figurative', 'irony', 'regular', 'sarcasm'], dtype=object)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81403 entries, 0 to 81402
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  81403 non-null  object
 1   class   81403 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [None]:
# Separating Figurative classified data
tweets_figurative=df[df['class']=='figurative']
figurative = " ".join(tweets_figurative['tweets'].fillna('').astype(str))

In [None]:
figurative



In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize

# Tokenize the document into words
words = word_tokenize(figurative)


In [None]:
words

['aware',
 'dirty',
 'step',
 'get',
 'money',
 'staylight',
 'staywhite',
 'sarcastic',
 'moralneeded',
 '…',
 'sarcasm',
 'people',
 'dont',
 'understand',
 'diy',
 'artattack',
 'dailymail',
 'reader',
 'sensible',
 'always',
 'shocker',
 'sarcastic',
 'dailyfail',
 'inhuntspocket',
 'theyhatethenhs',
 'get',
 'feeling',
 'like',
 'game',
 'sarcasm',
 'probably',
 'missed',
 'text',
 'sarcastic',
 'tune',
 'nigezie',
 'treated',
 'rachel',
 'plattens',
 'fight',
 'song',
 'move',
 'mtv',
 'get',
 'reminisce',
 'amp',
 'olamides',
 'local',
 'rapper',
 'irony',
 'timeschange',
 'iz',
 'thiz',
 'friggin',
 'dc',
 'love',
 'fest',
 'sarcasm',
 'moviefights',
 'man',
 'wish',
 'could',
 'sexually',
 'harass',
 'intoxicated',
 'minor',
 'go',
 'jail',
 'sarcasm',
 'car',
 'ride',
 'get',
 'listen',
 'jess',
 'whole',
 'time',
 'yeah',
 'woo',
 'cant',
 'wait',
 'either',
 'sarcasm',
 'aamir',
 'call',
 'bajrangibhaijaan',
 'salmans',
 'best',
 'movie',
 'yet',
 'decide',
 'whether',
 'sa

In [None]:
 # Create a dataframe from the list of words
df1 = pd.DataFrame(words, columns=['Word'])

In [None]:
df1.head()

Unnamed: 0,Word
0,aware
1,dirty
2,step
3,get
4,money


In [None]:
freq_figurative = pd.Series(' '.join(df1['Word']).split()).value_counts()

In [None]:
freq_figurative

sarcasm               10207
irony                  7977
ironic                 2904
im                     1061
like                    996
                      ...  
fcblive                   1
fcbathletic               1
pique                     1
wherearetheparents        1
quinlan                   1
Length: 27964, dtype: int64

In [None]:
freq_top_100_figurative=freq_figurative[0:100]

In [None]:
freq_top_100_figurative

sarcasm     10207
irony        7977
ironic       2904
im           1061
like          996
            ...  
made          202
play          201
bad           194
watching      193
said          193
Length: 100, dtype: int64

In [None]:
# Save as dataframe freq_figurative and freq_top_100_figurative
freq_top_100_figurative.to_csv(r'C:\Users\Vikram\Desktop\Excelr Project\freq_top_100_figurative.csv')

In [None]:
freq_figurative.to_csv(r'C:\Users\Vikram\Desktop\Excelr Project\freq_figurative.csv')

In [None]:
df_figurative=freq_figurative.reset_index()
df_figurative.columns=['figurative_words','figurative_count']

In [None]:
df_figurative

Unnamed: 0,figurative_words,figurative_count
0,sarcasm,10207
1,irony,7977
2,ironic,2904
3,im,1061
4,like,996
...,...,...
27959,fcblive,1
27960,fcbathletic,1
27961,pique,1
27962,wherearetheparents,1


In [None]:
df_figurative

Unnamed: 0,figurative_words,figurative_count
0,sarcasm,10207
1,irony,7977
2,ironic,2904
3,im,1061
4,like,996
...,...,...
27959,fcblive,1
27960,fcbathletic,1
27961,pique,1
27962,wherearetheparents,1


In [None]:
df_figurative

Unnamed: 0,figurative_words,figurative_count
0,sarcasm,10207
1,irony,7977
2,ironic,2904
3,im,1061
4,like,996
...,...,...
27959,fcblive,1
27960,fcbathletic,1
27961,pique,1
27962,wherearetheparents,1


In [None]:
# Separating Irony classified data
tweets_irony=df[df['class']=='irony']
irony = " ".join(tweets_irony['tweets'].fillna('').astype(str))

words_irony = word_tokenize(irony)
df2 = pd.DataFrame(words_irony, columns=['Word'])
freq_irony = pd.Series(' '.join(df2['Word']).split()).value_counts()
df_irony=freq_irony.reset_index()
df_irony.columns=['irony_words','irony_count']
df_irony

Unnamed: 0,irony_words,irony_count
0,irony,16151
1,ironic,5324
2,people,1118
3,amp,983
4,like,898
...,...,...
28917,elaineisattentionseeking,1
28918,youvechanged,1
28919,matas,1
28920,thicker,1


In [None]:
# Separating Regular classified data
tweets_regular=df[df['class']=='regular']
regular = " ".join(tweets_regular['tweets'].fillna('').astype(str))

words_regular = word_tokenize(regular)
df3 = pd.DataFrame(words_regular, columns=['Word'])
freq_regular = pd.Series(' '.join(df3['Word']).split()).value_counts()
df_regular=freq_regular.reset_index()
df_regular.columns=['regular_words','regular_count']
df_regular

Unnamed: 0,regular_words,regular_count
0,peace,2973
1,news,2933
2,drug,2813
3,education,2764
4,late,2754
...,...,...
26875,sorrowful,1
26876,medicaldevices,1
26877,industryfunded,1
26878,ludwig,1


In [None]:
# Separating Sarcasm classified data
tweets_sarcasm=df[df['class']=='sarcasm']
sarcasm = " ".join(tweets_sarcasm['tweets'].fillna('').astype(str))

words_sarcasm = word_tokenize(sarcasm)
df4 = pd.DataFrame(words_sarcasm, columns=['Word'])
freq_sarcasm = pd.Series(' '.join(df4['Word']).split()).value_counts()
df_sarcasm=freq_sarcasm.reset_index()
df_sarcasm.columns=['sarcasm_words','sarcasm_count']
df_sarcasm

Unnamed: 0,sarcasm_words,sarcasm_count
0,sarcasm,19770
1,sarcastic,1244
2,im,1209
3,love,1207
4,like,1091
...,...,...
24959,reactivating,1
24960,whatyearisit,1
24961,beliebers,1
24962,sheknowsheaintright,1


In [None]:
merged_df = pd.merge(df_figurative, df_irony, left_on='figurative_words', right_on='irony_words', how='outer')

In [None]:
merged_df = pd.merge(merged_df, df_regular, left_on='figurative_words', right_on='regular_words', how='outer')
merged_df = pd.merge(merged_df, df_sarcasm, left_on='figurative_words', right_on='sarcasm_words', how='outer')

In [None]:
merged_df

Unnamed: 0,figurative_words,figurative_count,irony_words,irony_count,regular_words,regular_count,sarcasm_words,sarcasm_count
0,sarcasm,10207.0,sarcasm,12.0,,,sarcasm,19770.0
1,irony,7977.0,irony,16151.0,irony,6.0,irony,9.0
2,ironic,2904.0,ironic,5324.0,ironic,2.0,ironic,7.0
3,im,1061.0,im,845.0,im,301.0,im,1209.0
4,like,996.0,like,898.0,like,417.0,like,1091.0
...,...,...,...,...,...,...,...,...
65260,,,,,,,hallelujahgibbs,1.0
65261,,,,,,,whatyearisit,1.0
65262,,,,,,,beliebers,1.0
65263,,,,,,,sheknowsheaintright,1.0


In [None]:
merged_df.to_csv(r'C:\Users\Vikram\Desktop\Excelr Project\merged_df.csv')

In [None]:

df_filled = merged_df.copy()


In [None]:
df_filled

Unnamed: 0,figurative_words,figurative_count,irony_words,irony_count,regular_words,regular_count,sarcasm_words,sarcasm_count
0,sarcasm,10207.0,sarcasm,12.0,,,sarcasm,19770.0
1,irony,7977.0,irony,16151.0,irony,6.0,irony,9.0
2,ironic,2904.0,ironic,5324.0,ironic,2.0,ironic,7.0
3,im,1061.0,im,845.0,im,301.0,im,1209.0
4,like,996.0,like,898.0,like,417.0,like,1091.0
...,...,...,...,...,...,...,...,...
65260,,,,,,,hallelujahgibbs,1.0
65261,,,,,,,whatyearisit,1.0
65262,,,,,,,beliebers,1.0
65263,,,,,,,sheknowsheaintright,1.0


df_filled.to_csv(r'C:\Users\aravi\OneDrive\Desktop\NLP Project\Word Analysis\df_merged.csv')

In [None]:
df_filled['word'] = pd.concat([df_filled['figurative_words'], df_filled['irony_words'], df_filled['regular_words'], df_filled['sarcasm_words']], ignore_index=True)

# Drop the original word columns
df_filled.drop(columns=['figurative_words', 'irony_words', 'regular_words', 'sarcasm_words'], inplace=True)
df_concat=df_filled.copy()
# Print the resulting DataFrame
print(df_concat)

       figurative_count  irony_count  regular_count  sarcasm_count     word
0               10207.0         12.0            NaN        19770.0  sarcasm
1                7977.0      16151.0            6.0            9.0    irony
2                2904.0       5324.0            2.0            7.0   ironic
3                1061.0        845.0          301.0         1209.0       im
4                 996.0        898.0          417.0         1091.0     like
...                 ...          ...            ...            ...      ...
65260               NaN          NaN            NaN            1.0      NaN
65261               NaN          NaN            NaN            1.0      NaN
65262               NaN          NaN            NaN            1.0      NaN
65263               NaN          NaN            NaN            1.0      NaN
65264               NaN          NaN            NaN            1.0      NaN

[65265 rows x 5 columns]


In [None]:
df_concat.to_csv(r'C:\Users\Vikram\Desktop\Excelr Project\df_concat.csv')

In [None]:
top_100_regular=df_regular.iloc[0:100, :]
top_100_figurative=df_figurative.iloc[0:100, : ]
top_100_irony=df_irony.iloc[0:100, : ]
top_100_sarcasm=df_sarcasm.iloc[0:100, : ]

In [None]:
top_100_figurative.info()
top_100_irony.info()
top_100_regular.info()
top_100_sarcasm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   figurative_words  100 non-null    object
 1   figurative_count  100 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   irony_words  100 non-null    object
 1   irony_count  100 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   regular_words  100 non-null    object
 1   regular_count  100 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ KB
<class 'pandas.core.f

In [None]:
merged_df_100 = pd.merge(top_100_figurative, top_100_irony, left_on='figurative_words', right_on='irony_words', how='outer')

In [None]:
merged_df_100 = pd.merge(merged_df_100, top_100_regular, left_on='figurative_words', right_on='regular_words', how='outer')
merged_df_100 = pd.merge(merged_df_100, top_100_sarcasm, left_on='figurative_words', right_on='sarcasm_words', how='outer')

In [None]:
merged_df_100

Unnamed: 0,figurative_words,figurative_count,irony_words,irony_count,regular_words,regular_count,sarcasm_words,sarcasm_count
0,sarcasm,10207.0,,,,,sarcasm,19770.0
1,irony,7977.0,irony,16151.0,,,,
2,ironic,2904.0,ironic,5324.0,,,,
3,im,1061.0,im,845.0,im,301.0,im,1209.0
4,like,996.0,like,898.0,like,417.0,like,1091.0
...,...,...,...,...,...,...,...,...
182,,,,,,,school,213.0
183,,,,,,,something,212.0
184,,,,,,,everyone,210.0
185,,,,,,,na,204.0


In [None]:
merged_df_100.to_csv(r'C:\Users\Vikram\Desktop\Excelr Project\merged_df_100.csv')

In [None]:
# Specify columns and their respective fill values
fill_values = {'figurative_words': '-', 'figurative_count': 0, 'irony_words': '-', 'irony_count' : 0, 'regular_words': '-', 
               'regular_count': 0, 'sarcasm_words': '-', 'sarcasm_count':0 }

# Fill specific columns with the specified fill values
df_filled_100 = merged_df_100.fillna(value=fill_values)


In [None]:
df_filled_100

Unnamed: 0,figurative_words,figurative_count,irony_words,irony_count,regular_words,regular_count,sarcasm_words,sarcasm_count
0,sarcasm,10207.0,-,0.0,-,0.0,sarcasm,19770.0
1,irony,7977.0,irony,16151.0,-,0.0,-,0.0
2,ironic,2904.0,ironic,5324.0,-,0.0,-,0.0
3,im,1061.0,im,845.0,im,301.0,im,1209.0
4,like,996.0,like,898.0,like,417.0,like,1091.0
...,...,...,...,...,...,...,...,...
182,-,0.0,-,0.0,-,0.0,school,213.0
183,-,0.0,-,0.0,-,0.0,something,212.0
184,-,0.0,-,0.0,-,0.0,everyone,210.0
185,-,0.0,-,0.0,-,0.0,na,204.0


In [None]:
df_filled_100.to_csv(r'C:\Users\Vikram\Desktop\Excelr Project\df_filled_100.csv')

CountVectorizer with Bigrams & Trigrams

In [None]:
from nltk.corpus import stopwords
ps = PorterStemmer()
corpus = []
for i in tqdm_notebook(range(0, len(df))):
    review = re.sub('[^a-zA-Z]', ' ', df['tweets'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

  0%|          | 0/81403 [00:00<?, ?it/s]

In [None]:
corpus[3]

'get feel like game sarcasm'

In [None]:
## Applying Countvectorizer
# Creating the Bag of Words model
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
X = cv.fit_transform(corpus).toarray()

In [None]:
X.shape

(81403, 5000)

In [None]:
cv.get_feature_names_out()[:20]

array(['aa', 'ab', 'abandon', 'abbott', 'abc', 'abil', 'abl', 'abort',
       'absolut', 'absolut love', 'abt', 'abus', 'ac', 'academ', 'accept',
       'access', 'accessori', 'accessori shop', 'accid', 'accident'],
      dtype=object)

In [None]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [None]:
count_df = pd.DataFrame(X, columns=cv.get_feature_names_out())
count_df

Unnamed: 0,aa,ab,abandon,abbott,abc,abil,abl,abort,absolut,absolut love,...,yr,yr old,yummi,yummi drug,yummi drug pain,yup,zazzl,zero,zombi,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
tweets=[Text.strip() for Text in df.tweets]
tweets=[Text for Text in df if Text] 
tweets[0:10]

['tweets', 'class']

In [None]:
pip install --upgrade scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
tweetscv=cv.fit_transform(words)

In [None]:
print(cv.get_feature_names_out())

['aa' 'aaa' 'aaaaaand' ... 'जल' 'नब' 'मह']
