In [57]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize

In [58]:
import My_Preprocessing as prep_funct

#prep_funct.remove_Sarcasm_hashtag()

# Load the corpus stored in "cleaned_#sarcasm.json" and report the non-null
# count of every column.
# NOTE(review): presumably this file is what prep_funct.remove_Sarcasm_hashtag()
# writes — confirm.
dataset = pd.read_json("cleaned_#sarcasm.json")
print(dataset.count())

# Tally how many rows carry each value of the label column.
sarcastic_counts = dataset['isSarcastic'].value_counts()

# A single print("\n") emits two newlines: the spacer between the two reports.
print("\n")

# Display the result
print(
    "Number of rows for each value in the 'isSarcastic' column:",
    sarcastic_counts,
    sep="\n",
)

isSarcastic    39780
text           39780
dtype: int64


Number of rows for each value in the 'isSarcastic' column:
isSarcastic
0    21292
1    18488
Name: count, dtype: int64


In [47]:
# Preprocessing: undersampling.
# The recorded output below shows both classes at 18488 rows afterwards.
# NOTE(review): `dataset` is overwritten here, so the unsampled frame is only
# recoverable by re-running the loading cell.
dataset = prep_funct.undersampling(dataset)

# Per-class row counts after resampling.
class_counts = dataset['isSarcastic'].value_counts()
print("Number of rows for each class:", class_counts, sep="\n")

Number of rows for each class:
isSarcastic
0    18488
1    18488
Name: count, dtype: int64


In [59]:
# Preprocessing: random oversampling.
# The recorded output below shows both classes at 21292 rows afterwards.
# NOTE(review): this overwrites `dataset` just like the undersampling cell, so
# the result depends on which resampling cells were run before it.
dataset = prep_funct.random_oversampling(dataset)

# Per-class row counts after resampling.
class_counts = dataset['isSarcastic'].value_counts()
print("Number of rows for each class:", class_counts, sep="\n")

Number of rows for each class:
isSarcastic
0    21292
1    21292
Name: count, dtype: int64


In [48]:
import pandas as pd
# Shuffle every row with a fixed seed, then keep only the first half.
# NOTE(review): `dataset` is replaced by its own half, so running this cell a
# second time halves the data again.
dataset_shuffled = dataset.sample(frac=1, random_state=42)

data_sample = len(dataset) // 2
dataset = dataset_shuffled.head(data_sample)

# Report how many rows survived the cut.
total_rows = dataset.shape[0]
print(total_rows)

18488


In [49]:
# Recount the labels; the recorded output shows the halved sample is no longer
# exactly balanced (9309 vs 9179).
class_counts = dataset['isSarcastic'].value_counts()

# Display the number of rows for each class
print("Number of rows for each class:", class_counts, sep="\n")

Number of rows for each class:
isSarcastic
0    9309
1    9179
Name: count, dtype: int64


In [50]:
# Plain-text preview of the first ten and the last ten rows.
print(dataset.head(10), dataset.tail(10), sep="\n")

       isSarcastic                                               text
2649             1  Don't get me wrong I'm not upset about my hair...
465              0                  @sami_antha omg that's attractive
34822            0  i cant believe tomorrow is my last day off wor...
17786            0  Your sapiosexual girlfriend gets angry when yo...
8687             1  The trick to farting in an elevator is wearing...
15731            1  Can't wait to go to work and cough all over su...
21915            1  my mom is literally so scared of waking me up ...
12079            0  Camouflage Elastic Band Rain Sun Umbrella Hat ...
19240            1  The day after thanks giving, parents are right...
9985             0  My mum and dad got one of they tassimo coffee ...
       isSarcastic                                               text
23687            0  @CBSNews @mike_pence @realDonaldTrump It will ...
34519            0  But it was Saturday night, I guess that it mak...
9229             0  

In [33]:
#############################################################
#preprocessing

In [60]:
import text_mining_utils as tmu

In [61]:
# Basic cleaning (lab 7): regex pattern -> replacement pairs handed to
# tmu.clean_doc for every document.
# NOTE(review): if tmu.clean_doc applies the pairs in dict order, the final
# r'\s{2,}' rule can never match after r'\s+' has run — confirm and drop it.
# NOTE(review): the cells that follow branch from `dataset` / `lower_docs`, so
# `clean_data` is not consumed by any later cell visible here.

clean_operations = {
r'(\(.+?\))+' : '', ## parenthetical notes are replaced by the empty string
r'(\[.+?\])+' : '', ## bracketed spans (e.g. numbered citations) are replaced by the empty string
r'\s+' : ' ', ## any run of whitespace is replaced with a single space
r'\s{2,}' : ' ', ## two or more consecutive whitespace characters are replaced with a single space
}

# Work on a copy so `dataset` keeps the uncleaned text.
clean_data = dataset.copy()

clean_data.text = clean_data.text.apply(tmu.clean_doc, clean_operations=clean_operations)
clean_data.head()

Unnamed: 0,text,isSarcastic
0,@0430yes i hope youre lurking rn. i want to li...,0
1,05 really taught me a valuable lesson I'm neve...,0
2,"@098BERRY Never had a voice to protest, so you...",0
3,@0hMySt4rs Rest in peace & love to you and you...,0
4,100 days until Christmas! 🌲 #too soon #not rea...,0


In [62]:
# Lower-cased copy of the corpus; `dataset` itself is left untouched.
# The vectorised .str accessor is used instead of apply(str.lower): it gives the
# same result for strings, but a missing value stays missing instead of raising
# a TypeError part-way through the column.
lower_docs = dataset.copy()
lower_docs['text'] = lower_docs['text'].str.lower()

In [10]:
import nltk
import re

# Part-of-speech tags whose terms should be removed from every document.
# NOTE(review): 'NN' and 'VB' look like Penn Treebank tags, whereas 'ADJ' looks
# like a universal-tagset tag (Penn Treebank marks adjectives as 'JJ') — confirm
# which tagset tmu.remove_terms_by_POS compares against.
tags_to_remove = ['NN', 'VB', 'ADJ']

# Filter a copy so `lower_docs` stays available to the other experiments.
removeTags = lower_docs.copy()
removeTags['text'] = removeTags['text'].apply(tmu.remove_terms_by_POS, args=(tags_to_remove,))


In [63]:
# Expand English contractions in a copy of the lower-cased documents.
# NOTE(review): the frame is named `contradictions` although it holds the
# contraction-expanded text; the name is kept because later cells read it.
contradictions = lower_docs.copy()

# Hand-written contraction -> expansion lookup (no library was found for this).
# Keys are lower case, matching the lower-cased input text.
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'll": "that will",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why'll": "why will",
    "why's": "why is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

# Resolve contractions document by document; the lookup table is forwarded to
# the helper as its second positional argument.
contradictions['text'] = contradictions['text'].apply(
    tmu.resolve_contractions, args=(contractions_dict,)
)
contradictions.count()

text           42584
isSarcastic    42584
dtype: int64

In [15]:
from nltk.stem import PorterStemmer
# Stem every lower-cased document with one shared Porter stemmer instance;
# the result lives in a copy so `lower_docs` is unchanged.
stemmed_docs = lower_docs.copy()
stemmed_docs['text'] = lower_docs['text'].apply(tmu.stem_doc, stemmer=PorterStemmer())

In [71]:
# Run tmu.remove_d over each lower-cased document, working on a copy.
# NOTE(review): presumably remove_d strips digits, going by the variable name — verify.
digits_removed = lower_docs.copy()
digits_removed['text'] = digits_removed['text'].apply(tmu.remove_d)


In [86]:
# Filter tokens by length in a copy of the lower-cased documents.
# NOTE(review): whether n is an inclusive or exclusive bound is decided inside
# tmu.remove_by_token_len — confirm there.
token_len = lower_docs.copy()
n = 2
token_len['text'] = token_len['text'].apply(tmu.remove_by_token_len, args=(n,))

In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this cell relies on: the stop-word lists and the
# 'punkt' tokenizer data. These must run before the stop-word filter is used.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of 'punkt'
# — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

# Stop-word removal is layered on top of the stemmed documents.
stop_words_removed_dataset = stemmed_docs.copy()

# English stop words, loaded once. Rebuilding this set inside the function
# meant re-reading the corpus list for every single document.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split with NLTK's word_tokenize; a token is dropped when its
    lower-cased form is in NLTK's English stop-word list. The surviving tokens
    are joined back together with single spaces.

    Parameters
    ----------
    text : str
        The document to filter.

    Returns
    -------
    str
        The remaining tokens separated by single spaces (empty string if
        nothing remains).
    """
    return ' '.join(
        word
        for word in word_tokenize(text)
        if word.lower() not in _ENGLISH_STOP_WORDS
    )

# Filter stop words out of every stemmed document.
# NOTE(review): the input has already been Porter-stemmed; stemming can change
# stop words themselves (e.g. "this" may become "thi"), in which case they no
# longer match the stop-word list — verify the intended order of these steps.
stop_words_removed_dataset['text'] = stemmed_docs['text'].map(remove_stopwords)
stop_words_removed_dataset.count()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sarah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sarah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


text           21292
isSarcastic    21292
dtype: int64

In [64]:
# Replace emoji / emoticons in the contraction-expanded documents.
# The helper receives its own copy: previously a copy was made and immediately
# overwritten while the helper was handed `contradictions` itself, so any
# in-place change inside the helper would have leaked into that frame.
replace_emoji = prep_funct.replace_emoji_emoticons(contradictions.copy())

In [65]:

# Expand abbreviations in the emoji-replaced documents.
# The helper receives its own copy: previously a copy was made and immediately
# overwritten while the helper was handed `replace_emoji` itself, so any
# in-place change inside the helper would have leaked into that frame.
removed_abbreviations = prep_funct.replace_abbreviations(replace_emoji.copy())

In [42]:
import My_Preprocessing as prep_funct

In [14]:
# Variant of the emoji step that starts from the lower-cased documents.
# NOTE(review): this rebinds `replace_emoji`, which another cell derives from
# `contradictions`; the value seen downstream depends on which cell ran last.
# The helper receives its own copy so `lower_docs` cannot be modified through it
# (previously the copy was made and then discarded).
replace_emoji = prep_funct.replace_emoji_emoticons(lower_docs.copy())

In [15]:
import nltk
from nltk.corpus import wordnet
import pandas as pd

# Fetch the WordNet corpus needed for the synset lookups in this cell; when the
# corpus is already present NLTK only reports that it is up to date.
nltk.download('wordnet')

def create_replacement_dict_from_dataset(dataset):
    """Map every noun found in ``dataset['text']`` to its WordNet synonyms.

    Each document is tokenized and POS-tagged with NLTK. For every token whose
    tag starts with 'NN', the lemma names of all of its WordNet synsets are
    collected, with the token itself excluded.

    Each distinct token is looked up only once. The earlier version repeated
    the WordNet query for every occurrence and appended the same synonyms
    again, so frequent words cost repeated lookups and ended up with long
    lists of duplicates.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'text' column of strings.

    Returns
    -------
    dict[str, list[str]]
        Token -> alphabetically sorted list of unique synonym lemma names.
        Nouns with no other lemma in WordNet map to an empty list.
    """
    repl_dict = {}
    for text in dataset['text']:
        # Tokenize, then tag each token with its part of speech.
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

        for token, pos_tag in tagged_tokens:
            # Only nouns are considered, and only the first time they are seen:
            # the synonyms depend on the token alone, so a repeat adds nothing.
            if not pos_tag.startswith('NN') or token in repl_dict:
                continue

            synonyms = {
                lemma.name()
                for syn in wordnet.synsets(token)
                for lemma in syn.lemmas()
            }
            # A word is not its own replacement.
            synonyms.discard(token)
            # Sorted so the result is reproducible from run to run.
            repl_dict[token] = sorted(synonyms)
    return repl_dict

# Start the synonym experiment from a copy of the lower-cased documents.
replace_Synanims = lower_docs.copy()

# Build the noun -> synonyms lookup from that copy.
replacement_dict = create_replacement_dict_from_dataset(replace_Synanims)

# NOTE(review): this prints the entire dictionary, which can be very large.
print("Replacement Dictionary:", replacement_dict, sep="\n")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sarah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Replacement Dictionary:


In [16]:
# Rewrite every document with tmu.improve_bow and the replacement dictionary.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# no completed output exists for this step.
replace_Synanims['text'] = replace_Synanims['text'].apply(tmu.improve_bow, args=(replacement_dict,))

KeyboardInterrupt: 

In [10]:
###############################################################
#check model results
import text_mining_utils as tmu

In [66]:
# Count matrix built from the text of `removed_abbreviations`; preview the first five rows.
baseline_count_matrix = tmu.build_count_matrix(removed_abbreviations['text'])
baseline_count_matrix.head(5)

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌸,󾍀,󾍃,󾓤,󾬑,󾭞,󾭻,󾮗,󾮚,󾮟
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# TF matrix built from the text of `removed_abbreviations`; preview the first five rows.
baseline_tf_matrix = tmu.build_tf_matrix(removed_abbreviations['text'])
baseline_tf_matrix.head(5)

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌸,󾍀,󾍃,󾓤,󾬑,󾭞,󾭻,󾮗,󾮚,󾮟
0,0.034483,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0625,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.076923,0.0,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# TF-IDF matrix built from the text of `removed_abbreviations`; preview the first five rows.
baseline_tfidf_matrix = tmu.build_tfidf_matrix(removed_abbreviations['text'])
baseline_tfidf_matrix.head(5)

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌸,󾍀,󾍃,󾓤,󾬑,󾭞,󾭻,󾮗,󾮚,󾮟
0,0.088112,0.0,0.0,0.0,0.0,0.123217,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.132345,0.0,0.111503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.237745,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.141931,0.0,0.239159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with a fixed seed so repeated runs give the same tree.
dt_clf = DecisionTreeClassifier(random_state=1)

# Take the labels from the same frame the feature matrices were built from.
# They used to come from `contradictions`, two preprocessing steps upstream;
# that only lines up with the features if neither later step drops or reorders
# rows.
y = removed_abbreviations.isSarcastic

# Report on the count matrix; swap in one of the commented lines to evaluate
# the TF or TF-IDF representation instead.
tmu.printClassifReport(dt_clf, baseline_count_matrix, y)
#tmu.printClassifReport(dt_clf, baseline_tf_matrix, y)
#tmu.printClassifReport(dt_clf, baseline_tfidf_matrix, y)

              precision    recall  f1-score   support

           0       0.74      0.70      0.72     21292
           1       0.72      0.75      0.73     21292

    accuracy                           0.73     42584
   macro avg       0.73      0.73      0.73     42584
weighted avg       0.73      0.73      0.73     42584



In [12]:

# Compare the decision tree across the three representations; each matrix is
# paired with the label at the same position in the list that follows it.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
tmu.plot_avg_performance_for_3matrices(
    dt_clf,
    "Decision Tree",
    [baseline_count_matrix, baseline_tf_matrix, baseline_tfidf_matrix],
    ["Count", "TF", "TFIDF"],
    y,
)

KeyboardInterrupt: 