In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
import My_Preprocessing as prep_funct
import text_mining_utils as tmu

In [3]:
#prep_funct.remove_Sarcasm_hashtag()

dataset = pd.read_json("cleaned_#sarcasm.json")

print(dataset.count())

sarcastic_counts = dataset['isSarcastic'].value_counts()

print()
print()

# Display result
print("Number of rows for each value in the 'isSarcastic' column:")
print(sarcastic_counts)

isSarcastic    39780
text           39780
dtype: int64


Number of rows for each value in the 'isSarcastic' column:
isSarcastic
0    21292
1    18488
Name: count, dtype: int64


In [4]:
dataset = prep_funct.random_oversampling(dataset)

class_counts = dataset['isSarcastic'].value_counts()

print("Number of rows for each class:")
print(class_counts)

Number of rows for each class:
isSarcastic
0    21292
1    21292
Name: count, dtype: int64


In [5]:
# Shuffle the dataset
dataset_shuffled = dataset.sample(frac=1, random_state=42)

data_sample = len(dataset) // 2

dataset = dataset_shuffled.iloc[:data_sample]


total_rows = len(dataset)
print(total_rows)

21292


In [5]:

clean_operations = {
r'(\(.+?\))+' : '', ## paranthetical notes to be replaced by empty string
r'(\[.+?\])+' : '', ## numbered citations to be replaced by empty string
r'\s+' : ' ', ## any type of white space to be replaced with a single white space
r'\s{2,}' : ' ', ## 2 or more consecutive white spaces to be replaced with a single white spac
}

clean_data = dataset.copy()

clean_data.text = clean_data.text.apply(tmu.clean_doc, clean_operations=clean_operations)
clean_data.head()

Unnamed: 0,text,isSarcastic
0,@0430yes i hope youre lurking rn. i want to li...,0
1,05 really taught me a valuable lesson I'm neve...,0
2,"@098BERRY Never had a voice to protest, so you...",0
3,@0hMySt4rs Rest in peace & love to you and you...,0
4,100 days until Christmas! 🌲 #too soon #not rea...,0


In [6]:
lower_docs = clean_data.copy()
lower_docs.text = lower_docs.text.apply(str.lower)

In [7]:
replace_emoji = lower_docs.copy()
replace_emoji = prep_funct.replace_emoji_emoticons(lower_docs)

In [8]:
removed_abbreviations = replace_emoji.copy()
removed_abbreviations = prep_funct.replace_abbreviations(replace_emoji)

In [53]:
contradictions = removed_abbreviations.copy()

contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'll": "that will",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why'll": "why will",
    "why's": "why is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

doc = list(contradictions.text)
docs = ' '.join(doc)

# fix contractions in text
contradictions['text'] = contradictions['text'].apply(lambda x: tmu.resolve_contractions(x, contractions_dict))

In [9]:
replace_user_mentions = removed_abbreviations.copy
replace_user_mentions = prep_funct.replace_user_mentions(removed_abbreviations)

In [9]:
baseline_count_matrix = tmu.build_count_matrix(replace_user_mentions.text)
baseline_count_matrix.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌸,󾍀,󾍃,󾓤,󾬑,󾭞,󾭻,󾮗,󾮚,󾮟
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
baseline_tf_matrix = tmu.build_tf_matrix(replace_user_mentions.text)
baseline_tf_matrix.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌴,󾌵,󾌸,󾍀,󾍃,󾓤,󾭞,󾭻,󾮗,󾮚
0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.05,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
baseline_tfidf_matrix = tmu.build_tfidf_matrix(replace_user_mentions.text)
baseline_tfidf_matrix.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌴,󾌵,󾌸,󾍀,󾍃,󾓤,󾭞,󾭻,󾮗,󾮚
0,0.0,0.0,0.254885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.119172,0.0,0.0,0.0,0.0,0.166887,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=1)

y = replace_user_mentions.isSarcastic

tmu.printClassifReport(dt_clf, baseline_count_matrix, y)
#tmu.printClassifReport(dt_clf, baseline_tf_matrix, y)
#tmu.printClassifReport(dt_clf, baseline_tfidf_matrix, y)

              precision    recall  f1-score   support

           0       0.75      0.71      0.73     21292
           1       0.72      0.76      0.74     21292

    accuracy                           0.73     42584
   macro avg       0.74      0.73      0.73     42584
weighted avg       0.74      0.73      0.73     42584



In [11]:
from sklearn.feature_selection import chi2, f_classif

In [12]:
count_chi2_matrix = tmu.stat_univariate_fs(baseline_count_matrix, y, weight_method=chi2,
selection_method='k_best',
num_features=50, scores_to_print=25
)

Top 25 features:
       Attribute       Weight
22657        not  1824.979176
828            @  1710.368519
26886    retweet  1221.587745
1              "   570.699501
2              #   553.544532
822            :   505.579250
0              !   474.744497
20442         me   321.612486
13             .   280.745783
9              *   278.118812
14153      happy   276.768595
35873  yeahright   263.229391
14280       hate   236.845865
35011       when   234.404584
18980       like   216.996930
4213    birthday   206.267782
15516         if   162.292119
26409  red_heart   152.802867
12202    forward   141.381818
33876         up   136.666340
14934   homework   136.515030
7843      credit   128.355556
19117  literally   128.261950
29301      sleep   122.255489
26018       rain   120.849162


In [13]:
# using anova
count_anova_matrix = tmu.stat_univariate_fs(baseline_count_matrix, y, weight_method=f_classif,
selection_method='k_best',
num_features=50, scores_to_print=25
)


Top 25 features:
       Attribute       Weight
22657        not  2020.502497
828            @  1570.109117
26886    retweet  1297.627559
0              !   338.049642
2              #   310.752780
35873  yeahright   266.611842
14153      happy   261.150540
822            :   244.341096
20442         me   242.422098
35011       when   233.605452
14280       hate   216.210016
1              "   210.165508
18980       like   201.844337
4213    birthday   191.692966
13             .   166.973356
15516         if   158.962953
12202    forward   142.957642
19117  literally   129.744024
14934   homework   124.404026
33876         up   120.462458
15097       hour   114.288791
26018       rain   113.258828
11220   facebook   112.984739
18535    laughed   112.744013
29301      sleep   111.607361


In [13]:
## create the models and get performance
tmu.printClassifReport(dt_clf, count_chi2_matrix, y)
tmu.printClassifReport(dt_clf, count_anova_matrix, y)

NameError: name 'dt_clf' is not defined

In [10]:
count_matrix = tmu.build_count_matrix(replace_user_mentions.text)
count_matrix.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌸,󾍀,󾍃,󾓤,󾬑,󾭞,󾭻,󾮗,󾮚,󾮟
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.tree import DecisionTreeClassifier

y = replace_user_mentions.isSarcastic

dt = DecisionTreeClassifier(random_state=1)
tree_weighted_matrix = tmu.clf_univariate_fs(count_matrix, y, dt,
num_features=25, scores_to_print=25)

Top 25 features:
       Attribute    Weight
22657        not  0.050386
828            @  0.040777
2              #  0.040449
26886    retweet  0.031324
13             .  0.015601
35873  yeahright  0.009753
15351          i  0.009721
19456       love  0.009513
19117  literally  0.007720
14280       hate  0.007300
36008        you  0.007002
822            :  0.006802
32611         to  0.006377
11             ,  0.006039
26018       rain  0.005585
1              "  0.005464
18980       like  0.005386
1002           a  0.005117
35011       when  0.004945
0              !  0.004862
21751         my  0.004680
31957        the  0.004405
2120         and  0.004349
15516         if  0.004250
20442         me  0.004209


In [12]:
tmu.printClassifReport(dt, tree_weighted_matrix, y)

              precision    recall  f1-score   support

           0       0.75      0.71      0.73     21292
           1       0.73      0.77      0.75     21292

    accuracy                           0.74     42584
   macro avg       0.74      0.74      0.74     42584
weighted avg       0.74      0.74      0.74     42584

