In [39]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.tree import DecisionTreeClassifier

import My_Preprocessing as prep_funct
import text_mining_utils as tmu

In [40]:
# Disabled preprocessing call, kept for reference
# (presumably the step that produced the cleaned JSON file -- TODO confirm):
#prep_funct.remove_Sarcasm_hashtag()

# Load the tweets; the frame has a 'text' column and an 'isSarcastic' label column.
dataset = pd.read_json("cleaned_#sarcasm.json")

# Non-null entries per column.
print(dataset.count())

# Class distribution of the 'isSarcastic' label.
sarcastic_counts = dataset["isSarcastic"].value_counts()

# Two empty lines separate the two printed summaries.
print("\n")

print("Number of rows for each value in the 'isSarcastic' column:")
print(sarcastic_counts)

isSarcastic    39780
text           39780
dtype: int64


Number of rows for each value in the 'isSarcastic' column:
isSarcastic
0    21292
1    18488
Name: count, dtype: int64


In [41]:
# Equalise the two classes with the project's random-oversampling helper.
# NOTE(review): oversampling happens before any evaluation; if the reports
# further down rely on cross-validation, duplicated rows could end up on both
# sides of a split -- confirm against tmu.printClassifReport.
dataset = prep_funct.random_oversampling(dataset)

# Rows per 'isSarcastic' class after oversampling.
class_counts = dataset["isSarcastic"].value_counts()
print("Number of rows for each class:", class_counts, sep="\n")

Number of rows for each class:
isSarcastic
0    21292
1    21292
Name: count, dtype: int64


In [42]:
# Shuffle the dataset
dataset_shuffled = dataset.sample(frac=1, random_state=42)

data_sample = len(dataset) // 2

# Take the first quarter of the shuffled dataset
dataset = dataset_shuffled.iloc[:data_sample]

# Now, quarter_dataset contains a representative 1/4 subset of your original dataset


total_rows = len(dataset)
print(total_rows)

21292


In [43]:

# Regex pattern -> replacement pairs handed to tmu.clean_doc.
clean_operations = {
    r'(\(.+?\))+': '',  # parenthetical notes are dropped
    r'(\[.+?\])+': '',  # bracketed spans (e.g. numbered citations) are dropped
    r'\s+': ' ',        # any run of whitespace becomes one space
    r'\s{2,}': ' ',     # two or more whitespace characters become one space
}

# Build a cleaned frame; `dataset` itself is left as it was.
clean_data = dataset.assign(
    text=dataset.text.apply(tmu.clean_doc, clean_operations=clean_operations)
)
clean_data.head()

Unnamed: 0,text,isSarcastic
30956,Love waking up to angry work email. #Sarcastic...,1
42309,@SA_StuAffairs We teach on campus sushi classe...,1
20397,With all these comebacks I'm ready to make vin...,0
11960,Lost all my mac lipsticks not slightly heartbr...,0
39742,you text me I respond in 15 seconds then appar...,1


In [44]:
# Lower-case every tweet on a copy of the cleaned frame.
# The vectorised .str accessor is used instead of .apply(str.lower): it yields
# the same result for strings and propagates missing values rather than
# raising TypeError on a non-string cell.
lower_docs = clean_data.copy()
lower_docs["text"] = lower_docs["text"].str.lower()

In [None]:
# Replace emojis/emoticons in the lower-cased tweets.
# The helper is given its own copy of the frame, so `lower_docs` stays intact
# even if the helper edits its argument in place (its internals are not
# visible from this notebook). Previously the copy was created and then
# immediately overwritten, so it protected nothing.
replace_emoji = prep_funct.replace_emoji_emoticons(lower_docs.copy())

In [45]:
# Expand/replace abbreviations in the emoji-normalised tweets.
# The helper is given its own copy of the frame, so `replace_emoji` stays
# intact even if the helper edits its argument in place (its internals are not
# visible from this notebook). Previously the copy was created and then
# immediately overwritten, so it protected nothing.
removed_abbreviations = prep_funct.replace_abbreviations(replace_emoji.copy())

In [60]:
# Baseline representation 1: per-document token counts built by the project helper
# from the fully preprocessed tweet text.
baseline_count_matrix = tmu.build_count_matrix(removed_abbreviations["text"])
baseline_count_matrix.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌴,󾌵,󾌸,󾍀,󾍃,󾓤,󾭞,󾭻,󾮗,󾮚
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Baseline representation 2: term-frequency matrix built by the project helper.
# NOTE: the run recorded below this cell stopped with a MemoryError while
# allocating a dense (21292, 23748) int64 array (3.77 GiB). A later cell still
# passes `baseline_tf_matrix` to printClassifReport, so that cell depends on
# this one having succeeded in an earlier execution.
baseline_tf_matrix = tmu.build_tf_matrix(removed_abbreviations["text"])
baseline_tf_matrix.head()

MemoryError: Unable to allocate 3.77 GiB for an array with shape (21292, 23748) and data type int64

In [None]:
# Baseline representation 3: TF-IDF weighted matrix built by the project helper
# from the same preprocessed tweet text.
baseline_tfidf_matrix = tmu.build_tfidf_matrix(removed_abbreviations["text"])
baseline_tfidf_matrix.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌴,󾌵,󾌸,󾍀,󾍃,󾓤,󾭞,󾭻,󾮗,󾮚
0,0.0,0.0,0.2566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.121421,0.0,0.0,0.0,0.0,0.167072,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Baseline evaluation: one decision tree (fixed seed) reported on each of the
# three document representations. DecisionTreeClassifier is imported in the
# notebook's top import cell.
dt_clf = DecisionTreeClassifier(random_state=1)

# Target labels, taken from the same frame the matrices were built from.
y = removed_abbreviations.isSarcastic

# Reports are printed in this order: counts, term frequency, TF-IDF.
for matrix in (baseline_count_matrix, baseline_tf_matrix, baseline_tfidf_matrix):
    tmu.printClassifReport(dt_clf, matrix, y)

              precision    recall  f1-score   support

           0       0.73      0.73      0.73     10641
           1       0.73      0.73      0.73     10651

    accuracy                           0.73     21292
   macro avg       0.73      0.73      0.73     21292
weighted avg       0.73      0.73      0.73     21292

              precision    recall  f1-score   support

           0       0.71      0.71      0.71     10641
           1       0.71      0.71      0.71     10651

    accuracy                           0.71     21292
   macro avg       0.71      0.71      0.71     21292
weighted avg       0.71      0.71      0.71     21292

              precision    recall  f1-score   support

           0       0.69      0.70      0.70     10641
           1       0.70      0.69      0.70     10651

    accuracy                           0.70     21292
   macro avg       0.70      0.70      0.70     21292
weighted avg       0.70      0.70      0.70     21292



In [51]:
from sklearn.feature_selection import chi2, f_classif

In [52]:
# Univariate feature selection on the count matrix, scored with chi-squared:
# 'k_best' selection with num_features=50, printing 50 scores.
count_chi2_matrix = tmu.stat_univariate_fs(
    baseline_count_matrix,
    y,
    weight_method=chi2,
    selection_method='k_best',
    num_features=50,
    scores_to_print=50,
)

Top 50 features:
             Attribute      Weight
557                  @  952.314605
14764              not  879.842677
17583          retweet  622.920848
2                    #  260.471292
1                    "  256.510131
551                  :  226.698449
0                    !  213.046605
13352               me  158.990876
13                   .  148.260295
9404             happy  135.012760
9                    *  134.540092
9484              hate  122.783989
2761          birthday  118.833540
12427             like  109.137330
22870             when  108.713097
23432        yeahright  106.037074
10284               if   80.714532
10021             hour   68.379683
22125               up   66.565153
19431          someone   65.747612
8148           forward   65.182690
13869           moment   64.814340
22548             wake   64.583162
12524        literally   62.937663
9904          homework   62.188536
656                  a   61.650284
4391           classes   61.636750
520

In [53]:
# Same univariate selection as the chi-squared variant, scored with the
# ANOVA F-test (f_classif): 'k_best' selection with num_features=50.
count_anova_matrix = tmu.stat_univariate_fs(
    baseline_count_matrix,
    y,
    weight_method=f_classif,
    selection_method='k_best',
    num_features=50,
    scores_to_print=50,
)


Top 50 features:
             Attribute      Weight
14764              not  962.277820
557                  @  874.029047
17583          retweet  668.444150
0                    !  157.538848
2                    #  146.714036
9404             happy  129.726984
13352               me  121.470569
2761          birthday  113.006288
22870             when  109.611769
9484              hate  108.545891
551                  :  108.367363
23432        yeahright  107.134267
12427             like  102.705705
1                    "   96.364180
13                   .   89.417378
10284               if   79.182847
8148           forward   65.882620
10021             hour   64.275013
19431          someone   64.082727
12524        literally   63.925331
13869           moment   63.765423
22548             wake   60.479611
4391           classes   59.061636
656                  a   57.385197
9904          homework   56.264197
12125          laughed   55.835414
18769           shitty   55.806169
221

In [55]:
## Report the decision tree on each reduced feature set:
## first the chi-squared selection, then the ANOVA selection.
for reduced_matrix in (count_chi2_matrix, count_anova_matrix):
    tmu.printClassifReport(dt_clf, reduced_matrix, y)

              precision    recall  f1-score   support

           0       0.66      0.76      0.70     10641
           1       0.71      0.61      0.66     10651

    accuracy                           0.68     21292
   macro avg       0.69      0.68      0.68     21292
weighted avg       0.69      0.68      0.68     21292

              precision    recall  f1-score   support

           0       0.66      0.76      0.71     10641
           1       0.72      0.61      0.66     10651

    accuracy                           0.68     21292
   macro avg       0.69      0.68      0.68     21292
weighted avg       0.69      0.68      0.68     21292



In [56]:
# Reuse the count matrix that was already built from the same text column
# instead of calling tmu.build_count_matrix a second time with identical input.
# A second dense copy costs several GiB, and this notebook has already hit a
# MemoryError on a matrix of this shape.
# NOTE(review): this is an alias, not a copy -- assumes the feature-selection
# helpers do not modify the matrix in place; confirm in tmu.
count_matrix = baseline_count_matrix
count_matrix.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌴,󾌵,󾌸,󾍀,󾍃,󾓤,󾭞,󾭻,󾮗,󾮚
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# Classifier-driven feature selection on the count matrix.
# DecisionTreeClassifier is imported in the notebook's top import cell, so the
# duplicate mid-notebook import is gone.
y = removed_abbreviations.isSarcastic

# Fixed seed so the ranking is reproducible.
dt = DecisionTreeClassifier(random_state=1)

# Weights come from the classifier passed in (presumably the tree's feature
# importances -- confirm in tmu); num_features=25, printing 25 scores.
tree_weighted_matrix = tmu.clf_univariate_fs(
    count_matrix,
    y,
    dt,
    num_features=25,
    scores_to_print=25,
)

Top 25 features:
       Attribute    Weight
14764        not  0.047774
557            @  0.045809
2              #  0.038992
17583    retweet  0.030833
13             .  0.012714
20882        the  0.010673
21294         to  0.008680
12734       love  0.008639
9484        hate  0.008505
23432  yeahright  0.008462
10181          i  0.007761
12524  literally  0.007619
23525        you  0.007130
1              "  0.006373
15077         of  0.006128
11             ,  0.005818
13352         me  0.005792
14192         my  0.005398
10855         it  0.005190
551            :  0.005037
656            a  0.004803
0              !  0.004717
16975       rain  0.004538
12427       like  0.004438
1396         and  0.004432


In [59]:
# Classification report for the decision tree `dt` on the 25-feature matrix
# produced by the classifier-based selection, against the labels `y`.
tmu.printClassifReport(dt, tree_weighted_matrix, y)

              precision    recall  f1-score   support

           0       0.73      0.73      0.73     10641
           1       0.73      0.73      0.73     10651

    accuracy                           0.73     21292
   macro avg       0.73      0.73      0.73     21292
weighted avg       0.73      0.73      0.73     21292

