In [45]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
import My_Preprocessing as prep_funct
import text_mining_utils as tmu

In [46]:
#prep_funct.remove_Sarcasm_hashtag()

dataset = pd.read_json("cleaned_#sarcasm.json")

print(dataset.count())

sarcastic_counts = dataset['isSarcastic'].value_counts()

print()
print()

# Display result
print("Number of rows for each value in the 'isSarcastic' column:")
print(sarcastic_counts)

isSarcastic    39780
text           39780
dtype: int64


Number of rows for each value in the 'isSarcastic' column:
isSarcastic
0    21292
1    18488
Name: count, dtype: int64


In [47]:
dataset = prep_funct.random_oversampling(dataset)

class_counts = dataset['isSarcastic'].value_counts()

# Display the number of rows for each class
print("Number of rows for each class:")
print(class_counts)

Number of rows for each class:
isSarcastic
0    21292
1    21292
Name: count, dtype: int64


In [44]:
# Shuffle the dataset
dataset_shuffled = dataset.sample(frac=1, random_state=42)

data_sample = len(dataset) // 2

dataset = dataset_shuffled.iloc[:data_sample]
total_rows = len(dataset)
print(total_rows)

21292


In [48]:

clean_operations = {
r'(\(.+?\))+' : '', ## paranthetical notes to be replaced by empty string
r'(\[.+?\])+' : '', ## numbered citations to be replaced by empty string
r'\s+' : ' ', ## any type of white space to be replaced with a single white space
r'\s{2,}' : ' ', ## 2 or more consecutive white spaces to be replaced with a single white spac
}

clean_data = dataset.copy()

clean_data.text = clean_data.text.apply(tmu.clean_doc, clean_operations=clean_operations)
clean_data.head()

Unnamed: 0,text,isSarcastic
0,@0430yes i hope youre lurking rn. i want to li...,0
1,05 really taught me a valuable lesson I'm neve...,0
2,"@098BERRY Never had a voice to protest, so you...",0
3,@0hMySt4rs Rest in peace & love to you and you...,0
4,100 days until Christmas! 🌲 #too soon #not rea...,0


In [49]:
lower_docs = clean_data.copy()
lower_docs.text = lower_docs.text.apply(str.lower)

In [31]:
contradictions = lower_docs.copy()

#define dictionary, couldn't find library for this
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'll": "that will",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why'll": "why will",
    "why's": "why is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

#doc = list(contradictions.text)
#docs = ' '.join(doc)

contradictions['text'] = contradictions['text'].apply(lambda x: tmu.resolve_contractions(x, contractions_dict))
contradictions.count()

text           42584
isSarcastic    42584
dtype: int64

In [50]:
replace_emoji = lower_docs.copy()
replace_emoji = prep_funct.replace_emoji_emoticons(lower_docs)

In [51]:
removed_abbreviations = replace_emoji.copy()
removed_abbreviations = prep_funct.replace_abbreviations(replace_emoji)

In [52]:
replace_user_mentions = removed_abbreviations.copy
replace_user_mentions = prep_funct.replace_user_mentions(removed_abbreviations)

In [53]:
baseline_count_matrix = tmu.build_count_matrix(replace_user_mentions.text)
baseline_count_matrix.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,󾌸,󾍀,󾍃,󾓤,󾬑,󾭞,󾭻,󾮗,󾮚,󾮟
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=1)

y = replace_user_mentions.isSarcastic
x = replace_user_mentions.text

tmu.printClassifReport(dt_clf, baseline_count_matrix, y)

KeyboardInterrupt: 

In [55]:
from sklearn.tree import DecisionTreeClassifier

y = replace_user_mentions.isSarcastic

dt = DecisionTreeClassifier(random_state=1)
tree_weighted_matrix = tmu.clf_univariate_fs(baseline_count_matrix, y, dt,
num_features=25, scores_to_print=25)

Top 25 features:
       Attribute    Weight
22657        not  0.050386
828            @  0.040777
2              #  0.040449
26886    retweet  0.031324
13             .  0.015601
35873  yeahright  0.009753
15351          i  0.009721
19456       love  0.009513
19117  literally  0.007720
14280       hate  0.007300
36008        you  0.007002
822            :  0.006802
32611         to  0.006377
11             ,  0.006039
26018       rain  0.005585
1              "  0.005464
18980       like  0.005386
1002           a  0.005117
35011       when  0.004945
0              !  0.004862
21751         my  0.004680
31957        the  0.004405
2120         and  0.004349
15516         if  0.004250
20442         me  0.004209


In [67]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Convert text data to numerical format using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(replace_user_mentions['text'])
dt_c = DecisionTreeClassifier(random_state=1)

y = replace_user_mentions['isSarcastic']

X_reduced = tree_weighted_matrix

# Define the hyperparameter grid
param_grid = {
    'max_depth': range(600, 700), #[10, 100, 400, 600, 800, 900]
    'min_samples_split': range(2, 5),
    'min_samples_leaf': range(1, 3),
    'criterion': ['gini', 'entropy']
}

# Create the GridSearchCV object
grid_search = GridSearchCV(dt_c,
                           param_grid=param_grid,
                           cv=15,
                           scoring='accuracy',
                           verbose=1,
                           n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)
print(grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_
print(best_model)

Fitting 15 folds for each of 1200 candidates, totalling 18000 fits


KeyboardInterrupt: 

In [68]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

#Define the best hyperparameters
best_params = {
    'criterion':'gini',
    'max_depth': 600, 
    'min_samples_split': 3
}


model = DecisionTreeClassifier(random_state=1, **best_params)

#cross-validation
cv_scores = cross_val_score(model, tree_weighted_matrix, y, cv=15)

#Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.74145826 0.7284255  0.72701656 0.71187038 0.71046143 0.64600211
 0.67241987 0.7284255  0.76400141 0.80943994 0.66924974 0.74075379
 0.7312434  0.86720676 0.8111346 ]
Mean CV score: 0.7372739493859769


550,
Cross-validation scores: [0.74216273 0.72243748 0.7245509  0.71961958 0.71468827 0.63402607
 0.68263473 0.72701656 0.75589996 0.8119056  0.66326171 0.72560761
 0.72595985 0.85910532 0.81606765]
Mean CV score: 0.734996267879612

600,

Cross-validation scores: [0.75061641 0.721733   0.72490313 0.70975696 0.7139838  0.63860514
 0.67241987 0.73441353 0.76224023 0.81754139 0.662205   0.73793589
 0.73089116 0.86157098 0.80479211]
Mean CV score: 0.7362405727863593

800,
Cross-validation scores: [0.74850299 0.7245509  0.72067629 0.71081367 0.71257485 0.64283198
 0.67981684 0.72525537 0.75695667 0.80204297 0.66784079 0.73934484
 0.72772103 0.86227545 0.81324877]
Mean CV score: 0.7356302277507747

In [69]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=1, **best_params)

y = replace_user_mentions.isSarcastic
x = replace_user_mentions.text

tmu.printClassifReport(model, tree_weighted_matrix, y)

              precision    recall  f1-score   support

           0       0.75      0.72      0.73     21292
           1       0.73      0.76      0.75     21292

    accuracy                           0.74     42584
   macro avg       0.74      0.74      0.74     42584
weighted avg       0.74      0.74      0.74     42584



In [70]:
dt_results = pd.DataFrame(grid_search.cv_results_)
dt_results.head(15)

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'