In [2]:
import numpy as np 
import pandas as pd 
import nltk
import os
import re
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM, Bidirectional
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [3]:
data_1 = pd.read_json("Dataset\\Spam_Detection.json", lines=True)
del data_1["context"]
data_2 = pd.read_json("Dataset\\Spam_Headlines_Dataset.json", lines=True)
del data_2["article_link"]
data =  pd.concat([data_1,data_2])
data.head()
print(data)

       is_sarcastic                                           headline
0                 1  @USER @USER @USER I don't get this .. obviousl...
1                 1  @USER @USER trying to protest about . Talking ...
2                 1  @USER @USER @USER He makes an insane about of ...
3                 1  @USER @USER Meanwhile Trump won't even release...
4                 1  @USER @USER Pretty Sure the Anti-Lincoln Crowd...
...             ...                                                ...
28614             1       jews to celebrate rosh hashasha or something
28615             1  internal affairs investigator disappointed con...
28616             0  the most beautiful acceptance speech this week...
28617             1  mars probe destroyed by orbiting spielberg-gat...
28618             1                 dad clarifies this not a food stop

[33619 rows x 2 columns]


# Pre Processing

In [4]:
def clean_text(text):
    text = text.lower()
    pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = pattern.sub('', text)
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))           #remove @
    emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"                     # emoticons
                           u"\U0001F300-\U0001F5FF"                     # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"                     # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"                     # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    text = emoji.sub(r'', text)
    text = text.lower()
    # Limitation and stamming 
    # convert " ' " to actual word
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)        
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text) 
    text = re.sub(r"\'ll", " will", text)  
    text = re.sub(r"\'ve", " have", text)  
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"did't", "did not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"have't", "have not", text)
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    return text

In [5]:
import string
from nltk.tokenize import word_tokenize     #analyse and tekonize texts
from nltk.corpus import stopwords           #remove  some words without value 'the,and,of'

def CleanTokenize(df):
    head_lines = list()
    lines = df["headline"].values.tolist()       #lines = data["headline"]     #a=pd.array(lines, dtype="string")
    for line in lines:
        line = clean_text(line)
        tokens = word_tokenize(line)                            # tokenize the text
        table = str.maketrans('', '', string.punctuation)       # remove puntuations
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]   # remove non alphabetic characters
        stop_words = set(stopwords.words("english"))            # remove stop words
        words = [w for w in words if not w in stop_words]
        head_lines.append(words)
    return head_lines

head_lines = CleanTokenize(data) #store the cleaned and tokenized headlines in the head_lines variable
head_lines[0:1]
print(type(head_lines))

<class 'list'>


In [6]:
validation_split = 0.2  #sets the fraction of the data that will be used for validation
max_length = 25         #sets the maximum length of the input sequences
tokenizer_obj = Tokenizer() #initializes a Tokenizer object.
tokenizer_obj.fit_on_texts(head_lines) # fits the Tokenizer object on the tokenized and cleaned text data. 
sequences = tokenizer_obj.texts_to_sequences(head_lines) #converts the tokenized sequences to numerical sequences using the fitted Tokenizer object.

word_index = tokenizer_obj.word_index  #The word_index dictionary contains key-value pairs where each word in the text data is a key and its corresponding integer index is the value.
print("unique tokens - ",len(word_index)) #The len(word_index) function returns the number of unique words in the text data.
vocab_size = len(tokenizer_obj.word_index) + 1 #The vocab_size variable is initialized to the number of unique words in the text data plus one. This is because the integer index starts from 1 and not 0. The vocab_size indicates the size of the vocabulary that will be used to train the machine learning model.
print('vocab size -', vocab_size)

lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')                                          #pad_sequences function is used to pad the sequences to a fixed length of max_length. The sequences variable contains the tokenized sequences for each text headline. padding='post' means that the padding will be added to the end of each sequence.
sentiment =  data['is_sarcastic'].values #extracts the labels for each text from the data object and assigns them to the sentiment variable. This creates a NumPy array containing the sentiment labels for all the headlines in the dataset.

indices = np.arange(lines_pad.shape[0]) #creates an array of indices from 0 to the number of rows in the lines_pad array
np.random.shuffle(indices) #shuffles the indices randomly
lines_pad = lines_pad[indices] #use the shuffled indices to rearrange the order of the rows in the lines_pad and sentiment arrays, so that the rows are in a different order than they were before.
sentiment = sentiment[indices]

num_validation_samples = int(validation_split * lines_pad.shape[0])#                                                         stores the number of validation samples, which is calculated by multiplying the validation split value with the total number of padded sequences.

X_train = lines_pad[:-num_validation_samples]    
y_train = sentiment[:-num_validation_samples]
X_test = lines_pad[-num_validation_samples:]
y_test = sentiment[-num_validation_samples:]

unique tokens -  33331
vocab size - 33332


In [7]:
print('Shape of X_train:' , X_train.shape)
print('Shape of y_train:' , y_train.shape)
print('Shape of X_test:'  , X_test.shape)
print('Shape of y_test:'  , y_test.shape)

Shape of X_train: (26896, 25)
Shape of y_train: (26896,)
Shape of X_test: (6723, 25)
Shape of y_test: (6723,)


In [8]:
embeddings_index = {}
embedding_dim = 100
GLOVE_DIR = "Dataset\\GLOVE.txt"
f = open(GLOVE_DIR, encoding = "utf-8")
for line in f:
    values = line.split()                                     #splitting each line into a list of values.
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')           #convert the text after word to array
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))       #prints the number of word vectors that were found in the file.   

Found 4907 word vectors.


In [9]:
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
c = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        c+=1
        embedding_matrix[i] = embedding_vector
print(c)
#The variable c keeps track of words Number for which the embedding vectors are found in the embeddings_index dictionary

2292


In [10]:
embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,                #the size of the embedding vector.
                            weights=[embedding_matrix],   #the pre-trained embedding matrix.
                            input_length=max_length,      #the length of the input sequences.
                            trainable=False)  #the weights of the embedding layer are frozen and wont be updated during training

In [11]:
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.25))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print('Summary of the built model...')
print(model.summary())

Summary of the built model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 100)           3333200   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                42240     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 3,375,505
Trainable params: 42,305
Non-trainable params: 3,333,200
_________________________________________________________________
None


In [12]:
history = model.fit(X_train, y_train, batch_size=32, epochs=25, validation_data=(X_test, y_test), verbose=2)

Epoch 1/25
841/841 - 2101s - loss: 0.6400 - acc: 0.6295 - val_loss: 0.6116 - val_acc: 0.6621
Epoch 2/25
841/841 - 1897s - loss: 0.5994 - acc: 0.6723 - val_loss: 0.5896 - val_acc: 0.6748
Epoch 3/25
841/841 - 590s - loss: 0.5816 - acc: 0.6842 - val_loss: 0.5784 - val_acc: 0.6866
Epoch 4/25
841/841 - 578s - loss: 0.5693 - acc: 0.6916 - val_loss: 0.5775 - val_acc: 0.6906
Epoch 5/25
841/841 - 675s - loss: 0.5576 - acc: 0.6987 - val_loss: 0.5779 - val_acc: 0.6867
Epoch 6/25
841/841 - 847s - loss: 0.5482 - acc: 0.7090 - val_loss: 0.5826 - val_acc: 0.6933
Epoch 7/25
841/841 - 640s - loss: 0.5408 - acc: 0.7151 - val_loss: 0.5625 - val_acc: 0.6985
Epoch 8/25
841/841 - 565s - loss: 0.5304 - acc: 0.7216 - val_loss: 0.5681 - val_acc: 0.6918
Epoch 9/25
841/841 - 565s - loss: 0.5276 - acc: 0.7248 - val_loss: 0.5800 - val_acc: 0.6955
Epoch 10/25
841/841 - 572s - loss: 0.5188 - acc: 0.7316 - val_loss: 0.5566 - val_acc: 0.7004
Epoch 11/25
841/841 - 555s - loss: 0.5109 - acc: 0.7351 - val_loss: 0.5639 - 

In [13]:
def predict_sarcasm(s):
    x_final = pd.DataFrame({"headline":[s]})
    test_lines = CleanTokenize(x_final)
    test_sequences = tokenizer_obj.texts_to_sequences(test_lines)
    test_review_pad = pad_sequences(test_sequences, maxlen=max_length, padding='post')
    pred = model.predict(test_review_pad)
    pred*=100
    if pred[0][0]>=50: return "Spam!"
    else: return "Not Spam."

In [14]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

def apply_stemming(text):
    stemmer = PorterStemmer()
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

def apply_lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized_words)

# Applying stemming and lemmatization to the headlines
stemmed_headlines = [apply_stemming(" ".join(tokens)) for tokens in head_lines]
lemmatized_headlines = [apply_lemmatization(" ".join(tokens)) for tokens in head_lines]

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Applying Bag of Words (BOW)
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(lemmatized_headlines)

# Applying TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(lemmatized_headlines)

In [16]:
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Applying K-Means Clustering (k=2)
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train)
kmeans_labels = kmeans.predict(X_test)
accuracy_kmeans = accuracy_score(y_test, kmeans_labels)
print(f"K-Means Clustering - Accuracy: {accuracy_kmeans:.4f}")

# Applying Decision Tree Classifier
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(X_train, y_train)
y_pred_dt = model2.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree - Accuracy: {accuracy_dt:.4f}")

# Applying Random Forest Classifier
model3 = RandomForestClassifier(random_state=42)
model3.fit(X_train, y_train)
y_pred_rf = model3.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest - Accuracy: {accuracy_rf:.4f}")

# Applying Naive Bayes Classifier
model4 = MultinomialNB()
model4.fit(X_train, y_train)
y_pred_nb = model4.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes - Accuracy: {accuracy_nb:.4f}")

# Applying Support Vector Machine (SVM)
model5 = SVC()
model5.fit(X_train, y_train)
y_pred_svm = model5.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Support Vector Machine - Accuracy: {accuracy_svm:.4f}")

K-Means Clustering - Accuracy: 0.5142
Decision Tree - Accuracy: 0.5770
Random Forest - Accuracy: 0.6220
Naive Bayes - Accuracy: 0.5521
Support Vector Machine - Accuracy: 0.5691


In [17]:
history = model.fit(X_train, y_train, batch_size=32, epochs=23, validation_data=(X_test, y_test), verbose=2)

Epoch 1/23
841/841 - 542s - loss: 0.4416 - acc: 0.7756 - val_loss: 0.6001 - val_acc: 0.7050
Epoch 2/23
841/841 - 540s - loss: 0.4434 - acc: 0.7743 - val_loss: 0.6016 - val_acc: 0.7071
Epoch 3/23
841/841 - 541s - loss: 0.4374 - acc: 0.7794 - val_loss: 0.5990 - val_acc: 0.7083
Epoch 4/23
841/841 - 627s - loss: 0.4341 - acc: 0.7848 - val_loss: 0.6173 - val_acc: 0.7107
Epoch 5/23
841/841 - 33976s - loss: 0.4310 - acc: 0.7829 - val_loss: 0.6098 - val_acc: 0.7086
Epoch 6/23
841/841 - 527s - loss: 0.4270 - acc: 0.7849 - val_loss: 0.6190 - val_acc: 0.7104
Epoch 7/23
841/841 - 553s - loss: 0.4236 - acc: 0.7901 - val_loss: 0.6175 - val_acc: 0.7120
Epoch 8/23
841/841 - 505s - loss: 0.4235 - acc: 0.7880 - val_loss: 0.6207 - val_acc: 0.7120
Epoch 9/23
841/841 - 482s - loss: 0.4224 - acc: 0.7906 - val_loss: 0.6286 - val_acc: 0.7095
Epoch 10/23
841/841 - 480s - loss: 0.4163 - acc: 0.7922 - val_loss: 0.6273 - val_acc: 0.7088
Epoch 11/23
841/841 - 468s - loss: 0.4162 - acc: 0.7916 - val_loss: 0.6314 - 

In [18]:
from keras.callbacks import ModelCheckpoint

# Define the checkpoint callback
checkpoint_path = "best_spam_model.h5"
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_acc', save_best_only=True, mode='max', verbose=1)

# Train the model with checkpointing
history = model.fit(X_train, y_train, 
                    batch_size=32, 
                    epochs=25, 
                    validation_data=(X_test, y_test), 
                    callbacks=[checkpoint], 
                    verbose=2)

Epoch 1/25
841/841 - 489s - loss: 0.3924 - acc: 0.8059 - val_loss: 0.6444 - val_acc: 0.7021

Epoch 00001: val_acc improved from -inf to 0.70207, saving model to best_spam_model.h5
Epoch 2/25
841/841 - 489s - loss: 0.3897 - acc: 0.8069 - val_loss: 0.6852 - val_acc: 0.7080

Epoch 00002: val_acc improved from 0.70207 to 0.70802, saving model to best_spam_model.h5
Epoch 3/25
841/841 - 484s - loss: 0.3897 - acc: 0.8093 - val_loss: 0.6397 - val_acc: 0.7056

Epoch 00003: val_acc did not improve from 0.70802
Epoch 4/25
841/841 - 486s - loss: 0.3904 - acc: 0.8088 - val_loss: 0.6279 - val_acc: 0.7079

Epoch 00004: val_acc did not improve from 0.70802
Epoch 5/25
841/841 - 478s - loss: 0.3876 - acc: 0.8112 - val_loss: 0.6539 - val_acc: 0.7019

Epoch 00005: val_acc did not improve from 0.70802
Epoch 6/25
841/841 - 487s - loss: 0.3868 - acc: 0.8093 - val_loss: 0.6580 - val_acc: 0.7088

Epoch 00006: val_acc improved from 0.70802 to 0.70876, saving model to best_spam_model.h5
Epoch 7/25
841/841 - 493s

In [19]:
from keras.models import load_model

def predict_sarcasm(sentence):
    model = load_model("best_spam_model.h5")  # Load the saved model
    x_final = pd.DataFrame({"headline": [sentence]})
    
    test_lines = CleanTokenize(x_final)  # Clean and tokenize input text
    test_sequences = tokenizer_obj.texts_to_sequences(test_lines)
    test_review_pad = pad_sequences(test_sequences, maxlen=max_length, padding='post')
    
    pred = model.predict(test_review_pad)[0][0] * 100  # Get prediction score
    return "Spam!" if pred >= 50 else "Not Spam."

In [29]:
import pickle

with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer_obj, handle)

print("‚úÖ Tokenizer saved as tokenizer.pkl!")

‚úÖ Tokenizer saved as tokenizer.pkl!


In [36]:
try:
    model = load_model("best_spam_model.h5", compile=False)
    model.save("best_spam_model.keras", save_format="keras")
    print("Model loaded successfully!")
except Exception as e:
     print("Error loading¬†model:" , e)

Model loaded successfully!


In [41]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to C:\Users\user/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [51]:
os.system(f"cd huggingface_model_repo && git add . && git commit -m 'Upload trained model and tokenizer' && git push origin main")

1

In [52]:
from huggingface_hub import HfApi
import shutil
import os

repo_name = "Ahmad1020/sarcasm-spam-detector"  # Your Hugging Face repo
local_repo_path = "./huggingface_model_repo"

# Authenticate and create repo (if not already created)
api = HfApi()
api.create_repo(repo_name, exist_ok=True)

# Clone the repo manually using git
#os.system(f"git clone https://huggingface.co/{repo_name} {local_repo_path}")

# Ensure the repo exists
os.makedirs(local_repo_path, exist_ok=True)

# Copy model and tokenizer files
model_path = "best_spam_model.keras"
tokenizer_path = "tokenizer.pkl"

shutil.copy(model_path, f"{local_repo_path}/best_spam_model.keras")
shutil.copy(tokenizer_path, f"{local_repo_path}/tokenizer.pkl")

# Push changes using direct git commands
os.system(f"cd {local_repo_path} && git add . && git commit -m 'Upload trained model and tokenizer' && git push")

1

In [58]:
import tensorflow as tf
import pickle
import requests
import os

# Hugging Face repository URL
repo_url = "https://huggingface.co/Ahmad1020/sarcasm-spam-detector/resolve/main/"

# Define filenames
model_filename = "best_spam_model.keras"
tokenizer_filename = "tokenizer.pkl"

# Download model file
if not os.path.exists(model_filename):
    print("Downloading model...")
    model_response = requests.get(repo_url + model_filename)
    with open(model_filename, "wb") as f:
        f.write(model_response.content)

# Load the model
model = tf.keras.models.load_model(model_filename)
print("Model loaded successfully!")

# Download tokenizer file
if not os.path.exists(tokenizer_filename):
    print("Downloading tokenizer...")
    tokenizer_response = requests.get(repo_url + tokenizer_filename)
    with open(tokenizer_filename, "wb") as f:
        f.write(tokenizer_response.content)

# Load the tokenizer
with open(tokenizer_filename, "rb") as f:
    tokenizer = pickle.load(f)

print("Tokenizer loaded successfully!")

# Test the model
sample_text = ["This is an example spam message."]
sequences = tokenizer.texts_to_sequences(sample_text)
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=25)

prediction = model.predict(padded)[0][0]
label = "Spam" if prediction > 0.5 else "Not Spam"

print(f"Prediction: {label} (Confidence: {prediction:.4f})")


Model loaded successfully!
Tokenizer loaded successfully!
Prediction: Not Spam (Confidence: 0.4694)


In [61]:
import os
import pickle
import pandas as pd
import requests
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ÿ™ÿπÿ∑ŸäŸÑ ÿßŸÑŸÄ GPU ŸÑÿ™ÿ¨ŸÜÿ® ŸÖÿ¥ÿßŸÉŸÑ CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Hugging Face repository URL
repo_url = "https://huggingface.co/Ahmad1020/sarcasm-spam-detector/resolve/main/"

# Define filenames
model_filename = "best_spam_model.keras"
tokenizer_filename = "tokenizer.pkl"
max_length = 25  # Ÿäÿ¨ÿ® ÿ£ŸÜ ŸäŸÉŸàŸÜ ŸÜŸÅÿ≥ max_length ÿßŸÑŸÖÿ≥ÿ™ÿÆÿØŸÖ ÿ£ÿ´ŸÜÿßÿ° ÿßŸÑÿ™ÿØÿ±Ÿäÿ®

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑŸÜŸÖŸàÿ∞ÿ¨ ÿ•ÿ∞ÿß ŸÑŸÖ ŸäŸÉŸÜ ŸÖŸàÿ¨ŸàÿØŸãÿß
if not os.path.exists(model_filename):
    print(f"Downloading {model_filename}...")
    model_response = requests.get(repo_url + model_filename)
    if model_response.status_code == 200:
        with open(model_filename, "wb") as f:
            f.write(model_response.content)
    else:
        raise Exception(f"Failed to download {model_filename}. Status code: {model_response.status_code}")

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑÿ™ŸàŸÉŸÜŸäÿ≤ÿ± ÿ•ÿ∞ÿß ŸÑŸÖ ŸäŸÉŸÜ ŸÖŸàÿ¨ŸàÿØŸãÿß
if not os.path.exists(tokenizer_filename):
    print(f"Downloading {tokenizer_filename}...")
    tokenizer_response = requests.get(repo_url + tokenizer_filename)
    if tokenizer_response.status_code == 200:
        with open(tokenizer_filename, "wb") as f:
            f.write(tokenizer_response.content)
    else:
        raise Exception(f"Failed to download {tokenizer_filename}. Status code: {tokenizer_response.status_code}")

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑŸÜŸÖŸàÿ∞ÿ¨
try:
    model = tf.keras.models.load_model(model_filename)
    print("‚úÖ Model loaded successfully!")
except Exception as e:
    raise Exception(f"Error loading model: {e}")

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑÿ™ŸàŸÉŸÜŸäÿ≤ÿ±
try:
    with open(tokenizer_filename, "rb") as f:
        tokenizer = pickle.load(f)
    print("‚úÖ Tokenizer loaded successfully!")
except Exception as e:
    raise Exception(f"Error loading tokenizer: {e}")

# ÿØÿßŸÑÿ© ÿ™ŸàŸÇÿπ ÿßŸÑÿ≥ÿÆÿ±Ÿäÿ©
def predict_sarcasm(sentence):
    if not isinstance(sentence, str) or not sentence.strip():
        return False  # ÿßŸÑÿ™ÿπÿßŸÖŸÑ ŸÖÿπ ÿßŸÑÿ•ÿØÿÆÿßŸÑ ÿ∫Ÿäÿ± ÿßŸÑÿµÿßŸÑÿ≠

    test_sequences = tokenizer.texts_to_sequences([sentence])
    test_review_pad = pad_sequences(test_sequences, maxlen=max_length, padding='post')

    pred = model.predict(test_review_pad)[0][0] * 100
    return pred >= 50  # ÿ•ÿ±ÿ¨ÿßÿπ True ÿ•ÿ∞ÿß ŸÉÿßŸÜ ÿßÿ≠ÿ™ŸÖÿßŸÑ¬†ÿßŸÑÿ≥ÿÆÿ±Ÿäÿ©¬†‚â•¬†50%

predict_sarcasm("what a fucken match!")

‚úÖ Model loaded successfully!
‚úÖ Tokenizer loaded successfully!


True

In [None]:
import os
import pickle
import requests
import tensorflow as tf
from fastapi import FastAPI
from pydantic import BaseModel
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ÿ™ÿπÿ∑ŸäŸÑ ÿßŸÑŸÄ GPU ŸÑÿ™ÿ¨ŸÜÿ® ŸÖÿ¥ÿßŸÉŸÑ CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# ÿ±ÿßÿ®ÿ∑ Hugging Face
REPO_URL = "https://huggingface.co/Ahmad1020/sarcasm-spam-detector/resolve/main/"

# ÿ≠ŸÅÿ∏ ÿßŸÑŸÖŸÑŸÅÿßÿ™ ŸÅŸä `/tmp/`
MODEL_PATH = "/best_spam_model.keras"
TOKENIZER_PATH = "/tokenizer.pkl"
MAX_LENGTH = 25  # Ÿäÿ¨ÿ® ÿ£ŸÜ ŸäŸÉŸàŸÜ ŸÜŸÅÿ≥ max_length ÿßŸÑŸÖÿ≥ÿ™ÿÆÿØŸÖ ÿ£ÿ´ŸÜÿßÿ° ÿßŸÑÿ™ÿØÿ±Ÿäÿ®

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑŸÖŸÑŸÅÿßÿ™ ÿ•ŸÑŸâ `/tmp/`
def download_file(filename, save_path):
    if not os.path.exists(save_path):
        print(f"Downloading {filename}...")
        response = requests.get(REPO_URL + filename)
        if response.status_code == 200:
            with open(save_path, "wb") as f:
                f.write(response.content)
        else:
            raise Exception(f"Failed to download {filename}. Status code: {response.status_code}")

download_file("best_spam_model.keras", MODEL_PATH)
download_file("tokenizer.pkl", TOKENIZER_PATH)

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑŸÜŸÖŸàÿ∞ÿ¨
model = tf.keras.models.load_model(MODEL_PATH)

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑÿ™ŸàŸÉŸÜŸäÿ≤ÿ±
with open(TOKENIZER_PATH, "rb") as f:
    tokenizer = pickle.load(f)

# ÿ•ŸÜÿ¥ÿßÿ° FastAPI
app = FastAPI()

# ŸÜŸÖŸàÿ∞ÿ¨ ÿßŸÑÿ•ÿØÿÆÿßŸÑ
class TextInput(BaseModel):
    text: str

# ÿØÿßŸÑÿ© ÿßŸÑÿ™ŸÜÿ®ÿ§
def predict_sarcasm(sentence):
    test_sequences = tokenizer.texts_to_sequences([sentence])
    test_review_pad = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post')
    pred = model.predict(test_review_pad)[0][0] * 100
    return {"text": sentence, "sarcasm": pred >= 50, "confidence": round(pred, 2)}

# ŸÜŸÇÿ∑ÿ© ÿßŸÑŸÜŸáÿßŸäÿ© API
@app.post("/predict")
def predict(data: TextInput):
    return predict_sarcasm(data.text)
predict_sarcasm("what a fucken match!")

In [65]:
import os
import pickle
import tensorflow as tf
from fastapi import FastAPI
from pydantic import BaseModel
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ÿ™ÿπÿ∑ŸäŸÑ ÿßŸÑŸÄ GPU ŸÑÿ™ÿ¨ŸÜÿ® ŸÖÿ¥ÿßŸÉŸÑ CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# ŸÖÿ≥ÿßÿ±ÿßÿ™ ÿßŸÑŸÖŸÑŸÅÿßÿ™ ÿßŸÑŸÖÿÆÿ≤ŸÜÿ© ŸÖÿ≠ŸÑŸäŸãÿß
MODEL_PATH = "best_spam_model.keras"
TOKENIZER_PATH = "tokenizer.pkl"
MAX_LENGTH = 25  # Ÿäÿ¨ÿ® ÿ£ŸÜ ŸäŸÉŸàŸÜ ŸÜŸÅÿ≥ max_length ÿßŸÑŸÖÿ≥ÿ™ÿÆÿØŸÖ ÿ£ÿ´ŸÜÿßÿ° ÿßŸÑÿ™ÿØÿ±Ÿäÿ®

# ÿßŸÑÿ™ÿ≠ŸÇŸÇ ŸÖŸÖÿß ÿ•ÿ∞ÿß ŸÉÿßŸÜ ÿßŸÑŸÜŸÖŸàÿ∞ÿ¨ ŸàÿßŸÑÿ™ŸàŸÉŸÜŸäÿ≤ÿ± ŸÖŸàÿ¨ŸàÿØŸäŸÜ
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")
if not os.path.exists(TOKENIZER_PATH):
    raise FileNotFoundError(f"Tokenizer file not found: {TOKENIZER_PATH}")

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑŸÜŸÖŸàÿ∞ÿ¨
print("‚úÖ Loading model...")
model = tf.keras.models.load_model(MODEL_PATH)

# ÿ™ÿ≠ŸÖŸäŸÑ ÿßŸÑÿ™ŸàŸÉŸÜŸäÿ≤ÿ±
print("‚úÖ Loading tokenizer...")
with open(TOKENIZER_PATH, "rb") as f:
    tokenizer = pickle.load(f)

# ÿ•ŸÜÿ¥ÿßÿ° FastAPI
app = FastAPI()

# ŸÜŸÖŸàÿ∞ÿ¨ ÿßŸÑÿ•ÿØÿÆÿßŸÑ
class TextInput(BaseModel):
    text: str

# ÿØÿßŸÑÿ© ÿßŸÑÿ™ŸÜÿ®ÿ§
def predict_sarcasm(sentence):
    test_sequences = tokenizer.texts_to_sequences([sentence])
    test_review_pad = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post')
    pred = model.predict(test_review_pad)[0][0] * 100
    return {"text": sentence, "sarcasm": pred >= 50, "confidence": round(pred, 2)}

# ŸÜŸÇÿ∑ÿ© ÿßŸÑŸÜŸáÿßŸäÿ© API
@app.post("/predict")
def predict(data: TextInput):
    return predict_sarcasm(data.text)


‚úÖ Loading model...
‚úÖ Loading tokenizer...


In [None]:
from huggingface_hub import hf_hub_download
from keras.models import load_model
import pickle

# Download the model
model_path = hf_hub_download(repo_id="your-username/sarcasm-spam-detector", filename="best_spam_model.keras")
model = load_model(model_path)

# Download the tokenizer
tokenizer_path = hf_hub_download(repo_id="your-username/sarcasm-spam-detector", filename="tokenizer.pkl")
with open(tokenizer_path, "rb") as handle:
    tokenizer_obj = pickle.load(handle)

print("Model and tokenizer loaded successfully!")


In [None]:
if not os.path.exists("best_spam_model.keras"):
     model = keras.models.load_model("best_spam_model.h5", compile=False)
# else:
#     model = keras.models.load_model("best_spam_model_updated.keras",¬†compile=False)

In [30]:
predict_sarcasm("what a fucken match!")



'Spam!'

In [22]:
predict_sarcasm("ŸÉÿßŸÜ ŸÖÿßÿ™ÿ¥ ÿ≥ŸáŸÑ Ÿà ÿÆÿ≥ÿ±ÿ™Ÿá ÿ®ÿ∫ÿ®ÿßÿ¶ŸÉ")



'Not Spam.'

In [23]:
predict_sarcasm("you looks a professional player")



'Not Spam.'

In [24]:
predict_sarcasm("well done bro")



'Spam!'

In [25]:
predict_sarcasm("ÿ¨ÿßŸÖÿØ ÿßŸàŸä ŸäÿßÿÆŸàŸäÿß Ÿáÿ≥ÿ™ŸÜŸâ ŸÖŸÜŸÉ ÿßŸÑÿßŸÅÿ∂ŸÑ")



'Not Spam.'

In [26]:
predict_sarcasm("ÿπÿ±ŸÅŸÜÿß ÿßŸÜŸÉ ÿ¨ÿßŸÖÿØ ŸäÿßÿπŸÖ")



'Not Spam.'

In [27]:
predict_sarcasm("what an idiot coach, he should give you time on the field")



'Spam!'

In [31]:
predict_sarcasm("ŸÉÿßŸÜ ŸÑÿßÿ≤ŸÖ ÿßŸÑŸÖÿØÿ±ÿ® ŸäÿØŸäŸÑŸÉ ŸàŸÇÿ™ŸÉ ŸÅŸä ÿßŸÑŸÖŸÑÿπÿ®")

'Not Spam.'

In [48]:
predict_sarcasm("ŸÉÿ≥ŸÖ ÿßŸÑÿßŸáŸÑŸä")

'Spam!'

In [49]:
predict_sarcasm("fuck elahly")

'Spam!'

In [50]:
predict_sarcasm("ÿßŸÑÿ≤ŸÖÿßŸÑŸÉ ÿπŸÖŸáŸÖ")

'Not Spam.'

In [51]:
predict_sarcasm("Elzamalek king of the play")

'Not Spam.'

In [1]:
import importlib
import pkg_resources

# ŸÖŸÉÿ™ÿ®ÿßÿ™ ÿßŸÑŸÖÿ¥ÿ±Ÿàÿπ
libraries = [
    "fastapi",
    "huggingface_hub",
    "keras",
    "matplotlib",
    "nltk",
    "numpy",
    "pandas",
    "pydantic",
    "requests",
    "scikit-learn",
    "tensorflow",
    "uvicorn",
    "os",
    "pickle",
    "re",
    "shutil",
    "string",
]

print("üîç Checking library versions...\n")
requirements = []

for lib in libraries:
    try:
        version = pkg_resources.get_distribution(lib).version
        print(f"{lib} == {version}")
        requirements.append(f"{lib}=={version}")
    except pkg_resources.DistributionNotFound:
        try:
            importlib.import_module(lib)
            print(f"{lib} (built-in or no version info)")
        except ImportError:
            print(f"{lib} ‚ùå not installed")

# ÿ•ŸÜÿ¥ÿßÿ° ŸÖŸÑŸÅ requirements.txt
with open("requirements.txt", "w", encoding="utf-8") as f:
    for line in requirements:
        f.write(line + "\n")

print("\n‚úÖ Done! File 'requirements.txt' created successfully.")


  import pkg_resources


üîç Checking library versions...

fastapi == 0.116.1
huggingface_hub == 0.29.2
keras == 3.10.0
matplotlib == 3.9.4
nltk == 3.9.1
numpy == 1.24.3
pandas == 2.2.3
pydantic == 2.11.7
requests == 2.32.4
scikit-learn == 1.6.1
tensorflow == 2.19.0
uvicorn == 0.34.2
os (built-in or no version info)
pickle (built-in or no version info)
re (built-in or no version info)
shutil (built-in or no version info)
string (built-in or no version info)

‚úÖ Done! File 'requirements.txt' created successfully.
