In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams
from collections import Counter


# Load the Swardspeak dictionary. The CSV's first column is a saved row index
# with no header (it otherwise appears as a spurious 'Unnamed: 0' data column),
# so read it as the DataFrame index instead.
data = pd.read_csv('Swardspeak.csv', index_col=0)
data

Unnamed: 0,Swardspeak,English_Meaning,Tagalog_Meaning
0,achay,follower/maid,alalay/taga-sunod
1,aketch,me,ako
2,akirachiramira,beautiful woman,maganda babae
3,akis,me,ako
4,alaska,to tease,lokohin/asarin/tuksuhin
...,...,...,...
408,witchibeng,no/not/nope,hindi
409,wititit,no/not/nope,hindi
410,wiz,no/not/nope,hindi
411,yumoyolanda,Rain,Ulan


In [2]:
# Features are the Swardspeak terms; the two targets are their English and
# Tagalog meanings (column names as shown in the loaded table).
X = data['Swardspeak'].astype(str)  # Ensure the text data is in string format
y_english = data['English_Meaning']
y_tagalog = data['Tagalog_Meaning']

# Split the data into training and testing sets. All three series are split in
# one call so each term stays aligned with both of its meanings. A fixed
# random_state makes the split (and therefore the fitted models and any
# predictions below) reproducible across kernel restarts.
X_train, X_test, y_english_train, y_english_test, y_tagalog_train, y_tagalog_test = train_test_split(
    X, y_english, y_tagalog, test_size=0.2, random_state=42)

# Convert text data to TF-IDF representation.
# NOTE(review): the vocabulary is learned from X_train only, so terms that fall
# into the test split are unknown to the vectorizer at prediction time.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Train a Support Vector Machine (SVM) for Tagalog meanings
svm_tagalog = SVC(kernel='linear')
svm_tagalog.fit(X_train_tfidf, y_tagalog_train)

# Train a Support Vector Machine (SVM) for English meanings
svm_english = SVC(kernel='linear')
svm_english.fit(X_train_tfidf, y_english_train)

In [3]:
# Bigram variant of the models: same targets as the unigram cell, but every
# dictionary entry is first rewritten as its space-joined word bigrams.
X = data['Swardspeak'].astype(str)  # Ensure the text data is in string format
y_english = data['English_Meaning']
y_tagalog = data['Tagalog_Meaning']

tokenizer = RegexpTokenizer(r'\w+')


def _entry_to_bigram_text(entry):
    """Return the lower-cased entry's consecutive word pairs as one string.

    Each pair is joined by a space and all pairs are joined by spaces, e.g.
    three words "a b c" become "a b b c".
    """
    words = tokenizer.tokenize(entry.lower())
    return " ".join(" ".join(pair) for pair in ngrams(words, 2))


# Tokenize each entry into bigrams.
# NOTE(review): a one-word entry has no word pairs, so it presumably maps to
# the empty string here -- confirm that training on those rows is intended.
X_bigrams = X.apply(_entry_to_bigram_text)

# Split the data into training and testing sets for bigrams. A fixed
# random_state keeps the split reproducible across kernel restarts.
X_train_bigrams, X_test_bigrams, y_english_train_bigrams, y_english_test_bigrams, y_tagalog_train_bigrams, y_tagalog_test_bigrams = train_test_split(
    X_bigrams, y_english, y_tagalog, test_size=0.2, random_state=42)

# Convert text data to TF-IDF representation for bigrams.
# NOTE(review): with default settings the vectorizer splits the bigram text
# back into individual words; if true two-word features are wanted, consider
# TfidfVectorizer(ngram_range=(2, 2)) on the raw entries -- TODO confirm.
tfidf_vectorizer_bigrams = TfidfVectorizer()
X_train_tfidf_bigrams = tfidf_vectorizer_bigrams.fit_transform(X_train_bigrams)

# Train a Support Vector Machine (SVM) for Tagalog meanings using bigrams
svm_tagalog_bigrams = SVC(kernel='linear')
svm_tagalog_bigrams.fit(X_train_tfidf_bigrams, y_tagalog_train_bigrams)

# Train a Support Vector Machine (SVM) for English meanings using bigrams
svm_english_bigrams = SVC(kernel='linear')
svm_english_bigrams.fit(X_train_tfidf_bigrams, y_english_train_bigrams)

In [4]:
# Example of using the trained models
input_sentence = "Huy, sinetch itong chismakers kasi napagbigyan ni mumshie? Charot, napaka echoserang palaka talaga nito. Tara na nga besh."

# Tokenize the input sentence (lower-cased, punctuation dropped by the regex)
tokenizer = RegexpTokenizer(r'\w+')
input_tokens = tokenizer.tokenize(input_sentence.lower())

# The tokens are lower-cased, so compare them against lower-cased dictionary
# entries; otherwise an entry containing capitals could never match. A set
# also avoids rescanning the whole column for every token.
known_swardspeak = set(X.str.lower())

# Keep track of detected Swardspeak words and their meanings
detected_swardspeak_meanings = []

# Check if each token is in the dataset
for token in input_tokens:
    if token not in known_swardspeak:
        continue
    # NOTE(review): membership is checked against the full dataset, but the
    # vectorizer/SVMs were fitted on the training split only; a term from the
    # test split is out-of-vocabulary here, so its prediction may be
    # meaningless -- confirm whether the models should be fit on all entries.
    input_sentence_tfidf = tfidf_vectorizer.transform([token])
    predicted_tagalog_meaning = svm_tagalog.predict(input_sentence_tfidf)
    predicted_english_meaning = svm_english.predict(input_sentence_tfidf)
    detected_swardspeak_meanings.append((token, predicted_tagalog_meaning[0], predicted_english_meaning[0]))

# One counter tick per detected token, i.e. the number of recorded detections.
swardspeak_counter = len(detected_swardspeak_meanings)

if detected_swardspeak_meanings:
    # Print detected Swardspeak words and their meanings
    for swardspeak, tagalog_meaning, english_meaning in detected_swardspeak_meanings:
        print(f"Input Sentence: {input_sentence}")
        print(f"Detected Swardspeak: {swardspeak}")
        print(f"Tagalog Meaning: {tagalog_meaning}")
        print(f"English Meaning: {english_meaning}")
        print("---")

    # Print the Swardspeak counter once, after all detections
    print(f"Swardspeak Counter: {swardspeak_counter}")
else:
    print("No Swardspeak words found")

# Build the sentence's word bigrams from the tokens already extracted above
# (input_tokens is the lower-cased tokenization of input_sentence).
input_tokens_bigrams = [" ".join(bigram) for bigram in ngrams(input_tokens, 2)]

# Dictionary entries in bigram form; a set avoids rescanning the column for
# every candidate bigram.
known_swardspeak_bigrams = set(X_bigrams)

# Keep track of detected Swardspeak words and their meanings for bigrams
detected_swardspeak_meanings_bigrams = []

# Check if each bigram is in the dataset
for bigram_str in input_tokens_bigrams:
    if bigram_str not in known_swardspeak_bigrams:
        continue
    input_sentence_tfidf_bigrams = tfidf_vectorizer_bigrams.transform([bigram_str])
    predicted_tagalog_meaning_bigrams = svm_tagalog_bigrams.predict(input_sentence_tfidf_bigrams)
    predicted_english_meaning_bigrams = svm_english_bigrams.predict(input_sentence_tfidf_bigrams)
    detected_swardspeak_meanings_bigrams.append((bigram_str, predicted_tagalog_meaning_bigrams[0], predicted_english_meaning_bigrams[0]))

# One counter tick per detected bigram, i.e. the number of recorded detections.
swardspeak_counter_bigrams = len(detected_swardspeak_meanings_bigrams)

if detected_swardspeak_meanings_bigrams:
    # Print detected Swardspeak words and their meanings for bigrams
    for swardspeak, tagalog_meaning, english_meaning in detected_swardspeak_meanings_bigrams:
        print("---")
        print(f"Input Sentence: {input_sentence}")
        print(f"Detected Swardspeak: {swardspeak}")
        print(f"Tagalog Meaning: {tagalog_meaning}")
        print(f"English Meaning: {english_meaning}")
        print("---")

    # Print the total once after the loop (matching the single-word report)
    # instead of repeating it for every detected bigram.
    print(f"Swardspeak Counter (Bigrams): {swardspeak_counter_bigrams}")
else:
    print("No Swardspeak words found (Bigrams)")

Input Sentence: Huy, sinetch itong chismakers kasi napagbigyan ni mumshie? Charot, napaka echoserang palaka talaga nito. Tara na nga besh.
Detected Swardspeak: sinetch
Tagalog Meaning: sino
English Meaning: who
---
Input Sentence: Huy, sinetch itong chismakers kasi napagbigyan ni mumshie? Charot, napaka echoserang palaka talaga nito. Tara na nga besh.
Detected Swardspeak: chismakers
Tagalog Meaning: tsismosa
English Meaning: gossiper
---
Input Sentence: Huy, sinetch itong chismakers kasi napagbigyan ni mumshie? Charot, napaka echoserang palaka talaga nito. Tara na nga besh.
Detected Swardspeak: mumshie
Tagalog Meaning: ina/nanay
English Meaning: mother
---
Input Sentence: Huy, sinetch itong chismakers kasi napagbigyan ni mumshie? Charot, napaka echoserang palaka talaga nito. Tara na nga besh.
Detected Swardspeak: charot
Tagalog Meaning: biro lang
English Meaning: just kidding
---
Input Sentence: Huy, sinetch itong chismakers kasi napagbigyan ni mumshie? Charot, napaka echoserang palaka