Importing Essential Libraries

In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns

Using Graphics Card 

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


Importing Dataset

In [3]:

dataset = pd.read_csv('romanized_nepali_dataset.csv')

# Display first few rows
dataset.head()


Unnamed: 0,rating,label,Romanized Review
0,5,1.0,product Ramro xw
1,4,0.0,Yo earbuds le noise cancellation vaneko jastai...
2,5,1.0,steel jhos ko jasto kam nagarni rahicha. 1/2 t...
3,4,1.0,"Nice quality, 6 dekhi 60 kg samma raixa babal ..."
4,4,0.0,Size perfect fit bhayo ani comfort pani top cl...


Checking and Removing The Null values

In [4]:
# Check for null values
print(dataset.isnull().sum())

# Drop or fill missing data as necessary
dataset.dropna(inplace = True)

rating              3
label               4
Romanized Review    5
dtype: int64


In [5]:
#Counting how many reviews are real and how many are fake
dataset.value_counts('label')

label
1.0    2535
0.0    2408
Name: count, dtype: int64

In [6]:
dataset.value_counts('rating')

rating
5    2729
4     992
3     510
1     386
2     326
Name: count, dtype: int64

Converting to lowercase

In [7]:
dataset['Romanized Review'] = dataset['Romanized Review'].str.lower()

In [8]:
dataset.tail()

Unnamed: 0,rating,label,Romanized Review
4943,3,0.0,"gaming ko lagi kinya, tara performance moderat..."
4944,3,1.0,thorai damage raixa aru ta sab thikxa
4945,2,0.0,product use garepaxi dherai asha gare jasto re...
4946,4,0.0,"tel le kapaal chamkilo vayo 💇‍♀️❤️, tara chip-..."
4947,5,0.0,the size is so big for me and i trying to but🥲


Converting emojis to their text equivalent

In [9]:
import emoji

def convert_emojis_to_text(text):
    return emoji.demojize(text, delimiters=(" ", " ")) 

dataset['text_'] = dataset['Romanized Review'].apply(convert_emojis_to_text)

In [10]:
dataset.tail()

Unnamed: 0,rating,label,Romanized Review,text_
4943,3,0.0,"gaming ko lagi kinya, tara performance moderat...","gaming ko lagi kinya, tara performance moderat..."
4944,3,1.0,thorai damage raixa aru ta sab thikxa,thorai damage raixa aru ta sab thikxa
4945,2,0.0,product use garepaxi dherai asha gare jasto re...,product use garepaxi dherai asha gare jasto re...
4946,4,0.0,"tel le kapaal chamkilo vayo 💇‍♀️❤️, tara chip-...",tel le kapaal chamkilo vayo woman_getting_hai...
4947,5,0.0,the size is so big for me and i trying to but🥲,the size is so big for me and i trying to but ...


Normalize the text

In [11]:
import re

def normalize_text(text):
    
    text = re.sub(r'\bu\b', 'timi', text)  
    text = re.sub(r'\bm\b', 'ma', text)
    text = re.sub(r'\beka\b', 'ek', text)

    # Remove extra spaces and normalize spacing
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [12]:
dataset['text_'] = dataset['text_'].apply(lambda X : normalize_text(X))

Removes noise from the dataset and only keep alphabets and spaces.

In [13]:
# Keep only alphabets and spaces

def remove_noise(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'nga', '', text)
     
    return text

dataset['text_'] = dataset['text_'].apply(lambda X: remove_noise(X))

In [14]:
dataset.tail()

Unnamed: 0,rating,label,Romanized Review,text_
4943,3,0.0,"gaming ko lagi kinya, tara performance moderat...",gaming ko lagi kinya tara performance moderate...
4944,3,1.0,thorai damage raixa aru ta sab thikxa,thorai damage raixa aru ta sab thikxa
4945,2,0.0,product use garepaxi dherai asha gare jasto re...,product use garepaxi dherai asha gare jasto re...
4946,4,0.0,"tel le kapaal chamkilo vayo 💇‍♀️❤️, tara chip-...",tel le kapaal chamkilo vayo womangettinghaircu...
4947,5,0.0,the size is so big for me and i trying to but🥲,the size is so big for me and i trying to but ...


In [15]:
dataset.drop('Romanized Review', axis =1)

Unnamed: 0,rating,label,text_
0,5,1.0,product ramro xw
1,4,0.0,yo earbuds le noise cancellation vaneko jastai...
2,5,1.0,steel jhos ko jasto kam nagarni rahicha time ...
3,4,1.0,nice quality dekhi kg samma raixa babal xa l...
4,4,0.0,size perfect fit bhayo ani comfort pani top cl...
...,...,...,...
4943,3,0.0,gaming ko lagi kinya tara performance moderate...
4944,3,1.0,thorai damage raixa aru ta sab thikxa
4945,2,0.0,product use garepaxi dherai asha gare jasto re...
4946,4,0.0,tel le kapaal chamkilo vayo womangettinghaircu...


Handle Slangs and common typing styles

In [16]:
# Step 3: Handling Slang (Manual slang replacements)
def handle_slang(text):
    slang_dict = {'thikkk': 'thik', 
        'ghamta': 'samta', 
        'farkera': 'pachhi',
        'xa': 'cha',
        'hoina': 'haina',
        'k': 'ke',
        'khoi': 'kahaan',
        'kati': 'kati',
        'k garne': 'ke garne',
        'thaxa': 'thaha',
        'thaxaina': 'thaha chaina',
        'kya': 'kya ho',
        'la': 'la',
        'hait': 'hait',
        'dherai': 'dherai',
        'ali': 'ali',
        'kasto': 'kasto',
        'k cha': 'ke cha',
        'kura': 'kura',
        'khate': 'khate',
        'dai': 'dai',
        'didi': 'didi',
        'bhai': 'bhai',
        'bahini': 'bahini',
        'kukurrr': 'kukur',
        'jastooo': 'jasto',
        'testooo': 'testo',
        'thank uh':'thank you',
        'yesto': 'yesto',
        'kinaaaa': 'kina',
        'hunchaaaa': 'huncha',
        'hunnaa': 'hunna',
        'pugyooo': 'pugyo',
        'pugenaaa': 'pugena',
        'khaana': 'khaana',
        'khayo': 'khayo',
        'khana': 'khana',
        'bas': 'bas',
        'chhito': 'chhito',
        'bholi': 'bholi',
        'aaja': 'aaja',
        'parla': 'parla',
        'pardaina': 'pardaina',
        'thik': 'thik',
        'thikai': 'thikai',
        'ramro': 'ramro',
        'naramro': 'naramro',
        'khatra': 'khatra',
        'halka': 'halka',
        'maile': 'maile',
        'timi': 'timi',
        'huss': 'huss',
        'guff': 'guff',
        'jhyau': 'jhyau',
        'khuro': 'khuro',
        'thulo': 'thulo',
        'sano': 'sano',
        'khaire': 'khaire',
        'jholeyy': 'jholey',
        'fuchhey': 'fuchhey',
        'khatey': 'khatey',
        'boka': 'boka',
        'bokey': 'bokey',
        'bokeycha': 'bokeycha',
        'bokeko': 'bokeko',
        'bokera': 'bokera',
        'bokne': 'bokne',
        'boknu': 'boknu',
        'boknus': 'boknus',
        'boknuparne': 'boknuparne',
        'boknuparyo': 'boknuparyo',
        'boknuparcha': 'boknuparcha',
        'xa' : 'cha',
        'khai': 'malai tha xaina',
        'gr8': 'great',
        'bro': 'bhai',
        'thik xa': 'thik cha',
        'k xa': 'k cha',
        'ke cha' :'k cha',
        'momo': 'dumpling',
    }
    for slang, standard in slang_dict.items():
        text = text.replace(slang, standard)
    return text

dataset['text_'] = dataset['text_'].apply(lambda X : handle_slang(X))


Remove Stopwords using custom dictionary

In [17]:
# Step 4: Stopword Removal (Example list of Nepali stopwords)
stopwords = ['ra', 'ko', 'le', 'lai', 'bata', 'xa', 'yo', 'tiyo', 'mero', 'maile', 'ma', 'lagi', 'mana', 'malai', 'ho', 'tara', 'pani', 'chan', 'garna', 'hunxa', 'of', 'a', 'an', 'the', 'is', 'and', 'but' ]
def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text


dataset['text_'] = dataset['text_'].apply(lambda X : remove_stopwords(X))

In [18]:
dataset.tail()

Unnamed: 0,rating,label,Romanized Review,text_
4943,3,0.0,"gaming ko lagi kinya, tara performance moderat...",gaming keo keinya performance moderate jasto l...
4944,3,1.0,thorai damage raixa aru ta sab thikxa,thorai damage raicha aru ta sab thikecha
4945,2,0.0,product use garepaxi dherai asha gare jasto re...,product use garepaxi dherai asha gare jasto re...
4946,4,0.0,"tel le kapaal chamkilo vayo 💇‍♀️❤️, tara chip-...",tel keapaal chamkeilo vayo womangettinghaircut...
4947,5,0.0,the size is so big for me and i trying to but🥲,size so big for me i trying to smilingfacewith...


Tokenize the text 

In [19]:
def tokenize(text):
    return text.split()

dataset['text_'] = dataset['text_'].apply(lambda X : tokenize(X))

dataset.head()

Unnamed: 0,rating,label,Romanized Review,text_
0,5,1.0,product ramro xw,"[product, ramro, xw]"
1,4,0.0,yo earbuds le noise cancellation vaneko jastai...,"[earbuds, noise, cancellation, vanekeo, jastai..."
2,5,1.0,steel jhos ko jasto kam nagarni rahicha. 1/2 t...,"[steel, jhos, keo, jasto, keam, nagarni, rahic..."
3,4,1.0,"nice quality, 6 dekhi 60 kg samma raixa babal ...","[nice, quality, dekehi, keg, samma, raicha, ba..."
4,4,0.0,size perfect fit bhayo ani comfort pani top cl...,"[size, perfect, fit, bhayo, ani, comfort, top,..."


Create a custom lemmatization dictionary and perform lemmatization

In [20]:
import re

# Step 1: Define the Lemmatizer Dictionary for common words and their lemma
lemmatizer_dict = {
    'gardaichha': 'garnu',
    'garchha': 'garnu',
    'garera': 'garnu',
    'garne': 'garnu',
    'bhayeko': 'bhayeko',
    'jane': 'jan',
    'huncha': 'hunu',
    'hune': 'hunu',
    'pugne': 'pugnu',
    'garne': 'garnu',
    'chha': 'cha',
    'aune': 'aunu',
    'jane': 'jan',
    'dekhe': 'dekhnus',
    'garaune': 'garnu',
    'jaane': 'jan'
}

# Step 2: Regular Expression-based Lemmatization
def lemmatize(text):
    # Ensure the text is a string
    if isinstance(text, list):
        text = ' '.join(text)  # If the input is a list, join it into a single string

    # Step 2.1: Check against the lemmatizer dictionary
    words = text.split()
    lemmatized_words = []

    for word in words:
        # If the word is in the lemmatizer dictionary, replace it with the lemma
        if word in lemmatizer_dict:
            lemmatized_words.append(lemmatizer_dict[word])
        else:
            lemmatized_words.append(word)  # If no lemma exists, keep the original word

    # Step 2.2: Return the lemmatized version
    return ' '.join(lemmatized_words)

# Now apply lemmatize to your text column
dataset['processed_review'] = dataset['text_'].apply(lambda X: lemmatize(X))

In [21]:
dataset.head()

Unnamed: 0,rating,label,Romanized Review,text_,processed_review
0,5,1.0,product ramro xw,"[product, ramro, xw]",product ramro xw
1,4,0.0,yo earbuds le noise cancellation vaneko jastai...,"[earbuds, noise, cancellation, vanekeo, jastai...",earbuds noise cancellation vanekeo jastai garc...
2,5,1.0,steel jhos ko jasto kam nagarni rahicha. 1/2 t...,"[steel, jhos, keo, jasto, keam, nagarni, rahic...",steel jhos keo jasto keam nagarni rahicha time...
3,4,1.0,"nice quality, 6 dekhi 60 kg samma raixa babal ...","[nice, quality, dekehi, keg, samma, raicha, ba...",nice quality dekehi keg samma raicha babal cha...
4,4,0.0,size perfect fit bhayo ani comfort pani top cl...,"[size, perfect, fit, bhayo, ani, comfort, top,...",size perfect fit bhayo ani comfort top class w...


Split dataset text and rating into training and testing sets

In [22]:
from sklearn.model_selection import train_test_split

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(dataset["processed_review"], dataset["label"], test_size=0.2, random_state=1, stratify=dataset["label"])

TFIDF Vectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [24]:
X_train_tfidf

<3954x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 111333 stored elements in Compressed Sparse Row format>

Word2Vec 

In [25]:
# Word2Vec Embedding

from gensim.models import Word2Vec

sentences = [text.split() for text in X_train]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

def average_word2vec(text):
    words = text.split()
    vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if len(vectors) == 0:
        return np.zeros(100)  # Zero vector if no words match
    return np.mean(vectors, axis=0)

X_train_w2v = np.array([average_word2vec(text) for text in X_train])
X_test_w2v = np.array([average_word2vec(text) for text in X_test])

In [26]:
X_train_w2v

array([[-0.11514264,  0.34175622,  0.02273031, ..., -0.41152459,
         0.09297229,  0.06410618],
       [-0.32309684,  0.22271901, -0.14286081, ..., -0.47102675,
         0.21550563,  0.03968368],
       [-0.11376347,  0.24792767,  0.1220002 , ..., -0.20411305,
         0.07368836,  0.14238346],
       ...,
       [-0.11051338,  0.24680045,  0.18422087, ..., -0.20606014,
         0.091702  ,  0.1881862 ],
       [-0.23812899,  0.24134588, -0.10892683, ..., -0.44460633,
         0.19263744, -0.0078159 ],
       [-0.29174533,  0.41067517,  0.161809  , ..., -0.23117507,
         0.05549596,  0.23291051]])

Load and Use Pretrained Fasttext Model

In [27]:
#Load Pretrained FastText Model

import fasttext
import fasttext.util

fasttext.util.download_model('ne', if_exists='ignore')  # Download Nepali FastText model
ft_model = fasttext.load_model('cc.ne.300.bin')  # Load model

def average_fasttext(text):
    words = text.split()
    vectors = [ft_model.get_word_vector(word) for word in words if word in ft_model.words]
    if len(vectors) == 0:
        return np.zeros(300)  # Zero vector if no words match
    return np.mean(vectors, axis=0)

X_train_ft = np.array([average_fasttext(text) for text in X_train])
X_test_ft = np.array([average_fasttext(text) for text in X_test])

In [28]:
X_train_ft

array([[-0.01170602,  0.02333124, -0.00843021, ..., -0.00347104,
         0.00573593, -0.0063713 ],
       [-0.00385318,  0.02083142, -0.01071056, ..., -0.00464389,
        -0.00233964, -0.00465374],
       [-0.0170165 ,  0.01208591,  0.00191716, ..., -0.00072931,
        -0.00562109, -0.00662479],
       ...,
       [-0.00275244, -0.02989968, -0.01880171, ..., -0.01733332,
        -0.00747007,  0.00697602],
       [-0.00684421,  0.02396445, -0.01229795, ...,  0.00201699,
         0.00830247, -0.007886  ],
       [-0.00350618,  0.01967069, -0.01896983, ...,  0.0037038 ,
        -0.00218768,  0.00381984]])

Use hstack to combine TFIDF , Word2Vec and Fasttext

In [29]:
from scipy.sparse import hstack

# Combine TF-IDF, Word2Vec, and FastText Features
X_train_combined = hstack([X_train_tfidf, X_train_w2v, X_train_ft])
X_test_combined = hstack([X_test_tfidf, X_test_w2v, X_test_ft])

In [30]:
X_train_combined

<3954x5400 sparse matrix of type '<class 'numpy.float64'>'
	with 1675633 stored elements in COOrdinate format>

Train a SVM Model

In [31]:
# Train SVM Model

from sklearn.svm import SVC

svm_model = SVC(kernel="linear", class_weight="balanced", probability=True)
svm_model.fit(X_train_combined, y_train)

Evaluate the SVM Model

In [None]:
# Evaluate Model

accuracy = svm_model.score(X_test_combined, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
cm = confusion_matrix(X_test_combined, y_test)

# Get the unique class labels
class_names = sorted(list(set(X_test_combined)))

# Create a heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix Heatmap for Logistic Regression')
plt.show()

Test Accuracy: 0.8150


Train Using Random Forrest Classifier

In [33]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train_combined, y_train)

In [34]:
accuracy = rf_model.score(X_test_combined, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.7149


Train Using Logistic Regression Model

In [35]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train_combined, y_train)

In [36]:
accuracy = lr_model.score(X_test_combined, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8038


Saving the pretrained model using Joblib

In [37]:
import joblib

# Save other models as before
joblib.dump(svm_model, "roman_svm_model.pkl")
joblib.dump(tfidf_vectorizer, "roman_tfidf_vectorizer.pkl")
joblib.dump(word2vec_model, "roman_word2vec_model.pkl")
joblib.dump(rf_model, "roman_rf_model.pkl")
joblib.dump(lr_model, "roman_lr_model.pkl")

# Save FastText Model Separately
ft_model.save_model("roman_fasttext_model.bin")

print("All models saved successfully!")

All models saved successfully!
