## Importing Libraries

In [2]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from hazm import Normalizer, word_tokenize, stopwords_list
from nest_asyncio import apply
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sqlalchemy import create_engine, text

## Getting Data from Database 

In [4]:
# user = 'user'
# password = 'password'
# host = 'host'
# port = 'port'
# service_name = 'service_name'
# 
# dsn = f'oracle+oracledb://{user}:{password}@{host}:{port}/?service_name={service_name}'
# engine = create_engine(dsn)
# 
# sql_transaction_image = f"""
#     SELECT ti.ID ,ti.OCR_TEXT ,t.TERMINAL_ID1
#     FROM "TRANSACTION" t 
#     JOIN TRANSACTION_IMAGE ti ON t.ID = ti.TRANSACTION_ID 
#     JOIN TERMINAL t2 ON t2.ID = t.TERMINAL_ID1 
#     JOIN MERCHANT m ON m.ID = t2.MERCHANT_ID 
#     WHERE   ti.TYPE_ID IN (1,2) AND ti.OCR_TEXT IS NOT NULL AND m.STATUS != 'V' 
#     ORDER BY ti.ID ASC
# """
# with engine.connect() as conn:
#     dataset_main = pd.read_sql(sql=sql_transaction_image, con=conn)


NameError: name 'create_engine' is not defined

In [3]:
# Location of the exported CSV. The default is the original author's local
# path; set the TRANSACTION_CSV_PATH environment variable to run the notebook
# on another machine without editing this cell.
CSV_PATH = os.environ.get(
    "TRANSACTION_CSV_PATH",
    r"C:\Users\s.heydarian\Desktop\dashboard data\TRANSACTION_DESCRIPTION_202507071458.csv",
)
df = pd.read_csv(CSV_PATH)

## Data cleaning

In [None]:
# Remove fully duplicated rows. Rebinding instead of inplace=True keeps the
# cell chainable and leaves the previously loaded frame object untouched.
df = df.drop_duplicates()

In [None]:
def remove_all_numbers(text: str) -> str:
    r"""Return ``text`` with every decimal digit removed.

    Uses ``\d``, which for ``str`` patterns matches any Unicode decimal
    digit. That covers ASCII (0-9) and Persian (۰-۹) digits as before, and
    additionally Arabic-Indic digits (٠-٩), which the previous explicit
    ranges left in the text.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without digit characters; all other characters
        (including whitespace) are left as they are.
    """
    return re.sub(r'\d', '', text)

In [None]:
# Drop rows whose name consists solely of digits.
# The column key is 'name', matching the key used by the cleaning and
# vectorising cells that follow. .copy() makes the filtered result an
# independent frame, so the later `df['name'] = ...` assignments do not
# operate on a slice of the original (SettingWithCopyWarning).
df = df[~df['name'].str.isdigit()].copy()

In [None]:
def remove_all_english(text: str) -> str:
    """Return ``text`` with all ASCII Latin letters (a-z, A-Z) removed.

    The upper-case range is ``A-Z``. The earlier ``A-z`` range also covered
    the punctuation between ``Z`` and ``a`` in ASCII (``[``, ``\\``, ``]``,
    ``^``, ``_`` and the backtick), which is not English text.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ASCII letters; everything else is kept.
    """
    return re.sub(r'[a-zA-Z]', '', text)

In [None]:
def remove_emojis_and_symbols(text):
    """Replace each run of emoji, symbol or non-word characters with one space.

    A single character class is assembled from explicit code-point ranges
    plus ``\\W``; any maximal run of matching characters in ``text`` is
    substituted by a single ``' '``. Word characters outside the listed
    ranges (letters, digits, underscore) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing runs also become a space, so
        the result is not stripped.
    """
    character_class = "".join((
        "[",
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # miscellaneous symbols and pictographs
        u"\U0001F680-\U0001F6FF",  # transport and map symbols
        u"\U0001F700-\U0001F77F",  # alchemical symbols
        u"\U0001F780-\U0001F7FF",  # geometric shapes extended
        u"\U0001F800-\U0001F8FF",  # supplemental arrows-C
        u"\U0001F900-\U0001F9FF",  # supplemental symbols and pictographs
        u"\U0001FA00-\U0001FA6F",  # chess symbols
        u"\U0001FA70-\U0001FAFF",  # symbols and pictographs extended-A
        u"\U00002702-\U000027B0",  # dingbats
        # One very wide range, U+24C2 through U+1F251: everything in between
        # is matched too, including letters such as CJK ideographs.
        u"\U000024C2-\U0001F251",
        r"\W",  # any non-word character (punctuation, whitespace, symbols)
        "]+",
    ))
    matcher = re.compile(character_class, flags=re.UNICODE)
    return matcher.sub(' ', text)

In [None]:
# Pass 1: turn emoji / symbol / non-word runs in each name into a single space.
df['name'] = df['name'].apply(remove_emojis_and_symbols)

In [None]:
# Pass 2: strip Latin letters from each name.
df['name'] = df['name'].apply(remove_all_english)

In [None]:
# Pass 3: strip digits from each name.
df['name'] = df['name'].apply(remove_all_numbers)


In [None]:
# Final tidy-up of the name column: one vectorised .str step per concern.
df['name'] = (
    df['name']
    # drop double quotes, carriage returns and newlines
    .str.replace(r'["\r\n]', '', regex=True)
    # drop doubled commas
    .str.replace(r',,', '', regex=True)
    # squeeze any whitespace run down to one space
    .str.replace(r'\s+', ' ', regex=True)
    # trim both ends
    .str.strip()
)

In [None]:
# The cleaning passes can reduce a name to an empty string; drop those rows.
# The filter targets 'name', the column the cleaning cells above actually
# rewrite. .copy() yields an independent frame so the later
# `df['name'] = ...` assignment does not operate on a slice.
df = df[df['name'].str.strip() != ''].copy()

## Preprocessing

In [None]:
# Persian stopwords: hazm's built-in list extended with terms specific to
# this dataset. Repeated literals were collapsed; the resulting set is the same.
custom_stopwords = {
    "کد",
    "جهت",
    "برداشت",
    "کارت",
    "مبلغ",
    "توسط",
    "شماره",
    "بمبلغ",
    "رهگيري",
    "بانک",
    "مهرايران",
    "ما",
    "جغطائی",
    "الحسنه",
    "مدرن",
    "رهگیری",
    "قرض",
}

all_stopwords = custom_stopwords | set(stopwords_list())

# Shared hazm normaliser, reused for every document.
normalizer = Normalizer()

def preprocess(text):
    """Normalise, tokenise and filter ``text``, returning a space-joined string.

    The text is run through the shared ``normalizer``, split with
    ``word_tokenize``, and tokens are kept only if they are longer than one
    character and not in ``all_stopwords``. A string (not a token list) is
    returned because the vectoriser downstream consumes raw documents.
    """
    normalized = normalizer.normalize(text)
    kept_tokens = (
        token
        for token in word_tokenize(normalized)
        if len(token) > 1 and token not in all_stopwords
    )
    return ' '.join(kept_tokens)


In [None]:
# Normalise, tokenise and stopword-filter every cleaned name; each entry
# stays a single space-joined string.
df['name'] = df['name'].apply(preprocess)

In [None]:
# Vectorise the preprocessed names into a sparse document-term matrix.
# NOTE(review): the LDA cells below are fitted on these TF-IDF weights;
# scikit-learn's LDA examples feed raw term counts (CountVectorizer) instead.
# Confirm that TF-IDF input is intended.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.75,  # float: upper bound on document frequency, as a proportion
    min_df=10,    # int: lower bound on document frequency, as a document count
)
doc_term_matrix = tfidf_vectorizer.fit_transform(df['name'])

print(f"TF-IDF Matrix: {doc_term_matrix.shape}")


In [None]:
# Fit a 5-topic LDA model and collect the per-document topic weights.
num_topics = 5

lda = LatentDirichletAllocation(random_state=42, n_components=num_topics)
doc_topic_matrix = lda.fit_transform(doc_term_matrix)

# One column per topic, labelled "Topic 1" .. "Topic <num_topics>".
topic_names = [f"Topic {i+1}" for i in range(num_topics)]
doc_topic_df = pd.DataFrame(data=doc_topic_matrix, columns=topic_names)

print(doc_topic_df.head())

In [None]:
# Vocabulary learned by the vectoriser, indexed like the columns of
# lda.components_, and how many top terms to list per topic.
feature_names = tfidf_vectorizer.get_feature_names_out()
num_words = 10

In [None]:
# List the highest-weighted terms of every topic together with their weights.
for topic_idx, topic in enumerate(lda.components_):
    print(f"\n Topic {topic_idx + 1}")
    # argsort is ascending: reverse it, then keep the first num_words indices.
    for i in topic.argsort()[::-1][:num_words]:
        print(f"   {feature_names[i]} ({topic[i]:.3f})")

In [None]:
# ========== Step 7: Perplexity vs. Topic Count ==========
# Fit one LDA model per candidate topic count (2 through 10) and record its
# perplexity on the same matrix it was fitted on.
topic_counts = range(2, 11)
perplexities = []

for k in topic_counts:
    lda_k = LatentDirichletAllocation(n_components=k, random_state=42).fit(doc_term_matrix)
    perplexities.append(lda_k.perplexity(doc_term_matrix))

# ========== Step 8: Plot Perplexity ==========
# Explicit figure/axes interface; draws the same single line chart.
fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(x=topic_counts, y=perplexities, marker='o', ax=ax)
ax.set_title('Perplexity by Topic Count')
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Perplexity')
ax.grid(True)
fig.tight_layout()
plt.show()