In [114]:
import fasttext
import pandas as pd
import numpy as np
from tqdm import tqdm
import pandas as pd
import hazm
import string
from sklearn.model_selection import train_test_split
import csv
import itertools

In [None]:
# Load the training dataset from 'final_preprocessed.csv'
train_dataset = pd.read_csv('final_preprocessed.csv')

In [78]:
# Display dataset information
train_dataset.pop('Unnamed: 0')
train_dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132102 entries, 0 to 132101
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   comment    132102 non-null  object
 1   sentiment  132102 non-null  object
 2   Cleaned    132102 non-null  object
dtypes: object(3)
memory usage: 3.0+ MB


In [79]:
# Display dataset statistics
train_dataset.describe()


Unnamed: 0,comment,sentiment,Cleaned
count,132102,132102,132102.0
unique,132099,3,131608.0
top,نسبت به قیمتش خوبه,negative,
freq,2,64631,316.0


In [81]:
# Check the number of samples per sentiment category
print(list(train_dataset['sentiment'] == 'negative').count(True))
print(list(train_dataset['sentiment'] == 'positive').count(True))
print(list(train_dataset['sentiment'] == 'neutral').count(True))

64631
63918
3553


In [82]:
# Check for null values in the 'Cleaned' column
print(train_dataset['Cleaned'].isnull().sum())
print(train_dataset['Cleaned'].isna().sum())

0
0


In [83]:
# Get unique sentiment categories
categories = train_dataset['sentiment'].unique().tolist()
categories

['negative', 'positive', 'neutral']

In [84]:
# Create a new column 'sentiment_fasttext' and initialize it to None
train_dataset['sentiment_fasttext'] = None


In [None]:
# # preprocessing part
# # Load data
# data = pd.read_csv('first_file.csv')

# # List of Persian punctuations
# punctuations = string.punctuation + "٬" + "،"

# # Create a translator object to remove punctuations
# translator = str.maketrans('', '', punctuations)

# # List of Persian stop words
# stopwords = hazm.stopwords_list()

# # Create a lemmatizer object
# lem = hazm.Lemmatizer()

# # Create a normalizer object
# norm = hazm.Normalizer()

# # Create an empty DataFrame for preprocessing
# preprocessed_dataset = pd.DataFrame(columns=['comment', 'sentiment'])

# # Iterate over the data and preprocess
# for index, row in data.iterrows():
#     # Tokenize and lemmatize the text
#     text = row['comment']
#     text_tokenized = hazm.word_tokenize(text)
#     text_lem = [lem.lemmatize(x) for x in text_tokenized]

#     # Normalize the text
#     text_norm = [norm.normalize(x) for x in text_lem]

#     # Remove stop words
#     clean_text = [x for x in text_norm if not x in stopwords]

#     # Remove punctuations
#     final_text = [x.translate(translator) for x in clean_text]

#     # Add the preprocessed text to the preprocessed_dataset
#     preprocessed_dataset.loc[index] = ({
#         'comment': ' '.join(final_text),
#         'sentiment': row['sentiment']
#     })

In [85]:
# Add "__label__" to each comment for FastText training
for index, p in tqdm(enumerate(train_dataset['Cleaned'])):
  cat = train_dataset['sentiment'][index]
  label = '__label__'+ str(cat)
  new_sentiment = label + " " + str(p)
  train_dataset['sentiment_fasttext'][index] = new_sentiment

132102it [00:20, 6302.12it/s]


In [None]:
# Display the updated dataset
train_dataset

In [87]:
#Tokenize a sample comment using FastText's tokenize function
fasttext.tokenize(train_dataset['sentiment_fasttext'][7])

['__label__negative',
 'کیفیت',
 'غذا',
 'متوسط',
 'رو',
 'به',
 'پایین',
 'بود',
 'انگار',
 'داخل',
 'یه',
 'رستوران',
 'معمولی',
 'غذا',
 'خوردی',
 'درحالی',
 'که',
 'امتیاز',
 'رستوران',
 'در',
 'اسنپ',
 'فود',
 '4٫3',
 'بود']

In [88]:
# Split the dataset into train and test sets
X_train, X_test = train_test_split(train_dataset, test_size=0.2, random_state=1, shuffle=True)

In [89]:
# Display the sizes of the train and test sets
print(len(X_train))
print(len(X_test))

105681
26421


In [90]:
# Export the 'sentiment_fasttext' column as a .txt file for FastText
X_train['sentiment_fasttext'].to_csv('train.txt',
                                     sep=' ',
                                     index=False,
                                     header=False,
                                     quoting=csv.QUOTE_NONE,
                                     quotechar="",
                                     escapechar=" ")

X_test['sentiment_fasttext'].to_csv('test.txt',
                                    sep=' ',
                                    index=False,
                                    header=False,
                                    quoting=csv.QUOTE_NONE,
                                    quotechar="",
                                    escapechar=" ")

In [113]:
# Define a grid of hyperparameters for FastText
grid = {
    'lr': [0.1, 0.3, 0.5, 0.7, 0.9],
    'epoch': [100, 200, 300],
    'wordNgrams': [1, 2, 3, 4, 5]
}

# Generate all combinations of hyperparameters
all_names = grid.keys()
combinations = list(itertools.product(*(grid[Name] for Name in all_names)))

best_score = 0
best_params = {}

# Iterate over hyperparameter combinations
for combination in combinations:
    params = dict(zip(all_names, combination))
    
    # Train a FastText model with the current hyperparameters
    model = fasttext.train_supervised('train.txt',
                                      lr=params['lr'],
                                      epoch=params['epoch'],
                                      wordNgrams=params['wordNgrams'],
                                      loss='ns',
                                      seed=123,
                                      label='__label__')
    
    # Test the model on the test set
    result = model.test('test.txt')
    precision = result[1]
    
    # Update the best score and best parameters if needed
    if precision > best_score:
        best_score = precision
        best_params = params

# Display the best hyperparameters and score
print(f'Best score: {best_score}')
print(f'Best parameters: {best_params}')

Best score: 0.8637068998145415
Best parameters: {'lr': 0.1, 'epoch': 100, 'wordNgrams': 4}


In [115]:
# Train a FastText model with the best hyperparameters
model = fasttext.train_supervised('train.txt',
                                  wordNgrams=4,
                                  lr=0.1,
                                  epoch=100,
                                  loss='ns',
                                  seed=123,
                                  label='__label__')

# Test the model on the test set
model.test('test.txt')

(26421, 0.8629877748760456, 0.8629877748760456)

In [116]:
# Save the trained model
model.save_model('fasttext_with_tune_data_model.bin')


In [117]:
# Load the saved model and test it again
loaded_model = fasttext.load_model('fasttext_with_tune_data_model.bin')
loaded_model.test('test.txt')

(26421, 0.8629877748760456, 0.8629877748760456)

In [127]:
# Load another test data for cheking prediction from 'total_preprocessed.csv'
test_data = pd.read_csv('total_preprocessed.csv')

# List of Persian punctuations
punctuations = string.punctuation + "٬" + "،"

# Create a translator object to remove punctuations
translator = str.maketrans('', '', punctuations)

# List of Persian stop words
stopwords = hazm.stopwords_list()

# Create a lemmatizer object
lem = hazm.Lemmatizer()

# Create a normalizer object
norm = hazm.Normalizer()

# Create an empty DataFrame for the test dataset
test_dataset = pd.DataFrame(columns=['local_id', 'Cleaned'])

# Iterate over the test data
for index, row in test_data.iterrows():
    # Tokenize and lemmatize the text
    text = row['Cleaned']
    text_tokenized = hazm.word_tokenize(text)
    text_lem = [lem.lemmatize(x) for x in text_tokenized]

    # Normalize the text
    text_norm = [norm.normalize(x) for x in text_lem]

    # Remove stop words
    clean_text = [x for x in text_norm if not x in stopwords]

    # Remove punctuations
    final_text = [x.translate(translator) for x in clean_text]

    # Add the preprocessed text to the test_dataset
    test_dataset.loc[index] = ({
        'Cleaned': ' '.join(final_text),
        'local_id': row['local_id']
    })

# Display the test dataset
test_dataset

In [129]:
# Define a function for batch prediction using the trained model
def bunch_predict(comments, model):
    labels = []
    probs = []

    for comment in comments:
        result = model.predict(comment)
        labels.append(result[0][0].replace('__label__', ''))
        probs.append(result[1][0])

    return labels, probs

In [130]:
# Get 'local_id' and 'Cleaned' columns from the test dataset
local_id = test_dataset['local_id']
comments = test_dataset['Cleaned'].to_list()

In [131]:
# Perform batch prediction on the test dataset
labels, probs = bunch_predict(comments, model)

# Create a new test dataset with predicted sentiments
new_test = {"local_id": [], "Cleaned": [], "sentiment": []}
n = 0

for label, comment in zip(labels, comments):
    new_test['Cleaned'].append(comment)
    new_test['sentiment'].append(label)
    new_test['local_id'].append(local_id[n])
    n = n + 1

# Create a DataFrame from the new test dataset
df = pd.DataFrame(new_test)

# Apply conditions to create new columns for each sentiment category
df['negative'] = df['sentiment'].apply(lambda x: 1 if x == "negative" else 0)
df['neutral'] = df['sentiment'].apply(lambda x: 1 if x == "neutral" else 0)
df['positive'] = df['sentiment'].apply(lambda x: 1 if x == "positive" else 0)


In [None]:
# Drop the 'sentiment' column
df.drop('sentiment', axis=1, inplace=True)

In [133]:
# Save the final test dataset to 'total_testing.csv'
df.to_csv('total_testing.csv')