In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
nl = pd.read_csv('train.csv')
nl.head()

Unnamed: 0,comment,date
0,مثل همیشه عالی مخصوصا برگر ایتالیایی,2021-10-10 16:15:06
1,اقا شاید من هیچ قاشق و چنگالی نداشته باشم! بای...,2021-04-23 18:01:53
2,خیلی خوشمزه بود ای کاش تنوع بیشتری بدین حلوا ر...,2021-05-31 04:18:20
3,خوب بود فقط نسبت به مبلغ مرغ بریان اندازه اون ...,2021-04-04 12:56:17
4,به موقع و خوب فقط پیک اجناس را تا درب واحد بال...,2021-04-09 14:35:00


In [3]:
nl.shape

(56000, 2)

In [4]:
nl.isnull().count()

comment    56000
date       56000
dtype: int64

In [3]:
import nltk
nltk.download('stopwords')
nltk_stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
with open('stopwords_plus.txt') as stopwords_file:
    stopwords = stopwords_file.readlines()

stopwords_plus = [line.replace('\n', '') for line in stopwords]

In [7]:
stopwords_plus.extend(nltk_stopwords)


In [8]:
len(stopwords_plus)

1543

In [5]:
import hazm
stemmer = hazm.Stemmer()

In [6]:
from hazm import word_tokenize

In [11]:
dataset = pd.DataFrame(columns=('comments','date'))

for index, row in nl.iterrows():
    comment = row['comment']
    comment_tokenized = word_tokenize(comment)
    comment_tokenized_filtered = [w for w in comment_tokenized if not w in stopwords_plus]
    comment_tokenized_filtered_stemmed = [stemmer.stem(w) for w in comment_tokenized_filtered]
    dataset.loc[index] = {
        'comments': ' '.join(comment_tokenized_filtered_stemmed),
        'date': row['date'].replace('\n', '')}

In [14]:
vectorizer = TfidfVectorizer()
vectorizer.fit(dataset['comments'])

TfidfVectorizer()

In [15]:
X = vectorizer.transform(dataset['comments'])

In [16]:
kmeans = KMeans(n_clusters = 2).fit(X)

In [17]:
target = kmeans.labels_

In [18]:
t = pd.DataFrame(target)
t.head()

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, t)

In [23]:
nb = MultinomialNB().fit(X_train, y_train)
nb.score(X_test, y_test)

  return f(**kwargs)


0.8932142857142857

In [24]:
y_pred = nb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     12345
           1       0.77      0.14      0.23      1655

    accuracy                           0.89     14000
   macro avg       0.83      0.57      0.59     14000
weighted avg       0.88      0.89      0.86     14000



In [25]:
clf = LogisticRegression().fit(X_train, y_train)
clf.score(X_test, y_test)


  return f(**kwargs)


0.9965

In [26]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12345
           1       1.00      0.97      0.99      1655

    accuracy                           1.00     14000
   macro avg       1.00      0.99      0.99     14000
weighted avg       1.00      1.00      1.00     14000

