# Metin Sınıflandırma Modeli


In [1]:
import nltk
import string
import xgboost
import textblob
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from textblob import TextBlob, Word

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, decomposition, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
df = pd.read_csv("veriseti2.csv", encoding='cp1254', header=None, names=["Name", "Brand", "Comment","Rate","Price"])
df.head()

Unnamed: 0,Name,Brand,Comment,Rate,Price
0,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,Ürün çok hızlı ve özenli paketlenmiş bir şekil...,5.0,7.499
1,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,çok sağlam şekilde paketlenmiş olarak geldi ga...,5.0,7.499
2,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,Kardeşime Satın Aldım Gayet Güzel ve Kullanışl...,5.0,7.499
3,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,ürün hemen geldi ve telefonla aradım kaya bey ...,5.0,7.499
4,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,ürün iki gün içimde elime ulaştı çok güzel çok...,5.0,7.499


In [3]:
df["Rate"].replace(1, value = "negatif", inplace = True)
df["Rate"].replace(2, value = "negatif", inplace = True)
df["Rate"].replace(4, value = "pozitif", inplace = True)
df["Rate"].replace(5, value = "pozitif", inplace = True)
df.head()

Unnamed: 0,Name,Brand,Comment,Rate,Price
0,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,Ürün çok hızlı ve özenli paketlenmiş bir şekil...,pozitif,7.499
1,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,çok sağlam şekilde paketlenmiş olarak geldi ga...,pozitif,7.499
2,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,Kardeşime Satın Aldım Gayet Güzel ve Kullanışl...,pozitif,7.499
3,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,ürün hemen geldi ve telefonla aradım kaya bey ...,pozitif,7.499
4,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,ürün iki gün içimde elime ulaştı çok güzel çok...,pozitif,7.499


In [4]:
df = df[(df["Rate"] == "negatif") | (df["Rate"] == "pozitif")]
df.head()

Unnamed: 0,Name,Brand,Comment,Rate,Price
0,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,Ürün çok hızlı ve özenli paketlenmiş bir şekil...,pozitif,7.499
1,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,çok sağlam şekilde paketlenmiş olarak geldi ga...,pozitif,7.499
2,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,Kardeşime Satın Aldım Gayet Güzel ve Kullanışl...,pozitif,7.499
3,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,ürün hemen geldi ve telefonla aradım kaya bey ...,pozitif,7.499
4,DMC N90x (GTA 5) I7-86016gb Ram256gb Ssd4gb Gt...,DMC,ürün iki gün içimde elime ulaştı çok güzel çok...,pozitif,7.499


In [5]:
df.groupby("Rate").count()


Unnamed: 0_level_0,Name,Brand,Comment,Price
Rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negatif,394,394,394,394
pozitif,4168,4167,4168,4168


In [6]:
df = df[["Comment","Rate"]]
df.columns = ["Text","Label"]
df.head()

Unnamed: 0,Text,Label
0,Ürün çok hızlı ve özenli paketlenmiş bir şekil...,pozitif
1,çok sağlam şekilde paketlenmiş olarak geldi ga...,pozitif
2,Kardeşime Satın Aldım Gayet Güzel ve Kullanışl...,pozitif
3,ürün hemen geldi ve telefonla aradım kaya bey ...,pozitif
4,ürün iki gün içimde elime ulaştı çok güzel çok...,pozitif


In [7]:
df.reset_index(inplace = True)
df.drop("index", axis = 1, inplace = True)
df.head()


Unnamed: 0,Text,Label
0,Ürün çok hızlı ve özenli paketlenmiş bir şekil...,pozitif
1,çok sağlam şekilde paketlenmiş olarak geldi ga...,pozitif
2,Kardeşime Satın Aldım Gayet Güzel ve Kullanışl...,pozitif
3,ürün hemen geldi ve telefonla aradım kaya bey ...,pozitif
4,ürün iki gün içimde elime ulaştı çok güzel çok...,pozitif


# 1. Metin Ön İşleme


In [8]:
df["Text"] = df["Text"].apply(lambda x: " ".join(x.lower() for x in x.split()))
df["Text"] = df["Text"].str.replace("[^\w\s]","")
df["Text"] = df["Text"].str.replace("\d","")

sw = stopwords.words("english")
df["Text"] = df["Text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

sil = pd.Series(" ".join(df["Text"]).split()).value_counts()[-1000:]
df["Text"] = df["Text"].apply(lambda x: " ".join(x for x in x.split() if x not in sil))
df["Text"] = df["Text"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df.head()

Unnamed: 0,Text,Label
0,ürün çok hızlı özenli paketlenmiş bir şekilde ...,pozitif
1,çok sağlam şekilde paketlenmiş olarak geldi ga...,pozitif
2,kardeşime satın aldım gayet güzel kullanışlı a...,pozitif
3,ürün hemen geldi telefonla aradım kaya bey diy...,pozitif
4,ürün iki gün içimde elime ulaştı çok güzel çok...,pozitif


# 2. Değişken Mühendisliği


# A. Train & Test


In [9]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["Text"], df["Label"])

encoder = preprocessing.LabelEncoder()

train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

print(train_y[0:5])
print(test_y[0:5])

[1 1 0 0 1]
[1 1 1 1 1]


# B. Count Vectors


In [39]:
vectorizer = CountVectorizer().fit(train_x)
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)
feature_names = list(vectorizer.vocabulary_.keys())[0:5]
print(feature_names)



['özenli', 'kargo', 'tam', 'bir', 'fp']


In [40]:
x_train_count.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# C. TD-IDF


In [42]:
tf_idf_word_vectorizer = TfidfVectorizer().fit(train_x)
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)
feature_names = list(tf_idf_word_vectorizer.vocabulary_.keys())[0:5]
print(feature_names)


['özenli', 'kargo', 'tam', 'bir', 'fp']


In [43]:
x_train_tf_idf_word.toarray()


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# D. N-GRAM Level TF-IDF


In [44]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range = (2,3)).fit(train_x)
x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)
feature_names = list(tf_idf_ngram_vectorizer.vocabulary_.keys())[0:5]
print(feature_names)



['özenli kargo', 'kargo tam', 'tam bir', 'bir fp', 'fp ürünü']


# E. Characters Level TF-IDF


In [45]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer = "char", ngram_range = (2,3)).fit(train_x)
x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)
feature_names = list(tf_idf_chars_vectorizer.vocabulary_.keys())[0:5]
print(feature_names)





['öz', 'ze', 'en', 'nl', 'li']


# 3. Makine Öğrenmesi ile Sentiment Sınıflandırması


# A. Lojistik Regresyon


In [46]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_count, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [47]:
print("Count Vectors Doğruluk Oranı:", accuracy)


Count Vectors Doğruluk Oranı: 0.9368878718535469


In [48]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_word, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_tf_idf_word, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [49]:
print("Word-Level TD-IDF Doğruluk Oranı:", accuracy)


Word-Level TD-IDF Doğruluk Oranı: 0.9184973302822271


In [50]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_ngram, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_tf_idf_ngram, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    1.1s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.1s finished


In [51]:
print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)


N-GRAM TF-IDF Doğruluk Oranı: 0.917620137299771


In [52]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_chars, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_tf_idf_chars, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [53]:
print("CHARLEVEL TF-IDF Doğruluk Oranı:", accuracy)


CHARLEVEL TF-IDF Doğruluk Oranı: 0.9184973302822271


# B. Naive Bayes


In [54]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_count, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [55]:
print("Count Vectors Doğruluk Oranı:", accuracy)


Count Vectors Doğruluk Oranı: 0.9237604881769641


In [56]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word, train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_word, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [57]:
print("Word-Level TD-IDF Doğruluk Oranı:", accuracy)


Word-Level TD-IDF Doğruluk Oranı: 0.917620137299771


In [58]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram, train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_ngram, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [59]:
print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)


N-GRAM TF-IDF Doğruluk Oranı: 0.917620137299771


In [60]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars, train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_chars, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [61]:
print("CHARLEVEL TF-IDF Doğruluk Oranı:", accuracy)


CHARLEVEL TF-IDF Doğruluk Oranı: 0.917620137299771


# C. Random Forests


In [62]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_count, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.9s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.9s finished


In [63]:
print("Count Vectors Doğruluk Oranı:", accuracy)


Count Vectors Doğruluk Oranı: 0.9316399694889397


In [64]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_word, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.2s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.3s finished


In [65]:
print("Word-Level TD-IDF Doğruluk Oranı:", accuracy)


Word-Level TD-IDF Doğruluk Oranı: 0.9237528604118992


In [66]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_ngram, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   13.4s remaining:   13.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   13.8s finished


In [67]:
print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)


N-GRAM TF-IDF Doğruluk Oranı: 0.9237528604118992


In [68]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_chars, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.0s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.0s finished


In [69]:
print("CHARLEVEL TF-IDF Doğruluk Oranı:", accuracy)


CHARLEVEL TF-IDF Doğruluk Oranı: 0.9325171624713958


# D. XGBoost


In [70]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_count, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.0s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.0s finished


In [71]:
print("Count Vectors Doğruluk Oranı:", accuracy)


Count Vectors Doğruluk Oranı: 0.9377726926010679


In [72]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_word, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.7s finished


In [73]:
print("Word-Level TD-IDF Doğruluk Oranı:", accuracy)

Word-Level TD-IDF Doğruluk Oranı: 0.9307627765064836


In [74]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_ngram, test_y, cv = 10, n_jobs = -1, verbose = 10).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    6.0s remaining:   14.2s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    6.1s remaining:    6.1s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    6.2s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.3s finished


In [75]:
print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)


N-GRAM TF-IDF Doğruluk Oranı: 0.9193592677345537


In [76]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_chars, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    4.5s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.7s finished


In [77]:
print("CHARLEVEL TF-IDF Doğruluk Oranı:", accuracy)


CHARLEVEL TF-IDF Doğruluk Oranı: 0.9369183829138062
