# Sentiment Analizi ve Sınıflandırma Modelleri

In [1]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers


from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd 
data = pd.read_csv("train.tsv",sep = "\t")

In [3]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
data["Sentiment"].replace(0, value="negative", inplace=True)
data["Sentiment"].replace(1, value="negative", inplace=True)
#2 Notr

In [5]:
data["Sentiment"].replace(3, value="positive", inplace=True)
data["Sentiment"].replace(4, value="positive", inplace=True)

In [6]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negative
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [7]:
#Siniflandirmanin daha net olabilmesi adina notr ifadeler disarida birakilir

In [8]:
data = data[(data.Sentiment =="negative") | (data.Sentiment == "positive")]

In [9]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negative
21,22,1,good for the goose,positive
22,23,1,good,positive
33,34,1,"the gander , some of which occasionally amuses...",negative
46,47,1,amuses,positive


In [10]:
data.groupby("Sentiment").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,34345,34345,34345
positive,42133,42133,42133


In [11]:
df = pd.DataFrame()
df["text"]= data["Phrase"]
df["label"]= data["Sentiment"]

In [12]:
df.head()

Unnamed: 0,text,label
0,A series of escapades demonstrating the adage ...,negative
21,good for the goose,positive
22,good,positive
33,"the gander , some of which occasionally amuses...",negative
46,amuses,positive


## Metin Ön İşleme

In [13]:
#buyuk-kucuk donusumu
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
#noktalama işaretleri
df['text'] = df['text'].str.replace('[^\w\s]','')
#sayılar
df['text'] = df['text'].str.replace('\d','')
#stopwords
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
#seyreklerin silinmesi
sil = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:]
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sil))
#lemmi
from textblob import Word
#nltk.download('wordnet')
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 

## Değişken Mühendisliği

* Count Vectors
* TF-IDF Vectors (words, characters, n-grams)
* Word Embeddings

TF(t) = (Bir t teriminin bir dökümanda gözlenme frekansı) / (dökümandaki toplam terim sayısı) 

IDF(t) = log_e(Toplam döküman sayısı / içinde t terimi olan belge sayısı)

**Eldeki verilerin nümerik olarak temsil edilebilmesi için bu metinlerden bazı bilgiler çıkarılması ve nümerikleştirmeler yapılması gerekmektedir.**

In [14]:
df.head()

Unnamed: 0,text,label
0,series demonstrating adage good goose also goo...,negative
21,good goose,positive
22,good,positive
33,gander occasionally amuses none amount much story,negative
46,amuses,positive


In [15]:
df.iloc[0]

text     series demonstrating adage good goose also goo...
label                                             negative
Name: 0, dtype: object

## Train - Test

In [16]:
X_train, X_test , y_train, y_test = model_selection.train_test_split(df["text"], df["label"], random_state=42)

In [17]:
X_train.head()

146523    explode obnoxiously screen something bubba hot...
125256    take care cat brings beguiling freshness comin...
38418                                precious little either
130028               girl learns believing something matter
30125                                             also rock
Name: text, dtype: object

In [18]:
encoder = preprocessing.LabelEncoder()

In [19]:
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [20]:
y_train[:10]

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1])

In [21]:
y_test[:10]

array([0, 1, 0, 1, 0, 1, 1, 0, 1, 1])

## Count Vectors

In [22]:
vectorizer = CountVectorizer()

In [23]:
vectorizer.fit(X_train)

CountVectorizer()

In [24]:
x_train_count = vectorizer.transform(X_train)
x_test_count = vectorizer.transform(X_test)

In [26]:
vectorizer.get_feature_names()[:5]

['aaa', 'aaliyah', 'abagnale', 'abandon', 'abandoned']

In [27]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

##  TF- IDF

In [28]:
# Word Level

In [29]:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(X_train)

TfidfVectorizer()

In [30]:
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(X_train)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(X_test)

In [31]:
tf_idf_word_vectorizer.get_feature_names()[:5]

['aaa', 'aaliyah', 'abagnale', 'abandon', 'abandoned']

In [32]:
x_train_tf_idf_word.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [33]:
# Ngram Level tf-idf

In [34]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range = (2,3))
tf_idf_ngram_vectorizer.fit(X_train)

TfidfVectorizer(ngram_range=(2, 3))

In [35]:
x_train_tf_idf_ngarm = tf_idf_ngram_vectorizer.transform(X_train)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(X_test)

In [36]:
# characters level tf-idf

In [37]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer = "char", ngram_range = (2,3))
tf_idf_chars_vectorizer.fit(X_train)

TfidfVectorizer(analyzer='char', ngram_range=(2, 3))

In [38]:
x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(X_train)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(X_test)

# Makine Öğrenmesi ile Sentiment Sınıflandırması

## Logistic Regression

In [39]:
log_reg = linear_model.LogisticRegression()
log_reg_model = log_reg.fit(x_train_count, y_train)
accuracy = model_selection.cross_val_score(log_reg_model,
                                          x_test_count,
                                          y_test,
                                          cv=10).mean()
print("Count Vectors Accuracy: ", accuracy)

Count Vectors Accuracy:  0.8398012552301255


In [40]:
log_reg = linear_model.LogisticRegression()
log_reg_model = log_reg.fit(x_train_tf_idf_word, y_train)
accuracy = model_selection.cross_val_score(log_reg_model,
                                          x_test_tf_idf_word,
                                          y_test,
                                          cv=10).mean()
print("Word Level TF-IDF Accuracy: ", accuracy)

Word Level TF-IDF Accuracy:  0.8353033472803346


In [41]:
log_reg = linear_model.LogisticRegression()
log_reg_model = log_reg.fit(x_train_tf_idf_ngarm, y_train)
accuracy = model_selection.cross_val_score(log_reg_model,
                                          x_test_tf_idf_ngram,
                                          y_test,
                                          cv=10).mean()
print("Ngram Level TF-IDF Accuracy: ", accuracy)

Ngram Level TF-IDF Accuracy:  0.7463912133891213


In [43]:
log_reg = linear_model.LogisticRegression()
log_reg_model = log_reg.fit(x_train_tf_idf_chars, y_train)
accuracy = model_selection.cross_val_score(log_reg_model,
                                          x_test_tf_idf_chars,
                                          y_test,
                                          cv=10).mean()
print("Char Level TF-IDF Accuracy: ", accuracy)

Char Level TF-IDF Accuracy:  0.7802301255230126


## Naive Bayes

In [49]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count,y_train)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_count, 
                                           y_test, 
                                           cv = 10).mean()

print("Count Vectors Accuracy: ", accuracy)

Count Vectors Accuracy:  0.8357217573221758


In [50]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word,y_train)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_word, 
                                           y_test, 
                                           cv = 10).mean()

print("Word Level TF-IDF Accuracy: ", accuracy)

Word Level TF-IDF Accuracy:  0.8330020920502091


In [51]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngarm,y_train)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_ngram, 
                                           y_test, 
                                           cv = 10).mean()

print("Ngram Level TF-IDF Accuracy: ", accuracy)

Ngram Level TF-IDF Accuracy:  0.7682008368200837


In [52]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars,y_train)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_chars, 
                                           y_test, 
                                           cv = 10).mean()

print("Char Level TF-IDF Accuracy: ", accuracy)

Char Level TF-IDF Accuracy:  0.755857740585774


## Random Forests

In [55]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count,y_train)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_count, 
                                           y_test, 
                                           cv = 10).mean()

print("Count Vectors Accuracy: ", accuracy)

Count Vectors Accuracy:  0.8313284518828452


In [56]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_word ,y_train)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_word, 
                                           y_test, 
                                           cv = 10).mean()

print("Word Level TF-IDF Accuracy: ", accuracy)

Word Level TF-IDF Accuracy:  0.8286610878661088


In [None]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_ngarm ,y_train)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_ngram, 
                                           y_test, 
                                           cv = 10).mean()

print("Ngram Level TF-IDF Accuracy: ", accuracy)

In [None]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_chars ,y_train)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_chars, 
                                           y_test, 
                                           cv = 10).mean()

print("Char Level TF-IDF Accuracy: ", accuracy)

## XGBoost

In [58]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count,y_train)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_count, 
                                           y_test, 
                                           cv = 10).mean()

print("Count Vectors Accuracy: ", accuracy)

Count Vectors Accuracy:  0.7209205020920502


In [59]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_word,y_train)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_word, 
                                           y_test, 
                                           cv = 10).mean()

print("Word Level TF-IDF Accuracy: ", accuracy)

Word Level TF-IDF Accuracy:  0.7148535564853556


In [60]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_ngarm,y_train)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_ngram, 
                                           y_test, 
                                           cv = 10).mean()

print("Ngram Level TF-IDF Accuracy: ", accuracy)

Ngram Level TF-IDF Accuracy:  0.5903242677824267


In [61]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_chars,y_train)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_chars, 
                                           y_test, 
                                           cv = 10).mean()

print("Char Level TF-IDF Accuracy: ", accuracy)

Char Level TF-IDF Accuracy:  0.7788179916317992


In [None]:
log_reg

In [None]:
new_comment = pd.Series("this film is very nice and good i like it")

new_comment = pd.Series("no not good look at that shit very bad")

In [None]:
v = CountVectorizer()
v.fit(X_train)
new_comment = v.transform(new_comment)

In [None]:
log_reg.predict(new_comment)