In [2]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers


from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
import pandas as pd 
data = pd.read_csv("train.tsv",sep = "\t")
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
data["Sentiment"].replace(0, value = "negative", inplace = True)
data["Sentiment"].replace(1, value = "negative", inplace = True)

In [6]:
data["Sentiment"].replace(3, value = "pozitive", inplace = True)
data["Sentiment"].replace(4, value = "pozitive", inplace = True)

In [7]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negative
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [8]:
data = data[(data.Sentiment == "negative") | (data.Sentiment == "pozitive")]

In [9]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negative
21,22,1,good for the goose,pozitive
22,23,1,good,pozitive
33,34,1,"the gander , some of which occasionally amuses...",negative
46,47,1,amuses,pozitive


In [10]:
data.groupby("Sentiment").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,34345,34345,34345
pozitive,42133,42133,42133


In [11]:
df = pd.DataFrame()
df["text"] = data["Phrase"]
df["label"] = data["Sentiment"]

In [12]:
df.head()

Unnamed: 0,text,label
0,A series of escapades demonstrating the adage ...,negative
21,good for the goose,pozitive
22,good,pozitive
33,"the gander , some of which occasionally amuses...",negative
46,amuses,pozitive


In [17]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aozde\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [20]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aozde\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

# Text Preprocessing

In [21]:
#big-small transformation
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
#punctuation marks
df['text'] = df['text'].str.replace('[^\w\s]','')
#numbers
df['text'] = df['text'].str.replace('\d','')
#stopwords
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
#deletion of sparse
delete = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:]
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in delete))
#lemmi
from textblob import Word
#nltk.download('wordnet')
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 

# Variable Engineering

* Count Vectors
* TF-IDF Vectors (words, characters, n-grams)
* Word Embeddings
TF(t) = (Frequency of occurrence of a t term in a document) / (total number of terms in the document)

IDF(t) = log_e(Total number of documents / number of documents with t term in it)

In [22]:
df.head()

Unnamed: 0,text,label
0,series demonstrating adage good goose also goo...,negative
21,good goose,pozitive
22,good,pozitive
33,occasionally none amount much story,negative
46,,pozitive


In [23]:
df.iloc[0]

text     series demonstrating adage good goose also goo...
label                                             negative
Name: 0, dtype: object

# Test-Train

In [24]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["text"],
                                                                   df["label"], 
                                                                    random_state = 1)

In [25]:
train_y[0:5]

118788    pozitive
89514     negative
86857     pozitive
140626    negative
153243    pozitive
Name: label, dtype: object

In [26]:
encoder = preprocessing.LabelEncoder()

In [27]:
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [28]:
train_y[0:5]

array([1, 0, 1, 0, 1])

In [29]:
test_y[0:5]

array([1, 0, 1, 0, 0])

# Count Vectors

In [33]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x)

CountVectorizer()

In [34]:
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)

In [36]:
x_train_count

<57358x12690 sparse matrix of type '<class 'numpy.int64'>'
	with 280522 stored elements in Compressed Sparse Row format>

In [37]:
vectorizer.get_feature_names()[0:5]

['aaliyah', 'abagnale', 'abandon', 'abandoned', 'abbass']

In [38]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# TF-IDF

In [None]:
#wordlevel

In [39]:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x)

TfidfVectorizer()

In [40]:
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

In [41]:
tf_idf_word_vectorizer.get_feature_names()[0:5]

['aaliyah', 'abagnale', 'abandon', 'abandoned', 'abbass']

In [42]:
x_train_tf_idf_word.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# ngram level tf-idf

In [43]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range = (2,3))
tf_idf_ngram_vectorizer.fit(train_x)

TfidfVectorizer(ngram_range=(2, 3))

In [44]:
x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

In [45]:
# characters level tf-idf

In [46]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer = "char", ngram_range = (2,3))
tf_idf_chars_vectorizer.fit(train_x)

TfidfVectorizer(analyzer='char', ngram_range=(2, 3))

In [48]:
x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)

# Sentiment Classification with Machine Learning

# Lojistik Regresyon

In [52]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Accuracy rate:", accuracy)

Count Vectors Accuracy rate: 0.837918410041841


In [53]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Accuracy rate:", accuracy)

Word-Level TF-IDF Accuracy rate: 0.8336297071129707


In [54]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Accuracy rate:", accuracy)

N-GRAM TF-IDF Accuracy rate: 0.748378661087866


In [56]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Accuracy rate:", accuracy)

CHARLEVEL Accuracy rate: 0.7811715481171548


# Naive Bayes

In [57]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Accuracy rate:", accuracy)

Count Vectors Accuracy rate: 0.83331589958159


In [58]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF  Accuracy rate:", accuracy)

Word-Level TF-IDF  Accuracy rate: 0.8353033472803346


In [59]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF  Accuracy rate:", accuracy)

N-GRAM TF-IDF  Accuracy rate: 0.7686192468619246


In [60]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL  Accuracy rate:", accuracy)

CHARLEVEL  Accuracy rate: 0.7551255230125523


# Random Forests

In [61]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Accuracy rate :", accuracy)

Count Vectors Accuracy rate : 0.8228033472803347


In [62]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Accuracy rate:", accuracy)

Word-Level TF-IDF Accuracy rate: 0.8252092050209205


In [63]:
rf = ensemble.RandomForestClassifier()
rf_model = loj.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Accuracy rate:", accuracy)

N-GRAM TF-IDF Accuracy rate: 0.748378661087866


In [64]:
rf = ensemble.RandomForestClassifier()
rf_model = loj.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Accuracy rate:", accuracy)

CHARLEVEL Accuracy rate: 0.7811715481171548


# XGBoost

In [65]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Accuracy rate:", accuracy)

Count Vectors Accuracy rate: 0.7165271966527197


In [66]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Accuracy rate:", accuracy)

Word-Level TF-IDF Accuracy rate: 0.7094142259414227


In [67]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Accuracy rate:", accuracy)

N-GRAM TF-IDF Accuracy rate: 0.5827405857740585


In [68]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Accuracy rate:", accuracy)

CHARLEVEL Accuracy rate: 0.7781380753138075


In [69]:
loj_model

LogisticRegression()

In [70]:
loj_model.predict("yes i like this film")

ValueError: Expected 2D array, got scalar array instead:
array=yes i like this film.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
new_comment = pd.Series("this film is very nice and good i like it")

new_comment = pd.Series("no not good look at that shit very bad")

In [None]:
v = CountVectorizer()
v.fit(train_x)
new _comment = v.transform(new_comment)

In [None]:
loj_model.predict(new_comment)