## Import Libraries

In [96]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn import linear_model, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


## Data Loading

In [97]:
# Read the positive and negative comment sets from CSV files (paths are relative
# to the notebook's working directory).
pozitive_comments = pd.read_csv("pozitiveComments.csv")
negative_comments = pd.read_csv("negativeComments.csv")

## Data Exploration

In [98]:
# Stack both comment sets into a single frame. ignore_index=True assigns a fresh
# 0..n-1 index; otherwise each half keeps its own row labels and the combined
# index contains duplicates, which makes label-based lookups ambiguous.
df = pd.concat([pozitive_comments, negative_comments], ignore_index=True)

In [99]:
# Preview the first rows of the combined frame (comment text and its label).
df.head()

Unnamed: 0,Yorum,Duygu
0,Çok iyi. Tesekkurler..,1
1,çok iyi ve hızlı bir ürün,1
2,teşekkürler...,1
3,Gerçekten kullanmasını bilene çok iyi orta seg...,1
4,Kargo yavaş gelmedi şansıma 3 gün sonra yola ç...,1


In [100]:
# (rows, columns) of the combined frame.
df.shape

(977, 2)

In [101]:
# Summary statistics, transposed so each described column becomes one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Duygu,977.0,0.757421,0.428862,0.0,1.0,1.0,1.0,1.0


## Text Preprocessing

In [102]:
# Lower-case every comment and collapse runs of whitespace into single spaces.
df['Yorum'] = df['Yorum'].apply(lambda text: " ".join(text.lower().split()))

# Strip punctuation (anything that is neither a word character nor whitespace).
# regex=True is passed explicitly: recent pandas versions treat the pattern as a
# literal string by default, which would silently leave the text unchanged.
# Raw strings avoid invalid-escape warnings for \w, \s and \d.
df['Yorum'] = df['Yorum'].str.replace(r'[^\w\s]', '', regex=True)

# Strip digits.
df['Yorum'] = df['Yorum'].str.replace(r'\d', '', regex=True)

# Drop Turkish stopwords. `sw` stays a list as before; the set is only used for
# fast membership tests.
sw = stopwords.words('turkish')
stopword_set = set(sw)
df['Yorum'] = df['Yorum'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stopword_set)
)

# Drop rare words: the last 1000 entries of the frequency table, i.e. the least
# frequent tokens across all comments. The tokens are the index of `remove`, so
# membership is tested against that index explicitly.
remove = pd.Series(" ".join(df["Yorum"]).split()).value_counts()[-1000:]
rare_words = set(remove.index)
df["Yorum"] = df["Yorum"].apply(
    lambda text: " ".join(word for word in text.split() if word not in rare_words)
)


## Train-Test-Split

In [103]:
# Hold out 33% of the comments for evaluation; the fixed seed keeps the split
# reproducible. Text and labels are both taken from `df` so the rows stay aligned.
X_train, X_test, y_train, y_test = train_test_split(
    df["Yorum"], df["Duygu"], test_size=0.33, random_state=42
)

## Label Encoder

In [104]:
# Encode the sentiment labels as integers. The encoder is fitted on the training
# labels only and then applied to the test labels, so both splits share a single
# label-to-integer mapping; refitting on the test split could assign different
# codes if its set of labels differed.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

## Count Vectors

In [105]:
# Build the bag-of-words vectorizer and fit it on the training comments only,
# so the test split does not influence the vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

CountVectorizer()

In [106]:
# Turn both splits into count matrices using the vectorizer fitted on X_train.
x_train_count = vectorizer.transform(X_train)
x_test_count = vectorizer.transform(X_test)

In [107]:
# Peek at the first five vocabulary terms. get_feature_names() no longer exists
# in current scikit-learn; get_feature_names_out() is its replacement and returns
# an array, converted to a list here so the cell output stays a plain list.
vectorizer.get_feature_names_out()[0:5].tolist()

['abim', 'abime', 'ablama', 'absürd', 'acele']

## TF-IDF

In [108]:
# Build the TF-IDF vectorizer and fit it on the training comments only.
tf_idf = TfidfVectorizer()
tf_idf.fit(X_train)

TfidfVectorizer()

In [109]:
# Turn both splits into TF-IDF matrices using the vectorizer fitted on X_train.
x_train_tf_idf = tf_idf.transform(X_train)
x_test_tf_idf = tf_idf.transform(X_test)

In [110]:
# Peek at the first five vocabulary terms. get_feature_names() no longer exists
# in current scikit-learn; get_feature_names_out() is its replacement and returns
# an array, converted to a list here so the cell output stays a plain list.
tf_idf.get_feature_names_out()[0:5].tolist()

['abim', 'abime', 'ablama', 'absürd', 'acele']

# Machine Learning Models

### 1- Logistic Regression

#### a) for count vectorizer

In [111]:
# Logistic regression with default settings, trained on the count features.
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, y_train)

# Predicted labels for the held-out comments (count features).
y_pred = loj_model.predict(x_test_count)

In [116]:
# Per-class precision / recall / F1 plus overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.29      0.45        80
           1       0.81      1.00      0.90       243

    accuracy                           0.82       323
   macro avg       0.91      0.64      0.67       323
weighted avg       0.86      0.82      0.78       323



#### b) for tf-idf

In [117]:
# Train again on the TF-IDF features, reusing the `loj` estimator object;
# `loj_model` is rebound to the result of this fit.
loj_model = loj.fit(x_train_tf_idf,y_train)

In [118]:
# Predicted labels for the held-out comments (TF-IDF features).
y_pred = loj_model.predict(x_test_tf_idf)

In [119]:
# Per-class precision / recall / F1 plus overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.29      0.45        80
           1       0.81      1.00      0.90       243

    accuracy                           0.82       323
   macro avg       0.91      0.64      0.67       323
weighted avg       0.86      0.82      0.78       323



### 2- Naive Bayes

#### a) for count vectorizer

In [120]:
# Multinomial Naive Bayes with default settings, trained on the count features.
nb = MultinomialNB()
nb_model = nb.fit(x_train_count,y_train)

In [121]:
# Predicted labels for the held-out comments (count features).
y_pred = nb_model.predict(x_test_count)

In [122]:
# Per-class precision / recall / F1 plus overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.59      0.73        80
           1       0.88      0.99      0.93       243

    accuracy                           0.89       323
   macro avg       0.92      0.79      0.83       323
weighted avg       0.90      0.89      0.88       323



#### b) for tf-idf

In [123]:
# Train again on the TF-IDF features, reusing the `nb` estimator object;
# `nb_model` is rebound to the result of this fit.
nb_model = nb.fit(x_train_tf_idf,y_train)

In [124]:
# Predicted labels for the held-out comments (TF-IDF features).
y_pred = nb_model.predict(x_test_tf_idf)

In [125]:
# Per-class precision / recall / F1 plus overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.19      0.32        80
           1       0.79      1.00      0.88       243

    accuracy                           0.80       323
   macro avg       0.89      0.59      0.60       323
weighted avg       0.84      0.80      0.74       323



### 3-Random Forest

#### a) for count vectorizer

In [126]:
# Random forest trained on the count features. The forest is randomized, so a
# fixed random_state (same seed as the train/test split) is set to make the
# fitted model, and therefore the reported scores, repeatable across runs.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(x_train_count,y_train)

In [127]:
# Predicted labels for the held-out comments (count features).
y_pred = rf_model.predict(x_test_count)

In [128]:
# Per-class precision / recall / F1 plus overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.47      0.63        80
           1       0.85      0.99      0.92       243

    accuracy                           0.86       323
   macro avg       0.90      0.73      0.77       323
weighted avg       0.88      0.86      0.85       323



#### b) for tf-idf

In [129]:
# Train again on the TF-IDF features, reusing the `rf` estimator object;
# `rf_model` is rebound to the result of this fit.
rf_model = rf.fit(x_train_tf_idf,y_train)

In [130]:
# Predicted labels for the held-out comments (TF-IDF features).
y_pred = rf_model.predict(x_test_tf_idf)

In [131]:
# Per-class precision / recall / F1 plus overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.49      0.65        80
           1       0.86      1.00      0.92       243

    accuracy                           0.87       323
   macro avg       0.92      0.74      0.79       323
weighted avg       0.88      0.87      0.85       323



### 4 - Gradient Boosting (scikit-learn `GradientBoostingClassifier`, not the XGBoost library)

#### a) for count vectorizer

In [132]:
# Gradient boosting (scikit-learn's GradientBoostingClassifier; the variable name
# is historical) trained on the count features. A fixed random_state (same seed
# as the train/test split) keeps the fitted model repeatable across runs.
xgboost = GradientBoostingClassifier(random_state=42)
xgboost_model = xgboost.fit(x_train_count,y_train)

In [133]:
# Predicted labels for the held-out comments (count features).
y_pred = xgboost_model.predict(x_test_count)

In [134]:
# Per-class precision / recall / F1 plus overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.54      0.68        80
           1       0.87      0.98      0.92       243

    accuracy                           0.87       323
   macro avg       0.89      0.76      0.80       323
weighted avg       0.88      0.87      0.86       323



#### b) for tf-idf

In [135]:
# Train again on the TF-IDF features, reusing the `xgboost` estimator object;
# `xgboost_model` is rebound to the result of this fit.
xgboost_model = xgboost.fit(x_train_tf_idf,y_train)

In [136]:
# Predicted labels for the held-out comments (TF-IDF features).
y_pred = xgboost_model.predict(x_test_tf_idf)

In [137]:
# Per-class precision / recall / F1 plus overall accuracy on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.49      0.64        80
           1       0.85      0.99      0.92       243

    accuracy                           0.86       323
   macro avg       0.89      0.74      0.78       323
weighted avg       0.87      0.86      0.85       323

