<h1>Sentiment Analysis</h1>

<h2>1. Preprocessing</h2>

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , accuracy_score , f1_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the tab-separated file (no header row) and name its two columns.
amazon = pd.read_csv(
    'amazon_cells_labelled.txt',
    sep='\t',
    header=None,
).set_axis(['Sentence', 'Class'], axis=1)
# Keep the original row number as an explicit column.
amazon['index'] = amazon.index

# Column order and empty scratch frame that the preprocessing cells below read.
columns = ['index', 'Class', 'Sentence']
temp = pd.DataFrame(columns=columns)
print(amazon.head())

                                            Sentence  Class  index
0  So there is no way for me to plug it in here i...      0      0
1                        Good case, Excellent value.      1      1
2                             Great for the jawbone.      1      2
3  Tied to charger for conversations lasting more...      0      3
4                                  The mic is great.      1      4


In [3]:
# Remove punctuation: delete every character that is neither a word character
# nor whitespace.
# - regex=True must be passed explicitly: since pandas 2.0 Series.str.replace
#   treats the pattern as a literal string by default, which would leave the
#   text untouched.
# - The raw string keeps \w and \s from being parsed as (invalid) Python
#   string escapes.
amazon['Sentence'] = amazon['Sentence'].str.replace(r'[^\w\s]', '', regex=True)
print(amazon.head())

                                            Sentence  Class  index
0  So there is no way for me to plug it in here i...      0      0
1                          Good case Excellent value      1      1
2                              Great for the jawbone      1      2
3  Tied to charger for conversations lasting more...      0      3
4                                   The mic is great      1      4


In [4]:
# Lower-case every sentence.
amazon = amazon.assign(Sentence=amazon['Sentence'].str.lower())
print(amazon.head())

                                            Sentence  Class  index
0  so there is no way for me to plug it in here i...      0      0
1                          good case excellent value      1      1
2                              great for the jawbone      1      2
3  tied to charger for conversations lasting more...      0      3
4                                   the mic is great      1      4


To remove leading and trailing whitespace, pandas `str.strip()` is used. The effect is not visible in the first five rows shown below, but it can be confirmed by inspecting the whole dataset.

In [5]:
# Trim leading and trailing whitespace from every sentence.
amazon = amazon.assign(Sentence=amazon['Sentence'].str.strip())
print(amazon.head())

                                            Sentence  Class  index
0  so there is no way for me to plug it in here i...      0      0
1                          good case excellent value      1      1
2                              great for the jawbone      1      2
3  tied to charger for conversations lasting more...      0      3
4                                   the mic is great      1      4


In [6]:
# Drop English stop words from every sentence.
#
# The stop-word list is loaded once into a set; calling
# stopwords.words('english') inside the comprehension re-evaluated it for
# every single token.
english_stopwords = set(stopwords.words('english'))


def remove_stopwords(sentence):
    """Tokenize `sentence`, drop English stop words, and re-join with single spaces."""
    return " ".join(w for w in word_tokenize(sentence) if w not in english_stopwords)


# Transform the whole column in one pass instead of growing a frame row by row:
# DataFrame.append was removed in pandas 2.0 and repeated appends are quadratic.
# The result has the column order given by `columns` and a fresh 0..n-1 index,
# matching what the row-wise append with ignore_index=True produced.
amazon = (
    amazon
    .assign(Sentence=amazon['Sentence'].map(remove_stopwords))[columns]
    .reset_index(drop=True)
)
# Leave the scratch frame empty, as later cells may still expect it.
temp = pd.DataFrame(columns=columns)
print(amazon.head())

  index Class                                           Sentence
0     0     0                    way plug us unless go converter
1     1     1                          good case excellent value
2     2     1                                      great jawbone
3     3     0  tied charger conversations lasting 45 minutesm...
4     4     1                                          mic great


In [7]:
ps = PorterStemmer()


def stem_sentence(sentence):
    """Tokenize `sentence`, Porter-stem each token, and re-join with single spaces."""
    return " ".join(ps.stem(w) for w in word_tokenize(sentence))


# Transform the whole column in one pass instead of growing a frame row by row:
# DataFrame.append was removed in pandas 2.0 and repeated appends are quadratic.
# The result has the column order given by `columns` and a fresh 0..n-1 index,
# matching what the row-wise append with ignore_index=True produced.
amazon = (
    amazon
    .assign(Sentence=amazon['Sentence'].map(stem_sentence))[columns]
    .reset_index(drop=True)
)
# Leave the scratch frame empty, as the next cell may still expect it.
temp = pd.DataFrame(columns=columns)
print(amazon.head())

  index Class                                          Sentence
0     0     0                     way plug us unless go convert
1     1     1                              good case excel valu
2     2     1                                      great jawbon
3     3     0  tie charger convers last 45 minutesmajor problem
4     4     1                                         mic great


In [8]:
wl = WordNetLemmatizer()


def lemmatize_sentence(sentence):
    """Tokenize `sentence`, lemmatize each token, and re-join with single spaces."""
    return " ".join(wl.lemmatize(w) for w in word_tokenize(sentence))


# NOTE(review): the lemmatizer runs on tokens that were already stemmed; in the
# printed sample the only visible change is 'us' -> 'u'. Consider lemmatizing
# instead of stemming rather than after it.
#
# Transform the whole column in one pass instead of growing a frame row by row:
# DataFrame.append was removed in pandas 2.0, and because the scratch frame was
# never reset in this cell, re-running it used to append onto the previous
# result and duplicate every row.
amazon = (
    amazon
    .assign(Sentence=amazon['Sentence'].map(lemmatize_sentence))[columns]
    .reset_index(drop=True)
)
# Reset the scratch frame so the cell can be re-run safely.
temp = pd.DataFrame(columns=columns)
print(amazon.head())

  index Class                                          Sentence
0     0     0                      way plug u unless go convert
1     1     1                              good case excel valu
2     2     1                                      great jawbon
3     3     0  tie charger convers last 45 minutesmajor problem
4     4     1                                         mic great


<h2>2. Classification</h2>

In [9]:
# Features: sentences as a NumPy unicode array; labels: 32-bit integers.
x = amazon['Sentence'].to_numpy().astype('U')
y = amazon['Class'].to_numpy().astype('int32')
print(type(x), type(y), sep='\n')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [10]:
# Word-level TF-IDF features. The vectorizer is fitted on the training
# sentences only; the test sentences are transformed with that fitted
# vectorizer.
vectorizer = TfidfVectorizer(
    analyzer='word',
)
X_train_v = vectorizer.fit_transform(X_train)
X_test_v = vectorizer.transform(X_test)

In [11]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the estimator
# itself, so construction and training are chained.
naive_bayes = MultinomialNB().fit(X_train_v, y_train)
y_pred = naive_bayes.predict(X_test_v)

In [12]:
# Logistic regression with default settings on the same TF-IDF features;
# fit() returns the estimator itself, so construction and training are chained.
Logistic_Regression = LogisticRegression().fit(X_train_v, y_train)
y_pred_2 = Logistic_Regression.predict(X_test_v)

<h2>3. Evaluation &amp; Comparison</h2>

In [13]:
# Test-set metrics for the Naive Bayes predictions (y_pred).
# Accuracy is computed once and reused for the error rate.
nb_accuracy = accuracy_score(y_test, y_pred)
nb_f1 = f1_score(y_test, y_pred)

print('Test Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Test F1-Score: ', nb_f1)
print('Test Accuracy: ', nb_accuracy)
print('Test Error: ', 1 - nb_accuracy)

Test Confusion Matrix:
[[75 18]
 [17 90]]
Test F1-Score:  0.8372093023255813
Test Accuracy:  0.825
Test Error:  0.17500000000000004


And here are the results obtained from Logistic Regression:

In [14]:
# Test-set metrics for the Logistic Regression predictions (y_pred_2).
# Accuracy is computed once and reused for the error rate.
lr_accuracy = accuracy_score(y_test, y_pred_2)
lr_f1 = f1_score(y_test, y_pred_2)

print('Test Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_2))
print('Test F1-Score: ', lr_f1)
print('Test Accuracy: ', lr_accuracy)
print('Test Error: ', 1 - lr_accuracy)

Test Confusion Matrix:
[[78 15]
 [25 82]]
Test F1-Score:  0.8039215686274509
Test Accuracy:  0.8
Test Error:  0.19999999999999996
