# Sentiment Analyzer

In [13]:
import pandas as pd

## Reading the files using pandas (the datasets are reviews for restaurants on yelp, movies on IMDB and products on amazon )

In [14]:
filepath_dict = {'yelp':   'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb':   'data/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)    
df = pd.concat(df_list)
print(df.iloc[0])                 

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


# Yelp
## splitting the data into test and train data

In [15]:
from sklearn.model_selection import train_test_split
df_yelp = df[df['source'] == 'yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

## getting the count vector for the sentences, the result shows the count vector for the train data

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

## Calculating the accuracy for the data from yelp

In [17]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy for yelp:", score)

Accuracy for yelp: 0.796


# Amazon
## splitting the data into test and train data

In [18]:
df_amazon = df[df['source'] == 'amazon']
sentences2 = df_amazon['sentence'].values
y2 = df_amazon['label'].values
sentences2_train, sentences2_test, y2_train, y2_test = train_test_split(sentences2, y2, test_size=0.25, random_state=1000)

## getting the count vector for the sentences, the result shows the count vector for the train data

In [19]:
vectorizer2 = CountVectorizer()
vectorizer2.fit(sentences2_train)
X2_train = vectorizer2.transform(sentences2_train)
X2_test  = vectorizer2.transform(sentences2_test)
X2_train

<750x1546 sparse matrix of type '<class 'numpy.int64'>'
	with 6817 stored elements in Compressed Sparse Row format>

## Calculating the accuracy for the data from amazon

In [20]:
classifier2 = LogisticRegression()
classifier2.fit(X2_train, y2_train)
score2 = classifier2.score(X2_test, y2_test)
print("Accuracy for amazon:", score2)

Accuracy for amazon: 0.796


# IMDB
## splitting the data into test and train data

In [21]:
df_imdb = df[df['source'] == 'imdb']
sentences3 = df_imdb['sentence'].values
y3 = df_imdb['label'].values
sentences3_train, sentences3_test, y3_train, y3_test = train_test_split(sentences3, y3, test_size=0.25, random_state=1000)

## getting the count vector for the sentences, the result shows the count vector for the train data

In [22]:
vectorizer3 = CountVectorizer()
vectorizer3.fit(sentences3_train)
X3_train = vectorizer3.transform(sentences3_train)
X3_test  = vectorizer3.transform(sentences3_test)
X3_train

<561x2505 sparse matrix of type '<class 'numpy.int64'>'
	with 8413 stored elements in Compressed Sparse Row format>

## Calculating the accuracy for the data from IMDB

In [23]:
classifier3 = LogisticRegression()
classifier3.fit(X3_train, y3_train)
score3 = classifier3.score(X3_test, y3_test)
print("Accuracy for IMDB:", score3)

Accuracy for IMDB: 0.7486631016042781
