# Sentiment Analysis
## Based on comments
Data Source: https://archive.ics.uci.edu/dataset/331/sentiment+labelled+sentences

#### 1) Opening the data from files

In [1]:
# Folder holding the three labelled-sentence files, and the full relative
# path of each one (plain strings, folder prefix + file name).
files_folder = './data/'
file_names = ('amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt')
files_paths = [f'{files_folder}{name}' for name in file_names]

#### 2) Putting the file data into lists

In [2]:
# Sentences labelled 1 go to `positive_reviews`, sentences labelled 0 go to
# `negative_reviews`.
positive_reviews = []
negative_reviews = []

for file_path in files_paths:
    # The context manager closes the handle even if parsing raises.
    with open(file_path, 'r') as file:
        for line in file:
            # Each record is "<sentence><TAB><label>". Splitting on the last
            # tab avoids fixed character offsets, so it works regardless of
            # the padding before the tab or a missing final newline.
            sentence, separator, label = line.rstrip('\r\n').rpartition('\t')
            if not separator:
                # Blank or malformed line without a tab: nothing to label.
                continue

            # Drop the padding some files leave between sentence and tab.
            sentence = sentence.rstrip()
            if int(label):
                positive_reviews.append(sentence)
            else:
                negative_reviews.append(sentence)

#### 3) Doing permutations

##### Calling library function

In [3]:
from itertools import permutations

##### Positive

In [4]:
# Augmented positive set: every single review first, then every ordered pair
# of reviews taken from two different positions, joined by one space.
positive_reviews_permutations = []

for group_size in (1, 2):
    positive_reviews_permutations.extend(
        " ".join(group) for group in permutations(positive_reviews, group_size)
    )

# The raw list is no longer needed once the augmented list exists.
del positive_reviews

##### Negative

In [5]:
# Augmented negative set: every single review first, then every ordered pair
# of reviews taken from two different positions, joined by one space.
negative_reviews_permutations = []

for group_size in (1, 2):
    negative_reviews_permutations.extend(
        " ".join(group) for group in permutations(negative_reviews, group_size)
    )

# The raw list is no longer needed once the augmented list exists.
del negative_reviews

#### 4) Building the databases (Train/Test)

##### Import library

In [12]:
import pandas as pd

##### Process

In [13]:
# Stack positive texts first, then negative ones; the label column follows the
# same order (1 for every positive text, 0 for every negative text).
reviews = positive_reviews_permutations + negative_reviews_permutations
labels = [1] * len(positive_reviews_permutations) + [0] * len(negative_reviews_permutations)

df = pd.DataFrame({'review': reviews, 'sentiment': labels})

# Release the intermediate lists now that the frame holds the data.
del reviews, labels, positive_reviews_permutations, negative_reviews_permutations

In [14]:
# Count how many rows carry each label to check the class balance.
df['sentiment'].value_counts()

sentiment
1    1687500
0    1687500
Name: count, dtype: int64

#### 5) Clear texts

##### Importing library function

In [17]:
from re import sub

##### Clear function

In [18]:
def clean_text(text):
    """Return ``text`` converted to lower case.

    Punctuation and whitespace are left untouched.
    """
    lowered = text.lower()
    return lowered

##### Process

In [20]:
# Store the lower-cased version of every review in a new column.
df['clean_review'] = df['review'].apply(clean_text)
# Show three random rows to compare the raw and cleaned text side by side.
df[['review', 'clean_review']].sample(3)

Unnamed: 0,review,clean_review
684135,"Nice, spicy and tender. The Greek dressing was...","nice, spicy and tender. the greek dressing was..."
2147202,The Ngage is still lacking in earbuds. So medi...,the ngage is still lacking in earbuds. so medi...
1419244,I'm a big fan of this series mostly due to Ann...,i'm a big fan of this series mostly due to ann...


#### 6) Stop Words Removal

##### Importing libraries

In [38]:
# `nltk` itself must be imported: the download calls below use the package
# name directly.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the resources used later in the notebook: the stop-word corpus and
# the Punkt tokenizer data (both the 'punkt_tab' and 'punkt' packages).
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/stnz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/stnz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /home/stnz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

##### Set up

In [39]:
# Negation words that appear in NLTK's English stop-word list but are kept in
# the text: each entry listed here is excluded from `stop_words` below.
usefull_words = {"no", "nor", "not", "don", "don't", "ain", "aren", "aren't",
                 "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't",
                 "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't",
                 "isn", "isn't", "mightn", "mightn't", "mustn", "mustn't",
                 "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't",
                 "wasn", "wasn't", "weren", "weren't", "won", "won't",
                 "wouldn", "wouldn't"}

# Stop words to remove: the English list minus the negations above.
stop_words = set(stopwords.words('english')) - usefull_words

In [40]:
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The text is split with ``word_tokenize``; the surviving tokens are joined
    back together with single spaces, so punctuation tokens end up separated
    from the neighbouring words.
    """
    # Use the directly imported tokenizer: the bare `nltk` package name is
    # never imported in this notebook.
    tokens = word_tokenize(text)
    return " ".join(token for token in tokens if token not in stop_words)

##### Process

In [41]:
# Store the stop-word-free version of every cleaned review in a new column.
df['no_stopwords'] = df['clean_review'].apply(remove_stopwords)

In [43]:
# Show three random rows to compare the text before and after stop-word removal.
df[['clean_review', 'no_stopwords']].sample(3)

Unnamed: 0,clean_review,no_stopwords
3033409,"this is a chilly, unremarkable movie about an ...","chilly , unremarkable movie author living/work..."
1148681,gets a signal when other verizon phones won't....,gets signal verizon phones wo n't . beautiful ...
2362479,probably not in a hurry to go back. gave up tr...,probably not hurry go back . gave trying eat c...


#### 7) Stemming

I chose stemming because its computational complexity is lower than that of lemmatization,
which matters here because the permutations applied to the original data produce
a very large number of training instances.

##### Importing libraries

In [45]:
from nltk.stem import PorterStemmer

##### Set up

In [46]:
# One shared stemmer instance: building a new PorterStemmer on every call is
# wasted work when the function is applied to every row of the frame.
_stemmer = PorterStemmer()


def stem(text):
    """Return ``text`` with every token replaced by its Porter stem.

    The text is split with ``word_tokenize`` and the stemmed tokens are joined
    back together with single spaces.
    """
    # Use the directly imported tokenizer: the bare `nltk` package name is
    # never imported in this notebook.
    return " ".join(_stemmer.stem(word) for word in word_tokenize(text))

##### Process

In [47]:
# Store the stemmed version of every stop-word-free review in a new column.
df['stemmed'] = df['no_stopwords'].apply(stem)

In [49]:
# Show three random rows to compare the text before and after stemming.
df[['no_stopwords', 'stemmed']].sample(3)

Unnamed: 0,no_stopwords,stemmed
3040742,excellent starter wireless headset . cheap che...,excel starter wireless headset . cheap cheerle...
123438,"rather enjoyed . 's sad movie , good .","rather enjoy . 's sad movi , good ."
2283969,"acting sucks , music sucks , script sucks , pa...","act suck , music suck , script suck , pace suc..."


#### 8) Bag Of Words

This method is well suited to short texts (such as the ones used here).
It also has a lower computational cost than TF-IDF.

##### Importing library function

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

##### Process

In [None]:
# Learn the vocabulary from the stemmed reviews and build the document-term
# count matrix in one pass.
cv = CountVectorizer()
# `df` is the frame that holds the 'stemmed' column; the previously referenced
# `data_treino` is not defined anywhere in this notebook.
# NOTE(review): if a train/test split is introduced, fit on the training part only.
X = cv.fit_transform(df['stemmed'])