# Sentiment Analysis
## Based on comments
Data Source: https://archive.ics.uci.edu/dataset/331/sentiment+labelled+sentences

#### 1) Defining the data file paths

In [1]:
# Folder that holds the three labelled-sentence files, and the full path to each one.
files_folder = './data/'
file_names = ('amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt')
files_paths = [files_folder + file_name for file_name in file_names]

#### 2) Putting the file data into lists

In [2]:
# Read every labelled file and sort its sentences into two lists by label.
# Each line ends with a separator followed by a single-digit label (non-zero = positive).
positive_reviews = []
negative_reviews = []

for file_path in files_paths:
    # Number of trailing characters (separator + label) to drop once the newline is gone.
    # The IMDB file needs two more than the others.
    # NOTE(review): presumably extra padding before the separator - confirm against the data.
    tail_length = 4 if file_path.endswith('imdb_labelled.txt') else 2

    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(file_path, 'r') as file:
        for line in file:
            # Dropping the newline first lets a final line without one parse the same way.
            line = line.rstrip('\n')
            if not line:
                continue  # ignore blank lines instead of failing on them

            review = line[:-tail_length]
            if int(line[-1]):
                positive_reviews.append(review)
            else:
                negative_reviews.append(review)

#### 3) Doing permutations

##### Calling library function

In [3]:
from itertools import permutations

##### Positive

In [4]:
# Augment the positive set: every single review, then every ordered pair of two
# different list entries, each joined with a space. The pair count grows
# quadratically with the number of reviews.
positive_reviews_permutations = []

for group_size in (1, 2):
    positive_reviews_permutations.extend(
        " ".join(group) for group in permutations(positive_reviews, group_size)
    )

# The raw list is no longer needed once the permutations exist.
del positive_reviews

##### Negative

In [5]:
# Augment the negative set: every single review, then every ordered pair of two
# different list entries, each joined with a space. The pair count grows
# quadratically with the number of reviews.
negative_reviews_permutations = []

for group_size in (1, 2):
    negative_reviews_permutations.extend(
        " ".join(group) for group in permutations(negative_reviews, group_size)
    )

# The raw list is no longer needed once the permutations exist.
del negative_reviews

#### 4) Dividing the data (Train/Test)

##### Mixing the records

In [6]:
from random import seed, shuffle

# Fixed seed so the shuffled order is repeatable from run to run.
SHUFFLE_SEED = 777
seed(SHUFFLE_SEED)

# Shuffle in place; the positive list is shuffled first, then the negative one.
shuffle(positive_reviews_permutations)
shuffle(negative_reviews_permutations)

##### Positive

In [8]:
# Hold out the last quarter of the shuffled positive samples for testing.
positive_quarter = len(positive_reviews_permutations) // 4

# Slice on an explicit split index: with fewer than 4 samples the quarter is 0, and
# `[-0:]` / `[:-0]` would put every sample in test and leave train empty.
positive_split = len(positive_reviews_permutations) - positive_quarter
test_positive_reviews = positive_reviews_permutations[positive_split:]
train_positive_reviews = positive_reviews_permutations[:positive_split]

del positive_reviews_permutations

##### Negative

In [10]:
# Hold out the last quarter of the shuffled negative samples for testing.
negative_quarter = len(negative_reviews_permutations) // 4

# Slice on an explicit split index: with fewer than 4 samples the quarter is 0, and
# `[-0:]` / `[:-0]` would put every sample in test and leave train empty.
negative_split = len(negative_reviews_permutations) - negative_quarter
test_negative_reviews = negative_reviews_permutations[negative_split:]
train_negative_reviews = negative_reviews_permutations[:negative_split]

del negative_reviews_permutations

#### 5) Building the DataFrames (Train/Test)

In [12]:
import pandas as pd

##### Train

In [13]:
# Training frame: positive reviews first (label 1), then negative reviews (label 0).
train_df = pd.DataFrame({
    'review': train_positive_reviews + train_negative_reviews,
    'sentiment': [1] * len(train_positive_reviews) + [0] * len(train_negative_reviews),
})

###### Samples

In [14]:
# Class balance check: number of training rows per sentiment label.
train_df['sentiment'].value_counts()

sentiment
1    1687500
0    1687500
Name: count, dtype: int64

##### Test

In [15]:
# Test frame: positive reviews first (label 1), then negative reviews (label 0).
test_df = pd.DataFrame({
    'review': test_positive_reviews + test_negative_reviews,
    'sentiment': [1] * len(test_positive_reviews) + [0] * len(test_negative_reviews),
})

###### Samples

In [16]:
# Class balance check: number of test rows per sentiment label.
test_df['sentiment'].value_counts()

sentiment
1    562500
0    562500
Name: count, dtype: int64

#### 6) Clean texts

##### Importing library function

In [17]:
from re import sub

##### Clean function

In [18]:
def clean_text(text):
    """Return `text` converted to lower case.

    Lower-casing is the only normalisation applied; punctuation and
    whitespace are left untouched.
    """
    lowered = text.lower()
    return lowered

##### Train DF

In [20]:
# Add a lower-cased copy of every training review as `clean_review`.
train_df['clean_review'] = train_df['review'].apply(clean_text)
# Preview 3 random rows; no random_state is passed, so the rows shown vary between runs.
train_df[['review', 'clean_review']].sample(3)

Unnamed: 0,review,clean_review
684135,"Nice, spicy and tender. The Greek dressing was...","nice, spicy and tender. the greek dressing was..."
2147202,The Ngage is still lacking in earbuds. So medi...,the ngage is still lacking in earbuds. so medi...
1419244,I'm a big fan of this series mostly due to Ann...,i'm a big fan of this series mostly due to ann...


##### Test DF

In [21]:
# Add a lower-cased copy of every test review as `clean_review`.
test_df['clean_review'] = test_df['review'].apply(clean_text)
# Preview 3 random rows; no random_state is passed, so the rows shown vary between runs.
test_df[['review', 'clean_review']].sample(3)

Unnamed: 0,review,clean_review
268671,"After watching this film, I wanted to learn mo...","after watching this film, i wanted to learn mo..."
1077156,The movie has almost no action scenes in it an...,the movie has almost no action scenes in it an...
105829,So good I am going to have to review this plac...,so good i am going to have to review this plac...


#### 7) Stop Words Removal

##### Importing libraries

In [38]:
# `import nltk` is required: the `from nltk ... import` forms below do not bind the
# name `nltk`, so `nltk.download(...)` would raise NameError on a fresh kernel.
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the resources used in this section: the stop-word lists and the
# punkt tokenizer data (both the `punkt_tab` and `punkt` packages).
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/stnz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/stnz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /home/stnz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

##### Set up

In [39]:
# Negation words that carry sentiment and must NOT be removed as stop words.
# Fixes over the previous list: "don'" was a typo for "don", "haven" was missing
# although "haven't" was listed, and "shan't" appeared twice.
# NOTE(review): "won" (the fragment paired with "won't") is not listed and is
# therefore still treated as a stop word - confirm that this is intended.
usefull_words = {"no", "nor", "not", "don", "don't", "ain", "aren", "aren't",
                 "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't",
                 "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't",
                 "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't",
                 "shan", "shan't", "shouldn", "shouldn't", "wasn",
                 "wasn't", "weren", "weren't", "won't", "wouldn", "wouldn't"}

# English stop words minus the negations kept above.
stop_words = set(stopwords.words('english')) - usefull_words

In [40]:
def remove_stopwords(text):
    """Tokenize `text` and return it with stop words removed.

    The text is split with NLTK's `word_tokenize`; every token that is not in
    the module-level `stop_words` set is kept, and the kept tokens are joined
    back together with single spaces. The membership test is exact, so the
    caller is expected to pass text in the same case as `stop_words`.
    """
    # Use the directly imported `word_tokenize` so the function does not depend
    # on the `nltk` module name being bound in the notebook namespace.
    tokens = word_tokenize(text)
    return " ".join(token for token in tokens if token not in stop_words)

##### Train

In [41]:
# Tokenize each cleaned training review and drop stop words (runs row by row via `apply`).
train_df['no_stopwords'] = train_df['clean_review'].apply(remove_stopwords)

In [43]:
# Preview 3 random training rows before/after stop-word removal (unseeded sample).
train_df[['clean_review', 'no_stopwords']].sample(3)

Unnamed: 0,clean_review,no_stopwords
3033409,"this is a chilly, unremarkable movie about an ...","chilly , unremarkable movie author living/work..."
1148681,gets a signal when other verizon phones won't....,gets signal verizon phones wo n't . beautiful ...
2362479,probably not in a hurry to go back. gave up tr...,probably not hurry go back . gave trying eat c...


##### Test

In [42]:
# Tokenize each cleaned test review and drop stop words (runs row by row via `apply`).
test_df['no_stopwords'] = test_df['clean_review'].apply(remove_stopwords)

In [44]:
# Preview 3 random test rows before/after stop-word removal (unseeded sample).
test_df[['clean_review', 'no_stopwords']].sample(3)

Unnamed: 0,clean_review,no_stopwords
671391,"to sum the film up, ""breeders"" is a terrible, ...","sum film , `` breeders '' terrible , cheaply m..."
464826,kids pizza is always a hit too with lots of gr...,kids pizza always hit lots great side dish opt...
692617,i came out of it feeling angry. stay away from...,"came feeling angry . stay away store , careful ."
