# Sentiment Analysis of Open Text Comments
> <b> Created by: Alexis Soto-Colorado
 

In [None]:
#Loading Libraries 

import re, nltk
import numpy as np
import pandas as pd
import csv
import qgrid
import sys
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from datetime import datetime
from ipywidgets import widgets

In [None]:
%matplotlib inline

<b> 1. Select the date on which to run the sentiment analysis

In [None]:
# Loading Comment Data from Survey Gizmo

# Read the survey export; the path is a placeholder the user must replace.
test_data_df = pd.read_csv('/YourCSV.csv', 
                           encoding="ISO-8859-1", engine='python')


# Show the comments in an interactive grid. The widget is stored as an
# attribute on the qgrid module so that a later cell can read it back.
qgrid.test = qgrid.show_grid(test_data_df, show_toolbar=True)
qgrid.test

In [None]:
# Replace the frame with the grid's current contents, i.e. including any
# changes made interactively in the widget shown above.
test_data_df = qgrid.test.get_changed_df()

In [None]:
# Display a "Save Selection" button.
# NOTE(review): no click handler is attached to this button in the visible
# cells, so pressing it does not appear to trigger anything — confirm.
boton = widgets.Button(description="Save Selection")

boton

<b> 2. Comments to be analyzed

In [None]:
# Eliminate the date: keep only the column at position 1 (the second column).
# Presumably the first column holds the date — verify against the CSV layout.
test_data_df = test_data_df.iloc[:,1:2]
test_data_df

In [None]:
# Rename the single remaining column to "Text", the name every later cell
# uses to reach the comments.
columns_names = ["Text"]
test_data_df.columns = columns_names
test_data_df

In [None]:
# Load the labelled training comments (placeholder path; replace with the
# real file). Later cells read its 'Text' and 'Sentiment' columns.
train_data_df = pd.read_csv('/YourTrainData.csv',
                            encoding="ISO-8859-1", engine='python')

# Optional positional trim to the Sentiment and Text columns, left disabled.
#train_data_df = train_data_df.iloc[:,1:3]

# Display the loaded frame as the cell output.
train_data_df



In [None]:
# Discard rows that have no comment text, in both the training and test frames.
train_data_df = train_data_df.dropna(subset=['Text'])
test_data_df = test_data_df.dropna(subset=['Text'])

In [None]:
# Answers considered to carry no sentiment.
# NOTE(review): this list is not referenced by any other visible cell; the
# filtering cells that follow spell out their own literals, and those do not
# match this list exactly — confirm which set is intended.

eliminate_comment = ["no", 
                     "No", 
                     "NO", 
                     "Thank you",
                     "Thank You", 
                     "thank you", 
                     "THANKS", 
                     "thanks",
                     "No, thanks.",
                     "No, thanks",
                     "No, Thank you"]

In [None]:
# Drop test comments that are only a bare "no" / "thanks" style answer.
# One membership mask replaces the chain of single-value filters, which
# rebuilt the frame once per literal and listed some literals twice.
# Matching is exact (case- and punctuation-sensitive), as before.
no_content_answers = [
    "No",
    "no",
    "Thank you",
    "thank you",
    "THANKS",
    "thanks",
    "Thanks",
    "No, thanks.",
    "No, Thanks",
    "No, thanks",
    "No, thank you.",
    "No thank you",
    "No, thank you!",
    "No.",
    "Nope",
    "Nope!",
]
test_data_df = test_data_df[~test_data_df.Text.isin(no_content_answers)]

In [None]:
# Drop training comments that are only a bare "no" / "thanks" style answer.
# One membership mask replaces the chain of single-value filters, which
# rebuilt the frame once per literal and listed "Thanks" twice.
# Matching is exact (case- and punctuation-sensitive), as before.
no_content_answers = [
    "No",
    "no",
    "Thank you",
    "thank you",
    "THANKS",
    "Thanks",
    "thanks",
    "No, thanks.",
    "No, Thanks",
    "No, thanks",
    "No, thank you.",
    "No thank you",
    "No thank you.",
    "No, thank you!",
    "No thank you!",
    "Thanks.",
    "Nope",
    "Nope!",
    "No.",
]
train_data_df = train_data_df[~train_data_df.Text.isin(no_content_answers)]

In [None]:
# Average number of space-separated pieces per training comment
words_per_comment = [len(comment.split(" ")) for comment in train_data_df.Text]
print(np.mean(words_per_comment))

In [None]:
# Based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
# Single Porter stemmer instance shared by the tokenizer defined below.
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, in the original order.

    Args:
        tokens: iterable of word strings.
        stemmer: object exposing a ``stem(word)`` method.

    Returns:
        A list holding ``stemmer.stem(token)`` for each token.
    """
    return [stemmer.stem(word) for word in tokens]

In [None]:
def tokenize(text):
    """Turn a raw comment into a list of stemmed word tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is split with NLTK's word tokenizer, and each token is stemmed
    with the module-level ``stemmer``.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = nltk.word_tokenize(letters_only)
    return stem_tokens(words, stemmer)

In [None]:
# Bag-of-words vectorizer: word-level features produced by the custom
# tokenize() function, lower-casing enabled, English stop words removed and
# the vocabulary capped at 85 terms.
# NOTE(review): tokenize() stems its output, so the stop-word list may not
# match the stemmed forms exactly — confirm this is acceptable.
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 85
)

In [None]:
# Learn the vocabulary from the training and test comments together and build
# one count matrix. Training rows come first, test rows follow; later cells
# rely on that order when slicing by len(train_data_df).
corpus_data_features = vectorizer.fit_transform(
    train_data_df.Text.tolist() + test_data_df.Text.tolist())

In [None]:
# Convert the count matrix to a dense NumPy array and print its shape
# (one row per comment, one column per vocabulary term).
corpus_data_features_nd = corpus_data_features.toarray()
print(corpus_data_features_nd.shape)

In [None]:
# List the vocabulary terms in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() and fall back only on older releases.
try:
    vocab = vectorizer.get_feature_names_out().tolist()
except AttributeError:
    vocab = vectorizer.get_feature_names()
print(vocab)

In [None]:
# Column sums: total number of occurrences of each vocabulary term across
# all training and test comments.
dist = np.sum(corpus_data_features_nd, axis=0)

In [None]:
# Print every vocabulary term preceded by its total count.
for tag, count in zip(vocab, dist):
    print(count, tag)

In [None]:
###################Train Model##################################

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. Fall back only for very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Split the labelled rows (the first len(train_data_df) rows of the feature
# matrix) into 85% for fitting and the remainder for evaluation. The fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    corpus_data_features_nd[0:len(train_data_df)],
    train_data_df.Sentiment,
    train_size = 0.85,
    random_state = 1234)


In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression (default settings) on the 85% training split.
log_model = LogisticRegression()
log_model = log_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the sentiment of the held-out rows with the model fitted above.
y_pred = log_model.predict(X_test)

In [None]:
# Print scikit-learn's text classification report comparing the held-out
# labels with the predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


In [None]:
### Train Classifiers ###

In [None]:
# Refit a fresh logistic regression on every labelled row (no hold-out).
# This rebinds log_model, replacing the model fitted on the 85% split.
log_model = LogisticRegression()
log_model = log_model.fit(X=corpus_data_features_nd[0:len(train_data_df)], y=train_data_df.Sentiment)

In [None]:
# get predictions
# The rows after the first len(train_data_df) are the unlabelled test
# comments, in the same order as test_data_df.
test_pred = log_model.predict(corpus_data_features_nd[len(train_data_df):])

In [None]:
# Choose which predictions to display, in random order.
# The original cell asked for exactly 26 rows (presumably the size of the data
# set at the time — confirm). The size is now clamped to the number of
# predictions, because random.sample raises ValueError when asked for more
# items than the population holds.

import random
sample_size = min(26, len(test_pred))
spl = random.sample(range(len(test_pred)), sample_size)

In [None]:
# Empty placeholder frame.
# NOTE(review): this value is never read in the visible cells; the name is
# rebound to a list further down — the cell looks removable, confirm.

lista_sentimientos = pd.DataFrame()

In [None]:
# print text and labels
# spl holds positions into test_pred, so the comments must be selected by
# position as well (.iloc). test_data_df still carries its pre-filter index
# labels, so label-based Text[spl] picked the wrong rows or raised KeyError.
for text, sentiment in zip(test_data_df.Text.iloc[spl], test_pred[spl]):
    print(sentiment, text)
    

<b> 3. Classification Rate of the Sentiment Analysis:

In [None]:
# log_model was refit on every labelled row a few cells earlier, so scoring it
# on X_test would grade rows it was trained on. Report the accuracy of the
# held-out predictions (y_pred), which were made before that refit.
print("Classification Rate:", np.mean(y_pred == np.asarray(y_test)))


In [None]:
# Making a Function 
def sentimientos(text, sentiment):
    """Pair each sampled prediction with the comment it was made for.

    Both parameters are accepted for compatibility with the existing call but
    are not used: the rows come from the notebook globals ``test_data_df``,
    ``test_pred`` and ``spl``.

    Returns:
        A list of ``[sentiment, text]`` pairs, one per position in ``spl``.
    """
    # spl holds positions, so select the comments with .iloc; test_data_df
    # keeps its pre-filter index labels, which plain [] would look up instead.
    sampled_comments = test_data_df.Text.iloc[spl]
    sampled_labels = test_pred[spl]
    return [
        [label, comment]
        for comment, label in zip(sampled_comments, sampled_labels)
    ]
    

In [None]:
# Build the [sentiment, text] pairs for the sampled comments.
# `text` here is the variable left over from the printing loop in an earlier
# cell; the function does not use either argument.
lista_sentimientos = sentimientos(text, test_pred)

In [None]:
# Turn the pairs into a two-column frame and add an Add_Train flag, False for
# every row, which a later cell uses to pick rows for the training data.
labels = ["Sentiment", "Text"]
df_sentimientos = pd.DataFrame.from_records(lista_sentimientos, columns=labels)
df_sentimientos = df_sentimientos.assign(Add_Train=False)

In [None]:
#Dealing with Encoding Issues
# Remove every run of non-ASCII characters (anything outside \x00-\x7F) from
# the frame's string values, in place.
df_sentimientos.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)


<b> 4. Comments by Sentiment:
 * 1 is positive and 0 is negative.
 * The results can be modified interactively.
 * To save a modified row to the training data, the user needs to tick the `Add_Train` column for that row.

In [None]:
#View Comment Results
# Show the predictions in an interactive grid. The widget is stored as an
# attribute on the qgrid module so that the next cell can read it back.
qgrid.sentimientos = qgrid.show_grid(df_sentimientos, show_toolbar=True)
qgrid.sentimientos

In [None]:
# Read back the grid contents, including interactive edits.
# NOTE: this rebinds the name `sentimientos`, which until now referred to the
# helper function, to a DataFrame.
sentimientos = qgrid.sentimientos.get_changed_df()

In [None]:
#sentimientos

<b> 5. Entries to be added to the training data

In [None]:
# Keep only the rows whose Add_Train flag is True and display them.
add_train = sentimientos.loc[sentimientos['Add_Train'] == True ] 
add_train


In [None]:
# Append the selected rows after the existing training rows. The two frames
# do not share all columns (add_train also has Add_Train), so concat leaves
# missing values where a frame lacks a column.

frames = [train_data_df, add_train]
train_data = pd.concat(frames)

In [None]:
# Build a date-stamped name of the form train_data_YYYY-MM-DD
today_stamp = datetime.now().strftime('%Y-%m-%d')
nombre_data = 'train_data' + '_' + today_stamp
nombre_data[0:21]

In [None]:
#Drop Duplicate
# keep=False discarded every copy of a repeated comment, removing it from the
# training data altogether. keep='last' retains one row per comment; because
# the reviewed additions are concatenated after the existing rows, the newly
# reviewed label is the one that survives.
train_data = train_data.drop_duplicates(subset=['Text'], keep='last')

In [None]:
#Eliminate the Column Add_Train
# Select the two wanted columns by name. The previous positional slice 1:3
# only returned Sentiment and Text when the frame happened to start with
# exactly one extra column.
train_data = train_data[["Sentiment", "Text"]]

In [None]:
#Save the train data
# The path is a placeholder and must be changed before running.
# NOTE(review): the final cell of the notebook writes df_sentimientos to the
# same placeholder path, so reusing one name for both would overwrite this file.

train_data.to_csv('/YourFileDirectory/Yourcsvname.csv', sep=",")

<b> 6. Data is saved in the folder as a CSV file
   * In order to use the new training data, please run the code from the beginning.

In [None]:
# Save the predicted sentiments as CSV (placeholder path; replace before use).
# NOTE(review): this is the same placeholder path used for the training data
# export above — give the two files different names.
df_sentimientos.to_csv('/YourFileDirectory/Yourcsvname.csv', sep=",")