# NLP - Predicting Positive or Negative Reviews 

## 1) Creating Functions For Features

In [19]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("my_datacleaned_file.tsv", sep='\t')
data.columns = ['Review', 'Positive/Negative rating', 'Cleaned_Review']

#defining function to establish features - using length of text and punctuation% in review to determine if review is positive or negative
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100 #output number punct as percentage of total words in review

data['body_len'] = data['Review'].apply(lambda x: len(x) - x.count(" ")) #function to count number of words in original reviews
data['punct%'] = data['Review'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text) #splits each word apart in a review for tokenizing
    text = [ps.stem(word) for word in tokens if word not in stopwords] #combine review altogether w/o stopwords
    return text

#future work- make feature function for counting number of specific type of punctuation (! or ?)

## 2) Split into Train/Test

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['Review', 'body_len', 'punct%']], data['Positive/Negative rating'], test_size=0.2)

## 3) Vectorize data

Turn the text into numerical points for the model to easily use for processing.

In [21]:
data_vector = TfidfVectorizer(analyzer=clean_text) #  'char', 'char_wb', 'word', or a callable function., use the clean text to find punctuation and length from
data_vector_fit = data_vector.fit(X_train["Review"])

## error on parameters

#fixed by putting function of text with removed punctuation into analyzer - 9/12

data_train = data_vector_fit.transform(X_train["Review"])
data_test = data_vector_fit.transform(X_test["Review"])

#accepts list of object - connect data based on indicies 
#turn matrix to dataframe
#put data in dataframe side by side
X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(data_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(data_test.toarray())], axis=1)

X_train_vect.head()

#some words are not recogized to be vectorized which is ok

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,1432,1433,1434,1435,1436,1437,1438,1439,1440,1441
0,96,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,74,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,32,9.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,44,9.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4) Run Model and Evaluate Performance

Use a classification model- Random Forest to classify whether review is positive or negative

Analyze how well model worked using precision, recall, and accuracy

In [31]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support 

    
#predicting reviews that are negative from file 

# Convert column names to strings in X_train_vect and X_test_vect gets rid of error, must put before code
X_train_vect.columns = X_train_vect.columns.astype(str)
X_test_vect.columns = X_test_vect.columns.astype(str)

# Initialize the RandomForestClassifier
my_rf = RandomForestClassifier(n_estimators=155, max_depth=None, n_jobs=-1)

# Train the model
my_rf_model = my_rf.fit(X_train_vect, y_train)
my_y_pred = my_rf_model.predict(X_test_vect)

# Calculate precision, recall, and fscore
precision, recall, fscore, train_support = precision_recall_fscore_support(y_test, my_y_pred, pos_label="0", average="binary")

# Calculate accuracy
accuracy = (my_y_pred == y_test).sum() / len(my_y_pred)

# Output results in categorizing whether reviews are positive or negative reviews

print("Classifying for Negative Reviews")
print("Precision: {} | Recall: {} | Accuracy: {}".format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

print("As a percentage: {}% | {}% | {}%".format(
    round(precision*100,3), round(recall*100,3), round(accuracy*100,3)))


Classifying for Negative Reviews
Precision: 0.726 | Recall: 0.842 | Accuracy: 0.761
As a percentage: 72.65% | 84.158% | 76.119%


In [32]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support 

#predicting reviews that are positive from file 

# Convert column names to strings in X_train_vect and X_test_vect gets rid of error, must put before code
X_train_vect.columns = X_train_vect.columns.astype(str)
X_test_vect.columns = X_test_vect.columns.astype(str)

# Initialize the RandomForestClassifier
my_rf = RandomForestClassifier(n_estimators=155, max_depth=None, n_jobs=-1)

# Train the model
my_rf_model = my_rf.fit(X_train_vect, y_train)
my_y_pred = my_rf_model.predict(X_test_vect)

# Calculate precision, recall, and fscore
precision, recall, fscore, train_support = precision_recall_fscore_support(y_test, my_y_pred, pos_label="1", average="binary")

# Calculate accuracy
accuracy = (my_y_pred == y_test).sum() / len(my_y_pred)

# Output results in categorizing whether reviews are positive or negative reviews
print("Classifying for Positive Reviews")
print("Precision: {} | Recall: {} | Accuracy: {}".format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

print("As a percentage: {}% | {}% | {}%".format(
    round(precision*100,3), round(recall*100,3), round(accuracy*100,3)))


Classifying for Positive Reviews
Precision: 0.817 | Recall: 0.67 | Accuracy: 0.761
As a percentage: 81.707% | 67.0% | 76.119%
