# <u><em>SENTIMENT ANALYZER</em></u>

## Import libraries

In [16]:
import re
import joblib
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK resources this notebook relies on; packages that are already
# installed are reported as up-to-date and not downloaded again.
nltk.download('punkt')  # tokenizer data used by word_tokenize
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('wordnet')  # NOTE(review): WordNetLemmatizer is imported but not used in the visible cells - confirm this is still needed

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\piuso\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load our dataset

In [2]:
# Read the labelled reviews from the CSV file in the working directory.
DATA_FILE = 'data_reviews.csv'
data = pd.read_csv(DATA_FILE)

# Preview the first rows to confirm the expected columns are present.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(50000, 2)

## Function that preprocesses our dataframe

In [4]:
def preprocess_text(df, text_col):
    """Return a copy of ``df`` whose ``text_col`` column holds cleaned text.

    Each value in the column goes through these steps, in order:

    1. URLs (anything starting with ``http`` up to the next whitespace) are removed.
    2. Every character that is neither a word character nor whitespace is removed.
    3. The text is lower-cased.
    4. The text is tokenized and English stop words are dropped; the remaining
       tokens are joined back together with single spaces.

    The input frame is left untouched, so the call can be repeated on the raw
    data and always yields the same result.

    Known limitation: HTML tags are not stripped as a unit. Step 2 only deletes
    the ``<``, ``/`` and ``>`` characters, so ``<br />`` leaves a ``br`` token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    text_col : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; only ``text_col`` differs.
    """
    # Build the stop-word set once instead of once per row.
    stop_words = set(stopwords.words('english'))

    def _clean(text):
        # Clean one review; the step order matches the list in the docstring.
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()
        kept = [word for word in word_tokenize(text) if word.casefold() not in stop_words]
        return ' '.join(kept)

    # Work on a copy so the caller's frame is not modified as a side effect.
    cleaned = df.copy()
    # Missing reviews are treated as empty text rather than raising inside re.sub.
    cleaned[text_col] = cleaned[text_col].fillna('').astype(str).apply(_clean)
    return cleaned

## Preprocess our data frame by passing it to the function

In [6]:
# Clean the 'review' column of the loaded frame.
preprocessed_df = preprocess_text(df=data, text_col='review')

In [7]:
# Preview the first rows of the cleaned frame.
preprocessed_df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


## Support Vector Machines (SVM) model

In [20]:
# Hold out 20% of the reviews for evaluation; the fixed seed makes the split repeatable
reviews = preprocessed_df['review']
sentiments = preprocessed_df['sentiment']
train_data, test_data, train_labels, test_labels = train_test_split(
    reviews,
    sentiments,
    test_size=0.2,
    random_state=42,
)

# Build TF-IDF features: the vectorizer is fitted on the training split only
# and then reused, unchanged, to transform the held-out split
vectorizer1 = TfidfVectorizer(max_features=1000)
train_features = vectorizer1.fit_transform(train_data)
test_features = vectorizer1.transform(test_data)

# Fit a linear-kernel SVM on the training features
model = SVC(C=1.0, kernel='linear')
model.fit(train_features, train_labels)

# Score the held-out reviews with the fitted model
pred_labels = model.predict(test_features)

# Summarise performance: overall accuracy plus the per-class report
accuracy = accuracy_score(test_labels, pred_labels)
report = classification_report(test_labels, pred_labels)
print("Accuracy: {:.2f}%".format(accuracy*100))
print('Classification Report:\n', report)

Accuracy: 86.35%
Classification Report:
               precision    recall  f1-score   support

    negative       0.87      0.85      0.86      4961
    positive       0.86      0.88      0.87      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



## Function to preprocess text that we would like to use with our trained model

In [21]:
def preprocess(text):
    """Clean one piece of text the same way the training reviews were cleaned.

    The model was trained on text produced by ``preprocess_text``, so new text
    must go through the same steps to land in the same feature space: remove
    URLs, remove characters that are neither word characters nor whitespace,
    lower-case, tokenize, and drop English stop words. Digits are kept,
    because the training text keeps them too.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (empty string if none remain).
    """
    text = re.sub(r'http\S+', '', text)  # Remove URLs before punctuation is stripped
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters (same pattern as training)
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    stop_words = set(stopwords.words('english'))
    # casefold() mirrors the stop-word comparison used when cleaning the training data
    filtered_tokens = [word for word in tokens if word.casefold() not in stop_words]
    return ' '.join(filtered_tokens)  # Join the filtered tokens back into a string

## Inline prompt that asks the user for text, then analyzes it and prints the predicted sentiment

In [28]:
# Ask for a piece of text and clean it with the inference-time preprocessor
text = input("Enter your text: ")
cleaned_text = preprocess(text)
new_text = [cleaned_text]  # the vectorizer expects an iterable of documents

# Vectorize with the already-fitted TF-IDF vectorizer, then classify
new_features = vectorizer1.transform(new_text)
predicted_label = model.predict(new_features)
print(predicted_label)

Enter your text: Go there
['positive']


### Save the trained model to my local machine

In [18]:
# Save the fitted TF-IDF vectorizer as well as the classifier: the SVM can only
# score text that has been transformed by this same fitted vectorizer, so the
# model file alone is not enough to make predictions in a new session.
joblib.dump(vectorizer1, 'tfidf_vectorizer.joblib')
# Kept as the last expression so the cell still displays the model file name.
joblib.dump(model, 'svm_sentiment_model.joblib')

['svm_sentiment_model.joblib']