## Python Sentiment Analysis Project with NLTK and Hugging Face Transformers:
Author: Thato Seluku

### Read in data:

In [None]:
# Typical data-science imports:
import pandas as pd  # data manipulation and analysis; handles dataframes efficiently
import numpy as np  # numerical operations and array support
import matplotlib.pyplot as plt  # static, interactive, and animated visualizations
import seaborn as sns  # high-level interface for statistical graphics

# Apply the 'ggplot' style to every matplotlib figure drawn in this notebook.
plt.style.use('ggplot')
import nltk  # tokenizing, POS tagging, NE chunking and the VADER analyzer used below

In [None]:
# Load the reviews CSV from the Kaggle input directory into a DataFrame.
REVIEWS_CSV = "/kaggle/input/reviews/Reviews.csv"
df = pd.read_csv(REVIEWS_CSV)

In [None]:
# Preview the first few rows to check which columns were loaded.
df.head()

In [None]:
# Report the dataset size before and after keeping only the first N_ROWS reviews.
# NOTE(review): the truncation is presumably there to keep later cells fast — confirm.
N_ROWS = 500
print(df.shape)
df = df.head(N_ROWS)
print(df.shape)

## Do a quick Exploratory Data Analysis (EDA):

In [None]:
# Bar chart of how many reviews received each star rating, ordered by rating.
score_counts = df['Score'].value_counts().sort_index()
ax = score_counts.plot(
    kind='bar',
    title='Count of reviews and stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
# Label the y-axis too, so the figure can be read on its own.
ax.set_ylabel('Number of reviews')
plt.show()


### Basic NLTK:

In [None]:
# Pick the review text stored under index label 50 as the running example.
# A single .loc lookup replaces the chained df[...][...] indexing.
example = df.loc[50, 'Text']
print(example)

In [None]:
# Split the example review into tokens with NLTK.
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data (e.g. 'punkt') to be installed — confirm for this environment.
tokens = nltk.word_tokenize(example)
# Show only the first ten tokens to keep the output short.
tokens[:10]

In [None]:
# Tag each token of the example review with its part of speech.
tagged = nltk.pos_tag(tokens)
# Preview the first ten tagged tokens.
tagged[:10]

In [None]:
# Group the POS-tagged tokens into named-entity chunks and pretty-print the whole result.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

To find out what the abbreviations above mean:
* https://www.kaggle.com/discussions/getting-started/186154

## Sentiment scoring using Vader:
VADER scores each piece of text (not each individual word) with negative (`neg`), neutral (`neu`) and positive (`pos`) values, plus an overall `compound` score.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm  # progress bar for the full-dataset scoring loop

# One analyzer instance, reused by every polarity_scores call in this notebook.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' data to be available — confirm.
sia = SentimentIntensityAnalyzer()

In [None]:
# Test on a clearly positive sentence: a compound score close to 1 means positive sentiment.
sia.polarity_scores("This is amazing")

In [None]:
# Test on a clearly negative sentence: a compound score close to -1 means negative sentiment.
sia.polarity_scores("This is terrible")

In [None]:
# Score the example review selected earlier in the notebook.
sia.polarity_scores(example)

In [None]:
# Score every review with VADER, collecting the results in a dict keyed by review Id.
res = {}
# Walk the two needed columns in lockstep; tqdm shows progress against the row count.
for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df)):
    res[myid] = sia.polarity_scores(text)
    

In [None]:
# Turn the {Id: scores} dict into a DataFrame with one row per review:
# the dict keys become columns first, so transpose, then move the Id out of the index.
vaders = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'Id'})
)
# Left-merge the original review columns onto the scores.
# The join key is named explicitly so the merge never silently picks up
# any other column name the two frames happen to share.
vaders = vaders.merge(df, how='left', on='Id')

In [None]:
# Preview the sentiment scores alongside the original review columns.
vaders.head()

## Plot Vader results:

In [None]:
# Bar chart of the VADER compound score grouped by star rating.
ax = sns.barplot(data=vaders, x='Score', y='compound')
# Title typo fixed: "compund" -> "compound".
ax.set_title("Ecom star review compound scores")
plt.show()

In [None]:
# Three side-by-side bar charts, one per VADER component (pos, neu, neg),
# each plotted against the star rating.
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
for panel_ax, (component, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=component, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.show()

### From above:
* It is evident that as compound score becomes more positive, the star reviews increase too
* Neutral is mostly flat
* Once the compound score becomes more negative, so do the star reviews

This is the expected result.

# Improving model accuracy: 

TODO: find an alternative. Loading the RoBERTa model currently fails with network issues (see the error output below).

In [99]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
#Note: the lines below may fail with a network error when huggingface.co cannot be reached, so they are disabled until an alternative is found:
#MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
#tokenizer = AutoTokenizer.from_pretrained(MODEL)
#model = AutoModelForSequenceClassification.from_pretrained(MODEL)

OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like cardiffnlp/twitter-roberta-base-sentiment is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [None]:
# Vader results on example:
#print(example)
#sia.polarity_scores(example)

In [None]:
#Run for Roberta Model:
#tokenizer(example, return_tensors='pt')