# Sentiment Analysis

## Dataset
The dataset contains columns `polarity`, `title`, and `text`. Polarity can be either 1 (negative) or 2 (positive).

**In order to save time, only 40,000 rows of the train data set and 40,000 rows of the test data set will be used.**

## Read in, clean, and preprocess data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# English stop words held in a set; preprocess_text filters tokens against it.
# NOTE(review): stopwords.words() reads the NLTK 'stopwords' corpus — confirm it is
# available in the runtime environment, since this notebook never downloads it.
stop_words = set(stopwords.words('english'))
# Single Snowball stemmer instance shared by every stem_text call.
stemmer = SnowballStemmer('english')

In [None]:
# Load the first 40,000 rows of the header-less training CSV and name its three columns
df = pd.read_csv(
    '/kaggle/input/amazon-reviews/train.csv',
    header=None,
    nrows=40000,
).set_axis(['Polarity', 'Title', 'Review'], axis=1)
df.shape

In [None]:
# Keep only the label and the review body, with a fresh 0..n-1 index
df = df.loc[:, ['Polarity', 'Review']].reset_index(drop=True)
df.head(10)

In [None]:
# Count the null values
df.isnull().sum()

**There are no null values.**

In [None]:
# Count polarities
df.Polarity.value_counts(normalize = True)

In [None]:
# Bar chart of how many reviews carry each polarity label
sns.set_palette(['red', 'green'])
ax = sns.countplot(x=df['Polarity'])
ax.set_title('Polarity Counts')
plt.show()

**The data is balanced.**

In [None]:
def preprocess_text(text):
    """Normalize a raw review into a space-separated string of content words.

    The text is lowercased, then links, text in square brackets, words
    containing digits, and all remaining non-letter characters are removed.
    Finally, tokens present in the module-level ``stop_words`` set are dropped.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) is treated as an empty review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # Missing values reach .apply() as float NaN, which has no .lower()
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # The passes are applied one after another because their order matters:
    # links and bracketed text must go while their punctuation is still intact,
    # and digit-bearing words must go before the digits themselves are stripped
    # (a single alternation turned "2nd" into "nd" instead of removing it).
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)

    # Replace, rather than delete, the remaining non-letters so neighbouring
    # words are not glued together ("good,bad" -> "good bad") and the pieces of
    # a contraction ("don't" -> "don t") can still be matched as stop words.
    text = re.sub(r'[^a-z\s]+', ' ', text)

    # Remove stop words; split()/join also collapses the whitespace runs
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Apply preprocess_text to every training review.
# The cleaned text overwrites the 'Review' column, so the raw text is no longer
# available in df after this cell runs.
df['Review'] = df['Review'].apply(preprocess_text)

In [None]:
df.head(10)

In [None]:
def stem_text(text):
    """Return `text` with each of its tokens replaced by its stem.

    The text is tokenized with nltk.word_tokenize, every token is passed
    through the shared module-level `stemmer`, and the stems are joined back
    together with single spaces.
    """
    return ' '.join(map(stemmer.stem, nltk.word_tokenize(text)))

In [None]:
# Apply stem_text to every training review.
# The result overwrites the 'Review' column, so re-running this cell on its own
# would stem text that has already been stemmed.
df['Review'] = df['Review'].apply(stem_text)

In [None]:
df.head(10)

## Exploratory Data Analysis (EDA)

In [None]:
# Tally every token across all reviews in the 'Review' column
count = Counter(token for review in df['Review'] for token in review.split())

# Tabulate the tallies, most frequent first, on a fresh 0..n-1 index
words = (
    pd.DataFrame(count.items(), columns=['Words', 'Frequency'])
    .sort_values('Frequency', ascending=False)
    .reset_index(drop=True)
)

# Put a 1-based rank in front of the word and its frequency
words.insert(0, 'Rank', words.index + 1)

words.head(20)


In [None]:
def generate_wordcloud(input):
    """Draw a word cloud of the words in the string `input` and show it.

    Nothing is returned; the figure is displayed with plt.show().
    """
    wordcloud = WordCloud(
        width=1500,
        height=800,
        max_words=500,
        background_color='black',
        colormap='coolwarm',
    ).generate(input)

    # Show the rendered cloud as an image on a figure without axes
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
# Word cloud built from every review labelled 2 (positive)
positive_words = " ".join(df.loc[df['Polarity'] == 2, 'Review'])
generate_wordcloud(positive_words)

In [None]:
# Word cloud built from every review labelled 1 (negative)
negative_words = " ".join(df.loc[df['Polarity'] == 1, 'Review'])
generate_wordcloud(negative_words)

In [None]:
def show_top_words(input, type, top_n=25):
    """Plot a horizontal bar chart of the most frequent words in a text.

    Parameters
    ----------
    input : str
        Text whose whitespace-separated words are counted.
    type : str
        Label inserted into the chart title (e.g. "Positive").
    top_n : int, default 25
        How many of the most frequent words to plot. The default reproduces
        the original fixed "Top 25" chart.

    Returns
    -------
    None
        The figure is displayed with plt.show().
    """
    words = input.split()
    top_words = pd.DataFrame(Counter(words).most_common(top_n), columns=['Term', 'Count'])

    # Note: set_theme is a global seaborn setting, not a per-figure one
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(12, 10))
    # NOTE(review): newer seaborn releases warn when `palette` is given without
    # `hue` — confirm against the installed version before changing this call.
    sns.barplot(x='Count', y='Term', data=top_words, palette='coolwarm')
    plt.title(f"Top {top_n} Words in {type} Reviews")
    plt.xlabel("Occurrences")
    plt.ylabel("Terms")
    plt.tight_layout()
    plt.show()

In [None]:
show_top_words(positive_words, "Positive")

In [None]:
show_top_words(negative_words, "Negative")

## Training model

In [None]:
# Instantiate the TF-IDF vectorizer: unigrams and bigrams, capped at 5,000 features
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit the vocabulary on the training reviews and transform them in one step
X_train_tfidf = tfidf_vectorizer.fit_transform(df['Review'])
y_train = df['Polarity']

# Initialize the classifier with a fixed seed so that a
# Restart & Run All reproduces the same model and the same scores
clf = LinearSVC(random_state=42)

# Train the classifier
clf.fit(X_train_tfidf, y_train)

In [None]:
# Load the first 40,000 test rows, name the columns, and keep label + review only
test_df = (
    pd.read_csv('/kaggle/input/amazon-reviews/test.csv', header=None, nrows=40000)
    .set_axis(['Polarity', 'Title', 'Review'], axis=1)
    .loc[:, ['Polarity', 'Review']]
    .reset_index(drop=True)
)

In [None]:
# Run the test reviews through the same two steps as the training reviews:
# cleaning first, then stemming
test_df['Review'] = test_df['Review'].apply(preprocess_text).apply(stem_text)

In [None]:
# Vectorize the test reviews with the vectorizer already fitted on the training
# reviews: transform() only, so the vocabulary is not refit on the test data.
X_test_tfidf = tfidf_vectorizer.transform(test_df['Review'])

In [None]:
# True labels of the test reviews
y_test = test_df['Polarity']
# Labels predicted by the trained classifier for the vectorized test reviews
y_pred = clf.predict(X_test_tfidf)

In [None]:
# Printing out results
print("Accuracy:", accuracy_score(y_test, y_pred))
# Concatenate rather than pass two arguments: print() would put a separator
# space in front of the report and push its header row one column to the right.
print("\nClassification Report:\n" + classification_report(y_test, y_pred))

In [None]:
def predict_sentiment(text):
    """Classify one raw review and return "Negative" or "Positive".

    The text goes through preprocess_text and stem_text, is vectorized with
    the fitted tfidf_vectorizer, and is labelled by the trained clf. A
    predicted label of 1 maps to "Negative"; any other label maps to
    "Positive".
    """
    cleaned = stem_text(preprocess_text(text))

    # The vectorizer expects an iterable of documents, hence the one-item list
    label = clf.predict(tfidf_vectorizer.transform([cleaned]))[0]

    return "Negative" if label == 1 else "Positive"

In [None]:
print(predict_sentiment("This has to be the worst software I've ever tried. Constant crashes and glitches!"))

In [None]:
print(predict_sentiment("Great value for the price! The product quality exceeded my expectations."))

In [None]:
print(predict_sentiment("Had high hopes after reading other reviews, but this just wasn't for me."))

In [None]:
print(predict_sentiment("Installation was a breeze and it integrated perfectly with my existing setup."))

In [None]:
print(predict_sentiment("Can't believe I wasted money on this. Extremely disappointing."))

In [None]:
print(predict_sentiment("The product description was misleading and the actual item was subpar."))

## Write a review

In [None]:
# Uncomment the lines below and run

# print("Write a review (and press Enter):")
# review = input()
# print("Sentiment: " + predict_sentiment(review))