# Importing libraries

In [1]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

# Star import kept last, as in the original, so name-shadowing order is unchanged.
from tensorflow.keras.layers import *

# Importing Dataset

In [2]:
# Load only the review text and its rating from the CSV; other columns are skipped.
data = pd.read_csv("reviews.csv", usecols = ["Review","Rating"])
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,Review,Rating
0,"Great music service, the audio is high quality...",5
1,Please ignore previous negative rating. This a...,5
2,"This pop-up ""Get the best Spotify experience o...",4
3,Really buggy and terrible to use as of recently,1
4,Dear Spotify why do I get songs that I didn't ...,1


In [3]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  61594 non-null  object
 1   Rating  61594 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 962.5+ KB


# Cleaning Data

In [4]:
# Fetch the NLTK stop-word corpus read by stopwords.words("english") in the cleaning cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Niki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Fetch the WordNet corpus; presumably this is what WordNetLemmatizer (cleaning cell) reads - confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Niki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Fetch the 'omw-1.4' package.
# NOTE(review): presumably a companion resource needed by the WordNet lemmatizer - confirm it is still required.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Niki\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Normalise the review text in place: replace unwanted characters, lowercase,
# drop English stop words and lemmatize every remaining token.
length = len(data.index)  # row count; kept in case later cells reference it
lem = WordNetLemmatizer()

# Build the stop-word set ONCE. The original rebuilt set(stopwords.words("english"))
# for every single token of every review, which dominated the runtime of this cell.
stop_words = set(stopwords.words("english"))

# Every character that is not an ASCII letter, digit, '-', '+' or space becomes a space.
non_text = re.compile("[^a-zA-Z0-9-+ ]")


def clean_review(text):
    """Return the cleaned form of one review string.

    Steps (same result as the original five chained ``apply`` passes):
    1. replace every character outside ``[a-zA-Z0-9-+ ]`` with a space;
    2. lowercase and split on whitespace (which also collapses repeated spaces);
    3. drop tokens found in the English stop-word set;
    4. lemmatize each surviving token and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-separated, lowercased, lemmatized tokens.
    """
    tokens = non_text.sub(" ", text).lower().split()
    return " ".join(lem.lemmatize(tok) for tok in tokens if tok not in stop_words)


# Single pass over the column instead of five.
data["Review"] = data["Review"].apply(clean_review)

In [None]:
# Inspect the review column after cleaning.
print(data["Review"])

# Exploratory Data Analysis (EDA)

In [None]:
# Distribution of the raw integer ratings, before they are bucketed into sentiment labels.
# discrete=True uses a bin width of 1 and centres each bar on its rating value,
# so the bars line up with the integer ratings instead of arbitrary bin edges.
sns.histplot(data["Rating"], discrete = True)
# Label the figure so it stands alone, matching the labelled histogram drawn after bucketing.
plt.xlabel("Rating")
plt.ylabel("No of Ratings")
plt.title("Raw Rating Distribution")
plt.show()

In [None]:
def transform_rating(rating):
    """Map a star rating to a coarse sentiment label.

    Parameters
    ----------
    rating : int
        Star rating; 1 through 5 are recognised.

    Returns
    -------
    str or None
        "Positive" for 4 or 5, "Neutral" for 3, "Negative" for 1 or 2.
        Any other value matches no bucket and yields None.
    """
    # Buckets are checked in order; membership uses ==, like the original comparisons.
    buckets = (
        ("Positive", (4, 5)),
        ("Neutral", (3,)),
        ("Negative", (1, 2)),
    )
    for label, stars in buckets:
        if rating in stars:
            return label
    return None

In [None]:
# Bucket the numeric star ratings into sentiment labels.
# Guard on dtype so re-running this cell is a no-op: transform_rating returns None
# for anything that is not a rating from 1 to 5, so a second pass over the
# already-converted labels would silently replace the whole column with None.
if pd.api.types.is_numeric_dtype(data["Rating"]):
    data["Rating"] = data["Rating"].apply(transform_rating)

In [None]:
# Distribution of the rating column after it was mapped to sentiment labels.
# histplot returns the Axes it drew on, so labels and title are set on it directly.
ax = sns.histplot(data["Rating"])
ax.set(xlabel = "Ratings", ylabel = "No of Ratings", title = "Ratings Distribution")
plt.show()

# Splitting the dataset

In [None]:
# Features are the cleaned review texts; targets are the rating column.
X, Y = data["Review"], data["Rating"]

In [None]:
# Hold out 15% of the reviews as a test set; random_state pins the shuffle so the
# split is reproducible across runs.
# train_test_split is imported with the other dependencies in the first cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.15, random_state = 0
)

# WordCloud

In [None]:
# Word cloud built from every review labelled "Positive".
# A boolean mask replaces the original per-row X[i] / Y[i] lookups: it selects the
# same reviews in the same order, avoids a slow Python-level loop over the Series,
# and does not depend on the index being exactly 0..n-1.
positive = X[Y == "Positive"].tolist()
plt.figure(figsize=(10,10))
wc = WordCloud(width = 1000, height = 600,background_color = "white", max_words = 2500, min_font_size = 12 ).generate(" ".join(positive))
plt.imshow(wc)
plt.axis("off")  # pixel-coordinate ticks carry no meaning on a word cloud
plt.show()  # render the figure instead of echoing the AxesImage repr

In [None]:
# Word cloud built from every review labelled "Negative".
# A boolean mask replaces the original per-row X[i] / Y[i] lookups: it selects the
# same reviews in the same order, avoids a slow Python-level loop over the Series,
# and does not depend on the index being exactly 0..n-1.
negative = X[Y == "Negative"].tolist()
plt.figure(figsize = (10,10))
wc = WordCloud(width = 1000, height = 600,background_color = "white", max_words = 2500, min_font_size = 12 ).generate(" ".join(negative))
plt.imshow(wc)
plt.axis("off")  # pixel-coordinate ticks carry no meaning on a word cloud
plt.show()  # render the figure instead of echoing the AxesImage repr

In [None]:
# Word cloud built from every review labelled "Neutral".
# A boolean mask replaces the original per-row X[i] / Y[i] lookups: it selects the
# same reviews in the same order, avoids a slow Python-level loop over the Series,
# and does not depend on the index being exactly 0..n-1.
neutral = X[Y == "Neutral"].tolist()
plt.figure(figsize = (10,10))
wc = WordCloud(height = 600, width = 1000, background_color = "white", max_words = 2500, min_font_size = 12).generate(" ".join(neutral))
plt.imshow(wc)
plt.axis("off")  # pixel-coordinate ticks carry no meaning on a word cloud
plt.show()  # render the figure instead of echoing the AxesImage repr

In [None]:
# Turn the review texts into fixed-length integer sequences.
VOCAB_SIZE = 50000  # value handed to Tokenizer(num_words=...)
MAX_SEQUENCE_LENGTH = 128  # every review is padded / truncated to this many tokens

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = VOCAB_SIZE)
# The vocabulary is fitted on the training split only; the test split is merely encoded with it.
tokenizer.fit_on_texts(X_train)


def encode_reviews(texts):
    """Encode texts with the fitted tokenizer and pad them to a fixed length.

    Both splits go through this one helper so they are guaranteed to share the
    same maxlen / truncating / padding settings.

    Parameters
    ----------
    texts : iterable of str
        Review texts to encode.

    Returns
    -------
    Result of ``pad_sequences`` with ``maxlen=MAX_SEQUENCE_LENGTH``; sequences are
    truncated and padded at the end ("post").
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen = MAX_SEQUENCE_LENGTH, truncating = "post", padding = "post"
    )


# NOTE: X_train / X_test are overwritten with their encoded form, so this cell is not
# re-runnable on its own - re-run the train/test split cell first to restore the texts.
X_train = encode_reviews(X_train)
X_test = encode_reviews(X_test)

In [None]:
# Inspect the first encoded and padded training review.
X_train[0]

In [None]:
# Inspect the first encoded and padded test review.
X_test[0]

In [None]:
# Check the dimensions of the padded training data (padding used maxlen = 128).
X_train.shape