# Libs

In [1]:
%pip install TextBlob

Collecting TextBlob
  Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting nltk>=3.8
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting regex>=2021.8.3
  Downloading regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl (291 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.0/291.0 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm
  Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 

In [None]:
### comment

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
import pickle
from joblib import Parallel, delayed
import multiprocessing
from tqdm import tqdm
from textblob import TextBlob
from nltk.corpus import wordnet, stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from warnings import filterwarnings
import math
import hashlib
from databricks import feature_store
from databricks.feature_store import *
filterwarnings('ignore')

ModuleNotFoundError: No module named 'databricks'

# Download NLTK resources

In [None]:
# Fetch the NLTK data packages up front so the text helpers in this notebook
# can run on a fresh kernel.
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize relies on
nltk.download('stopwords')  # corpus read by stopwords.words('english')
nltk.download('wordnet')  # lexical database; presumably required by WordNetLemmatizer
nltk.download('words')  # NOTE(review): no use of this corpus is visible in this notebook - confirm it is needed
nltk.download('averaged_perceptron_tagger')  # tagger model; presumably required by nltk.pos_tag

# Load Customer Reviews Data

In [None]:
# Pull the raw customer reviews out of the Spark table into a pandas DataFrame
review_query = "select * from projects.customer_review_dataset"
data = spark.sql(review_query).toPandas()

# Short-word -> proper-word mapping that was generated during EDA.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open a replace_dict.pkl that comes from a trusted source.
with open('replace_dict.pkl', 'rb') as dict_file:
  replacement_dict = pickle.load(dict_file)

# Functions

In [None]:
def replace_shortwords(replacement_dict,text):
  """Expand short/slang tokens into their proper words, e.g. "awsm" -> "awesome".

  The text is tokenized with word_tokenize and every token that is a key of
  replacement_dict is swapped for the mapped value. Each token is looked up
  exactly once (one dictionary lookup per token instead of one pass over all
  tokens per dictionary entry). Replacements are not chained: a word produced
  by a replacement is not looked up again.

  replacement_dict : dict : short word -> proper word mapping created in the EDA process
  text : string : text which needs to be treated
  returns : string : the (possibly replaced) tokens joined with single spaces
  """
  tokens = word_tokenize(text)
  # dict.get falls back to the token itself when there is no replacement for it
  return " ".join(replacement_dict.get(token, token) for token in tokens)

def generate_hash_key(row):
  """Build the hash key for one dataframe row.

  The key is the MD5 hex digest of the row's Rate, Sentiment,
  combined_cleaned and combined_cleaned_lemmatized values plus the row's
  index label.

  row : pd.Series : row in pandas dataframe (e.g. as passed by DataFrame.apply(axis=1))
  returns : string : 32-character hexadecimal digest
  """
  # row.name is this row's index label. row.index would be the Index of column
  # labels, which is identical for every row and so adds nothing to the key.
  columns = [row.Rate, row.Sentiment, row.combined_cleaned, row.combined_cleaned_lemmatized, row.name]
  # Join with a unit-separator character so field boundaries are part of the
  # hashed text: without it ("ab", "c") and ("a", "bc") would hash identically.
  concatenated_data = '\x1f'.join(str(column) for column in columns)
  hash_key = hashlib.md5(concatenated_data.encode()).hexdigest()
  return hash_key

stopwords_list = stopwords.words('english')
def ReviewProcessing(df):
  """Clean the 'combined_cleaned' column and strip English stopwords from it.

  Steps: remove every character that is not a letter, digit or space;
  lowercase; split on single spaces; drop tokens found in stopwords_list.
  Afterwards each cell of 'combined_cleaned' holds a list of tokens.

  df : pd.DataFrame : Dataframe which contains 'combined_cleaned' column
  returns : pd.DataFrame : the same dataframe object; its 'combined_cleaned' column is overwritten
  """
  # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
  # string by default, in which case no character would ever be removed.
  cleaned = df['combined_cleaned'].str.replace('[^a-zA-Z0-9 ]', '', regex=True)
  # lowercase, then split into a list of tokens
  token_lists = cleaned.str.lower().str.split(' ')
  # a set gives constant-time membership tests instead of scanning the list per token
  stopword_set = set(stopwords_list)
  df['combined_cleaned'] = token_lists.apply(
      lambda tokens: [token for token in tokens if token not in stopword_set])
  return df

def get_wordnet_pos(word):
  """Return the WordNet part-of-speech constant for a single word.

  The word is tagged on its own with nltk.pos_tag; the first letter of the
  resulting tag selects adjective, noun, verb or adverb. Any other tag falls
  back to noun.

  word : string : word in a paragraph
  """
  _token, treebank_tag = nltk.pos_tag([word])[0]
  initial = treebank_tag[0].upper()
  pos_by_initial = dict(J=wordnet.ADJ, N=wordnet.NOUN, V=wordnet.VERB, R=wordnet.ADV)
  if initial in pos_by_initial:
    return pos_by_initial[initial]
  return wordnet.NOUN

lemmatizer = nltk.stem.WordNetLemmatizer()

def get_lemmatize(sent):
  """Lemmatize every token of a sentence and return the result as one string.

  Each token from nltk.word_tokenize is lemmatized with the part of speech
  reported by get_wordnet_pos; the lemmas are joined with single spaces.

  sent : string : sentence
  """
  lemmas = []
  for token in nltk.word_tokenize(sent):
    lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
  return " ".join(lemmas)

def polarity(text):
  """Return the TextBlob sentiment polarity score of a piece of text.

  text : string : paragraph to score
  """
  blob = TextBlob(text)
  return blob.sentiment.polarity

# Transformations

## Filtering and Cleaning

In [None]:
# Values found in the Rate column that are product names rather than ratings
# (identified during EDA); rows carrying them are discarded.
INVALID_RATE_VALUES = [
    'Pigeon Favourite Electric Kettle??????(1.5 L, Silver, Black)',
    'Bajaj DX 2 L/W Dry Iron',
    'Nova Plus Amaze NI 10 1100 W Dry Iron?Ã\x83Â¿?Ã\x83Â¿(Grey & Turquoise)',
]

# Removing Null Records from Summary
data = data.dropna(subset=['Summary'])

# Removing discrepancies in Rate column. No try/except here: the only way this
# filter can fail is a missing 'Rate' column, which must not be hidden.
# .copy() makes the filtered frame independent so the column assignments below
# write to it directly instead of to a slice of the previous frame.
data = data[~data['Rate'].isin(INVALID_RATE_VALUES)].copy()

# Convert Rate to numeric
data['Rate'] = pd.to_numeric(data['Rate'])

# Since we are doing sentiment classification, we don't need the product price
# or name, so drop them (non-inplace keeps the cell free of hidden mutation)
data = data.drop(columns=['product_price', 'product_name'])

# Replace null Reviews with blank so the concatenation below never yields NaN,
# then build the combined text used by the rest of the notebook
data['Review'] = data['Review'].fillna('')
data['combined'] = data['Review'] + ' ' + data['Summary']

## Replacing short words to proper words

In [None]:
# Expand short words in every combined review, spreading the work over all CPU cores
data = data.reset_index(drop=True)  # fresh 0..n-1 index so the results line up with the rows
worker_count = multiprocessing.cpu_count()
expand_jobs = (
    delayed(replace_shortwords)(replacement_dict=replacement_dict, text=review_text)
    for review_text in tqdm(data['combined'])
)
result = Parallel(verbose=0, n_jobs=worker_count)(expand_jobs)
data['combined_cleaned'] = pd.Series(result)

## Polarity and filtering

In [None]:
# Score the combined text with TextBlob polarity, then drop the records whose
# Sentiment label contradicts either that score or the star rating
# (e.g. labelled negative but polarity above 0.3). Such rows only add noise.
data['polarity_score_combined'] = data['combined_cleaned'].apply(polarity)

sentiment = data['Sentiment']
score = data['polarity_score_combined']
rate = data['Rate']

# Label disagrees with the text polarity
polarity_mismatch = (
    ((sentiment == 'positive') & (score < 0))
    | ((sentiment == 'negative') & (score > 0.3))
    | ((sentiment == 'neutral') & ((score > 0.7) | (score < -0.5)))
)

# Label disagrees with the rating
rate_mismatch = (
    ((sentiment == 'positive') & (rate < 3))
    | ((sentiment == 'negative') & (rate > 3))
    | ((sentiment == 'neutral') & ((rate > 4) | (rate < 2)))
)

# A row is removed when any single rule flags it, which keeps exactly the rows
# that survive applying the rules one after another.
data = data[~(polarity_mismatch | rate_mismatch)]

# Drop columns which are not required (returns a new frame rather than
# mutating the filtered slice in place)
data = data.drop(columns=['Review', 'Summary', 'combined', 'polarity_score_combined'])

## Lemmatization

In [None]:
# Clean the text and remove stopwords; each cell then holds a list of tokens
clean_data = ReviewProcessing(data)
# Collapse the token lists back into space-separated strings
clean_data['combined_cleaned'] = clean_data['combined_cleaned'].apply(lambda tokens: ' '.join(tokens))
# Add the lemmatized version of the cleaned text as its own column
clean_data['combined_cleaned_lemmatized'] = clean_data['combined_cleaned'].apply(get_lemmatize)
# Drop rows that are exact duplicates
clean_data = clean_data.drop_duplicates()

## Create unique 'id' - hashkey for feature table

In [None]:
# One hash key per row; used below as the feature table's primary key
row_hashes = clean_data.apply(generate_hash_key, axis=1)
clean_data['id'] = row_hashes

# Feature Table Creation

In [None]:
# Create Feature Table
fs = FeatureStoreClient()
spark.sql("Create database if not exists feature_store")
feature_table_name="feature_store.customer_sentiment_analysis_01"
spark_df=spark.createDataFrame(clean_data)

# The feature table is rebuilt from scratch on every run, so remove any
# previous version first.
try:
    fs.drop_table(feature_table_name)
except Exception as drop_error:
    # Best effort: the drop is expected to fail when the table does not exist
    # yet. Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
    # still propagate, and report the reason instead of hiding it.
    print(f"Could not drop {feature_table_name} (continuing): {drop_error}")

fs.create_table(
    name=feature_table_name,
    primary_keys=['id'],
    df=spark_df,
    description="Customer Sentiment Analysis Transformed Dataset"
)