In [None]:
import pandas as pd
import re
from textblob import TextBlob
#import emoji
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Read only the `Text` and `Score` columns of Reviews.csv; the other columns are not loaded.
# NOTE(review): the absolute path looks like a Colab Google Drive mount — confirm it
# exists in the environment where this notebook is re-run.
data = pd.read_csv(r"/content/drive/MyDrive/archive (2)/Reviews.csv", usecols=["Text","Score"])
# Last expression of the cell: displays the loaded frame.
data

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...
...,...,...
568449,5,Great for sesame chicken..this is a good if no...
568450,2,I'm disappointed with the flavor. The chocolat...
568451,5,"These stars are small, so you can give 10-15 o..."
568452,5,These are the BEST treats for training and rew...


In [None]:
# Work on an independent copy so the frame loaded into `data` is left as read.
datac = data.copy()
# Last expression of the cell: displays the working copy.
datac

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...
...,...,...
568449,5,Great for sesame chicken..this is a good if no...
568450,2,I'm disappointed with the flavor. The chocolat...
568451,5,"These stars are small, so you can give 10-15 o..."
568452,5,These are the BEST treats for training and rew...


In [None]:
# Print the index range, column dtypes and non-null counts of the working copy.
datac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Score   568454 non-null  int64 
 1   Text    568454 non-null  object
dtypes: int64(1), object(1)
memory usage: 8.7+ MB


### Data Cleaning

In [None]:
# Per-column count of missing values (isna is the canonical spelling of isnull).
datac.isna().sum()

Score    0
Text     0
dtype: int64

In [None]:
# Remove rows that are exact duplicates on every column.
# drop_duplicates() returns a row-filtered frame that pandas may still track as derived
# from its parent; the explicit copy makes `datac` an independent frame so that the
# later `datac["Text"] = ...` assignments do not emit SettingWithCopyWarning.
datac = datac.drop_duplicates().copy()

In [None]:
# Re-inspect the frame after de-duplication to see how many rows remain.
datac.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 393675 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Score   393675 non-null  int64 
 1   Text    393675 non-null  object
dtypes: int64(1), object(1)
memory usage: 9.0+ MB


### EDA

In [None]:
def eda(data,name):
    """Print which kinds of text noise are present in a string column.

    Checks the column for upper-case characters, HTML-like tags, http(s) URLs,
    and the punctuation/digit characters ``( ) [ ] ! @ # $ % ^ & * + = - ? < >``
    and ``0-9``, printing one line for each kind that is found.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    name : str
        Label of the column to inspect; every value must be a ``str``.

    Returns
    -------
    None
        Findings are reported through ``print`` only.
    """
    texts = data[name]

    def count_matches(pattern):
        # Number of rows in which the pattern occurs at least once.
        compiled = re.compile(pattern)
        return texts.apply(lambda text: bool(compiled.search(text))).sum()

    # str.islower() on the joined corpus is False as soon as any cased character
    # is upper-case (and also when the corpus has no cased characters at all).
    case = " ".join(texts).islower()
    html_ = count_matches(r"<.+?>")
    # \S+ instead of ".+? +" so a URL at the very end of a text is detected too.
    url_ = count_matches(r"https?://\S+")
    # One explicit character class: brackets and the hyphen are escaped so they are
    # matched literally instead of being parsed as class delimiters / a range.
    punctuation_ = count_matches(r"[()\[\]!@#$%^&*+=\-?<>0-9]")

    if case == False:
        print("not in lower case")
    if html_ > 0:
        print("have html tags")
    if url_>0:
        print("have urls")
    if punctuation_>0:
        print("contains punctuation")

In [None]:
# Report which kinds of noise the review text contains before cleaning it.
eda(data=datac, name="Text")

not in lower case
have html tags
have urls
contains punctuation


### Text Preprocessing

In [None]:
def textpp(x,correct,emojii):
    """Normalise one text string.

    Steps, in order: lower-case; remove HTML-like tags; remove http(s) URLs
    together with any spaces directly after them; remove the characters
    ``( ) [ ] ! @ # $ % ^ & * + = - ? < >`` and the digits ``0-9``; optionally
    spell-correct; optionally replace emoji with their text aliases.

    Parameters
    ----------
    x : str
        Text to clean.
    correct : str
        Pass ``'t'`` to run ``TextBlob(x).correct()``; any other value skips it.
    emojii : bool
        Pass ``True`` to run ``emoji.demojize`` on the text.

    Returns
    -------
    str
        The cleaned text.
    """
    x = x.lower()
    x = re.sub(r"<.+?>", "", x)
    # \S+ (rather than ".+? +") also removes a URL that ends the text, and stops at
    # the first whitespace instead of swallowing text up to the next space.
    x = re.sub(r"https?://\S+ *", "", x)
    # Brackets and the hyphen are escaped so the whole list is a single character
    # class; unescaped, "(", ")", "[" and "-" were never removed.
    x = re.sub(r"[()\[\]!@#$%^&*+=\-?<>0-9]", "", x)
    if correct == 't':
        x = TextBlob(x).correct().string
    if emojii == True:
        # Imported here because the module-level `import emoji` is commented out;
        # without this the branch fails with NameError.
        import emoji
        x = emoji.demojize(x)
    return x

In [None]:
# Basic cleaning of every review; spell-correction and emoji conversion are both off
# ('s' is not the 't' flag that enables correction). Series.apply forwards the
# keyword arguments to textpp.
datac["Text"] = datac["Text"].apply(textpp, correct='s', emojii=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datac["Text"] = datac["Text"].apply(textpp,args=('s',False))


In [None]:
# Stop-word set and word normalisers are built on first use and then reused:
# advpp is applied once per row, so rebuilding them on every call is wasted work.
_ADVPP_CACHE = {}


def _advpp_stopwords():
    """Return the English stop words as a frozenset, loading them on first use."""
    if "stopwords" not in _ADVPP_CACHE:
        _ADVPP_CACHE["stopwords"] = frozenset(stopwords.words("english"))
    return _ADVPP_CACHE["stopwords"]


def _advpp_verb_lemmatizer():
    """Build a word -> lemma callable that lemmatizes with pos='v'."""
    lemmatize = WordNetLemmatizer().lemmatize
    return lambda word: lemmatize(word, pos='v')


def _advpp_normaliser(stemm):
    """Return the cached word -> word callable selected by ``stemm``, or None if unknown."""
    if stemm == 'p':
        key, build = "porter", lambda: PorterStemmer().stem
    elif stemm == 'l':
        key, build = "lancaster", lambda: LancasterStemmer().stem
    elif stemm == 's':
        key, build = "snowball", lambda: SnowballStemmer(language="english").stem
    elif stemm == 'lemma':
        key, build = "lemma", _advpp_verb_lemmatizer
    else:
        return None
    if key not in _ADVPP_CACHE:
        _ADVPP_CACHE[key] = build()
    return _ADVPP_CACHE[key]


def advpp(x,stemm):
    """Tokenize a text, drop English stop words, and stem or lemmatize the rest.

    Parameters
    ----------
    x : str
        Text to process.
    stemm : str
        ``'p'`` Porter stemmer, ``'l'`` Lancaster stemmer, ``'s'`` Snowball
        (English) stemmer, ``'lemma'`` WordNet lemmatizer with ``pos='v'``.
        Any other value keeps the surviving tokens unchanged.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    stop_words = _advpp_stopwords()
    normalise = _advpp_normaliser(stemm)
    kept = (word for word in word_tokenize(x) if word not in stop_words)
    if normalise is None:
        return " ".join(kept)
    return " ".join(normalise(word) for word in kept)

In [None]:
# Stop-word removal plus verb lemmatization of every review; Series.apply forwards
# the keyword argument to advpp.
datac["Text"] = datac["Text"].apply(advpp, stemm="lemma")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datac["Text"] = datac["Text"].apply(advpp, args=("lemma",))


In [None]:
# Display the processed text column to eyeball the result of the two cleaning passes.
datac["Text"]

0         buy several vitality can dog food products fin...
1         product arrive label jumbo salt peanuts ... pe...
2         confection around centuries . light , pillowy ...
3         look secret ingredient robitussin believe find...
4         great taffy great price . wide assortment yumm...
                                ...                        
568449    great sesame chicken .. good better resturants...
568450    'm disappoint flavor . chocolate note especial...
568451    star small , give - one train session . try tr...
568452    best treat train reward dog good groom . lower...
568453    satisfy , product advertise , use cereal , raw...
Name: Text, Length: 393675, dtype: object

###  Feature Extraction Techniques
TF-IDF:


In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer constructed with all default settings.
tf=TfidfVectorizer()

In [None]:
# Fit the vectorizer on the processed reviews and keep the resulting sparse
# document-term matrix; without a name the result of this expensive step is
# discarded as soon as it has been displayed.
tfidf_matrix = tf.fit_transform(datac["Text"])
tfidf_matrix

<393675x132704 sparse matrix of type '<class 'numpy.float64'>'
	with 12913381 stored elements in Compressed Sparse Row format>

In [None]:
# The learned term -> column-index mapping has one entry per feature (over 130k
# here), so report its size and show only the first few entries instead of
# dumping the whole dict into the notebook output.
print(f"vocabulary size: {len(tf.vocabulary_)}")
dict(list(tf.vocabulary_.items())[:20])

{'buy': 16492,
 'several': 103029,
 'vitality': 125917,
 'can': 17454,
 'dog': 34211,
 'food': 44528,
 'products': 91355,
 'find': 42896,
 'good': 49461,
 'quality': 93172,
 'product': 91220,
 'look': 67834,
 'like': 66547,
 'stew': 109878,
 'process': 91116,
 'meat': 71388,
 'smell': 105811,
 'better': 11401,
 'labrador': 64684,
 'finicky': 43015,
 'appreciate': 6037,
 'arrive': 6755,
 'label': 64628,
 'jumbo': 62347,
 'salt': 99894,
 'peanuts': 85445,
 'actually': 1123,
 'small': 105712,
 'size': 105057,
 'unsalted': 123283,
 'sure': 112491,
 'error': 38866,
 'vendor': 125028,
 'intend': 59448,
 'represent': 96684,
 'confection': 25156,
 'around': 6687,
 'centuries': 19514,
 'light': 66462,
 'pillowy': 87472,
 'citrus': 22471,
 'gelatin': 47685,
 'nut': 79417,
 'case': 18749,
 'filberts': 42752,
 'cut': 28924,
 'tiny': 118393,
 'square': 108768,
 'liberally': 66225,
 'coat': 23176,
 'powder': 89701,
 'sugar': 111580,
 'mouthful': 75500,
 'heaven': 53814,
 'chewy': 20822,
 'flavorful'