In [18]:
#importing libraries
#download nltk packages
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import joblib

# Display settings: never truncate cell text or hide columns when a frame is printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Load the labelled reviews and encode the sentiment label as an integer
# (Positive -> 1, Negative -> 0). Any other label value becomes NaN under .map().
label_codes = {'Positive': 1, 'Negative': 0}
df = pd.read_csv("/content/Labelled_Dataset.csv")
df['Label'] = df['Label'].map(label_codes)

# Lazily-built resources shared by every call to cleaner(); filled on first use
# so the stop-word corpus is read once instead of once per row.
_CLEANER_RESOURCES = {}


def _get_cleaner_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first call."""
    if not _CLEANER_RESOURCES:
        _CLEANER_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANER_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEANER_RESOURCES['stop_words'], _CLEANER_RESOURCES['lemmatizer']


def cleaner(TEXT):
    """Turn one raw review into a list of lower-cased, lemmatized tokens.

    Steps: strip markup with BeautifulSoup, blank out any run that starts with
    ``@``, ``http://``, ``https://``, ``www`` or a literal ``\\x`` (up to the
    next whitespace), replace every non-letter run with a space, tokenize,
    lower-case, drop English stop words and lemmatize.

    Parameters
    ----------
    TEXT : str
        Raw review text. Non-string values (e.g. the NaN pandas uses for a
        missing cell) are treated as an empty review.

    Returns
    -------
    list of str
        The cleaned tokens; an empty list when nothing usable remains.
    """
    # Missing cells arrive as float NaN; BeautifulSoup cannot parse them.
    # Returning [] lets the caller's "drop rows with no tokens" filter remove them.
    if not isinstance(TEXT, str):
        return []

    souped = BeautifulSoup(TEXT, 'lxml').get_text()
    re1 = re.sub(r"(@|http://|https://|www|\\x)\S*", " ", souped)
    re2 = re.sub("[^A-Za-z]+", " ", re1)

    stop_words, wordnet_lemmatizer = _get_cleaner_resources()

    lower_case = (t.lower() for t in nltk.word_tokenize(re2))
    return [wordnet_lemmatizer.lemmatize(t) for t in lower_case if t not in stop_words]

# Tokenise every review, then keep only rows that still have at least one token.
df['Cleaned_TEXT'] = df.Review.apply(cleaner)
# reset_index gives the surviving rows a gap-free 0..n-1 index, so the positional
# fold indices produced later by StratifiedKFold line up with the labels; it also
# returns a new frame, so later column assignments do not hit a filtered slice.
df = df[df['Cleaned_TEXT'].map(len) > 0].reset_index(drop=True)
print("Printing top 5 rows of dataframe showing original and cleaned tweets....")
print(df[['Review','Cleaned_TEXT']].head())
# The raw text is no longer needed once the tokens exist; drop it without
# mutating in place.
df = df.drop(columns=['Review'])

df.to_csv('CA_2.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Printing top 5 rows of dataframe showing original and cleaned tweets....
                                                                                                                                       Review  \
0  This deodorant is a game-changer! The refreshing scent lasts all day, keeping me confident. It's my go-to for staying fresh and odor-free.   
1                 Absolute freshness in a bottle! This deodorant's long-lasting protection and pleasant fragrance make it my daily essential.   
2                  Say goodbye to sweat worries! This deodorant keeps me dry and smelling fantastic. A reliable choice for all-day freshness.   
3            Embrace confidence with this deodorant! The invigorating scent and effective protection make it a must-have in my daily routine.   
4                              Unbeatable freshness! This deodorant's crisp scent and 24-hour protection keep me feeling clean and confident.   

                                                        

In [19]:
# Write the current cleaned frame to disk.
df.to_csv('CA_2.csv', index=False)
# Join each token list into one space-separated string for the vectorizer.
# Rows that are already strings are left alone so re-running this cell does not
# split them into single characters ("good" -> "g o o d").
df['Cleaned_TEXT'] = [
    row if isinstance(row, str) else " ".join(row)
    for row in df['Cleaned_TEXT'].values
]
data = df['Cleaned_TEXT']
Y = df['Label']
# Unigrams to trigrams; min_df=.015 drops terms found in fewer than 1.5% of documents.
tfidf = TfidfVectorizer(min_df=.015, ngram_range=(1,3))
# Learn vocabulary + IDF and build the document-term matrix in one pass.
data_tfidf = tfidf.fit_transform(data)
# Persist the learned vocabulary (one term per line, in feature-index order).
pd.DataFrame(pd.Series(tfidf.get_feature_names_out())).to_csv('Vocab.csv', header=False, index=False)
print("Shape of tfidf matrix: ", data_tfidf.shape)


Shape of tfidf matrix:  (1000, 465)


In [20]:
def _cv_accuracy(clf, X, y, splitter):
    """Fit ``clf`` on each training fold and return the per-fold test accuracies.

    Parameters
    ----------
    clf : estimator with ``fit`` / ``predict``; it is refit in place on every fold.
    X : row-indexable feature matrix (here the TF-IDF sparse matrix).
    y : pandas Series of labels, in the same row order as ``X``.
    splitter : object whose ``split(X, y)`` yields (train_index, test_index) pairs.

    Returns
    -------
    list of float
        One accuracy score per fold, in fold order.
    """
    fold_scores = []
    for iteration, (train_index, test_index) in enumerate(splitter.split(X, y), start=1):
        print("Iteration ", iteration)
        # The splitter yields *positions*, so the labels must be taken with .iloc;
        # plain y[train_index] is a label lookup and breaks once the Series index
        # has gaps (rows were filtered out before this point).
        clf.fit(X[train_index], y.iloc[train_index])
        Y_pred = clf.predict(X[test_index])
        score = metrics.accuracy_score(y.iloc[test_index], Y_pred)
        print("Cross-validation accuracy: ", score)
        fold_scores.append(score)
    return fold_scores


# Same stratified 5-fold split for both models; the fixed integer random_state
# makes every call to kf.split() produce identical folds.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

#using linear SVC
svc_clf = LinearSVC()
scores = _cv_accuracy(svc_clf, data_tfidf, Y, kf)
svc_mean_accuracy = np.mean(scores)
print("SVC Mean cross-validation accuracy: ", svc_mean_accuracy)

#using multinomial naive Bayes
nbc_clf = MultinomialNB()
scores = _cv_accuracy(nbc_clf, data_tfidf, Y, kf)
nbc_mean_accuracy = np.mean(scores)
print("NBC Mean cross-validation accuracy: ", nbc_mean_accuracy)


Iteration  1
Cross-validation accuracy:  1.0
Iteration  2
Cross-validation accuracy:  1.0
Iteration  3
Cross-validation accuracy:  0.995
Iteration  4
Cross-validation accuracy:  0.99
Iteration  5
Cross-validation accuracy:  1.0
SVC Mean cross-validation accuracy:  0.9970000000000001
Iteration  1
Cross-validation accuracy:  1.0
Iteration  2
Cross-validation accuracy:  0.995
Iteration  3
Cross-validation accuracy:  1.0
Iteration  4
Cross-validation accuracy:  1.0
Iteration  5
Cross-validation accuracy:  0.995
NBC Mean cross-validation accuracy:  0.998


In [21]:
# Train the final linear SVC on every labelled row and persist it with joblib.
clf = LinearSVC()
clf.fit(data_tfidf, Y)
joblib.dump(clf, 'svc.sav')

['svc.sav']

In [22]:
# Load the classifier saved by the training step.
model = joblib.load('svc.sav')
# keep_default_na=False stops pandas from turning vocabulary words such as
# "null" or "nan" into missing values.
vocabulary = pd.read_csv('Vocab.csv', header=None, keep_default_na=False)
# Map each term to its row position, i.e. the feature index used in training.
vocabulary_dict = {word: i for i, word in enumerate(vocabulary[0])}
print(vocabulary_dict)
# ngram_range must match the training vectorizer (1-3 grams): with the default
# (1, 1) the bigram/trigram entries of the vocabulary would never be generated
# and their feature columns would always be zero at prediction time.
tfidf = TfidfVectorizer(vocabulary=vocabulary_dict, lowercase=False, ngram_range=(1, 3))

# Reading new data as dataframe
df = pd.read_csv("/content/Unlabelled_dataset.csv")
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Cleaning reviews
# Lazily-built resources shared by every call to cleaner(); filled on first use
# so the stop-word corpus is read once instead of once per row.
_CLEANER_RESOURCES = {}


def _get_cleaner_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first call."""
    if not _CLEANER_RESOURCES:
        _CLEANER_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANER_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEANER_RESOURCES['stop_words'], _CLEANER_RESOURCES['lemmatizer']


def cleaner(Review):
    """Turn one raw review into a list of lower-cased, lemmatized tokens.

    Steps: strip markup with BeautifulSoup, blank out any run that starts with
    ``@``, ``http://``, ``https://``, ``www`` or a literal ``\\x`` (up to the
    next whitespace), replace every non-letter run with a space, tokenize,
    lower-case, drop English stop words and lemmatize.

    Parameters
    ----------
    Review : str
        Raw review text. Non-string values (e.g. the NaN pandas uses for a
        missing cell) are treated as an empty review.

    Returns
    -------
    list of str
        The cleaned tokens; an empty list when nothing usable remains.
    """
    # Missing cells arrive as float NaN; BeautifulSoup cannot parse them.
    # Returning [] lets the caller's "drop rows with no tokens" filter remove them.
    if not isinstance(Review, str):
        return []

    souped = BeautifulSoup(Review, 'lxml').get_text()
    re1 = re.sub(r"(@|http://|https://|www|\\x)\S*", " ", souped)
    re2 = re.sub("[^A-Za-z]+", " ", re1)

    stop_words, wordnet_lemmatizer = _get_cleaner_resources()

    lower_case = (t.lower() for t in nltk.word_tokenize(re2))
    return [wordnet_lemmatizer.lemmatize(t) for t in lower_case if t not in stop_words]

# Tokenise every review and keep only rows that still have at least one token.
df['cleaned_TEXT'] = df.Review.apply(cleaner)
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the original (SettingWithCopyWarning).
df = df[df['cleaned_TEXT'].map(len) > 0].copy()
print("Original and cleaned reviews")
print(df[['Review','cleaned_TEXT']].head())
# Join each token list into one space-separated string for the vectorizer.
df['cleaned_TEXT'] = df['cleaned_TEXT'].map(" ".join)
data = df['cleaned_TEXT']
# NOTE(review): the vectorizer here only carries a fixed vocabulary, so fitting it
# on the unlabelled reviews re-estimates the IDF weights from this data rather than
# reusing the weights the classifier was trained with. Persisting the fitted
# training vectorizer and calling only transform() here would remove that mismatch.
data_tfidf = tfidf.fit_transform(data)
y_pred = model.predict(data_tfidf)

#Saving predicted ratings to csv
# predict() already returns one value per row; no reshape is needed for a column.
df['predicted_Label'] = y_pred
df.to_csv('predicted_Label.csv', index=False)

{'absolute': 0, 'absolute essential': 1, 'absolute essential staying': 2, 'active': 3, 'active lifestyle': 4, 'adventure': 5, 'advertised': 6, 'ally': 7, 'ally confidently': 8, 'ally confidently conquering': 9, 'application': 10, 'application deodorant': 11, 'application deodorant clean': 12, 'armpit': 13, 'aroma': 14, 'aroma effective': 15, 'aroma effective hour': 16, 'aroma hour': 17, 'aroma hour protection': 18, 'around': 19, 'artificial': 20, 'awful': 21, 'beauty': 22, 'beauty ritual': 23, 'booster': 24, 'booster bottle': 25, 'booster bottle deodorant': 26, 'bottle': 27, 'bottle deodorant': 28, 'bottle deodorant delightful': 29, 'breeze': 30, 'breeze every': 31, 'breeze every day': 32, 'bursting': 33, 'bursting confidence': 34, 'bursting confidence deodorant': 35, 'caused': 36, 'caused skin': 37, 'causing': 38, 'cheap': 39, 'cheap deodorant': 40, 'chemical': 41, 'choice': 42, 'choice feeling': 43, 'choice feeling confidently': 44, 'clean': 45, 'clean aroma': 46, 'clean aroma effect