### Setting Up the Work Environment

In [7]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
import nltk

### Load and preprocess the dataset


In [5]:
# Load the cleaned reviews dataset.
# Forward slashes keep the relative path portable across operating systems and
# avoid the invalid "\D" / "\R" escape sequences of the backslash form.
df = pd.read_csv('../Dataset/Reviews_clean.csv')

In [8]:
# Preprocess the text data
# Fetch the NLTK resources used for tokenising and stop-word filtering.
# 'punkt_tab' is the tokenizer data that newer NLTK releases look up in
# word_tokenize; 'punkt' is still requested for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# A set makes the per-token "is this a stop word?" check cheap.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Firas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Firas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [9]:
def preprocess_text(text):
    """Lower-case, tokenise, filter and stem a review into one string.

    Parameters
    ----------
    text : str
        Raw review text. Non-string values (for example NaN from an empty
        CSV cell, or None) are treated as empty text instead of raising.

    Returns
    -------
    str
        Porter stems of the purely alphabetic, non-stop-word tokens joined by
        single spaces; '' when nothing is left to keep.
    """
    # Missing cells arrive as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return ''
    tokens = nltk.word_tokenize(text.lower())
    # isalpha() drops punctuation and numeric tokens before stemming.
    stems = [ps.stem(tok) for tok in tokens if tok.isalpha() and tok not in stop_words]
    return ' '.join(stems)
# Build the cleaned-text column by running every review through preprocess_text.
df['ProcessedText'] = df['Text'].map(preprocess_text)

In [10]:
# Display the frame to inspect the new ProcessedText column next to the raw Text.
df

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,TextLength,HelpfulnessRange,ProcessedText
0,150523,150524,0006641040,ACITT7DI6IDDL,shari zychinski,0,0,5,1999-10-08,EVERY book is educational,this witty little book makes my son laugh at l...,375,,witti littl book make son laugh loud recit car...
1,150500,150501,0006641040,AJ46FKXOVC7NR,Nicholas A Mesiano,2,2,5,1999-10-25,This whole series is great way to spend time w...,I can remember seeing the show when it aired o...,407,0-10,rememb see show air televis year ago child sis...
2,451855,451856,B00004CXX9,AIUWLEQ1ADEG5,Elizabeth Medina,0,0,5,1999-12-02,Entertainingl Funny!,Beetlejuice is a well written movie ..... ever...,166,,beetlejuic well written movi everyth excel act...
3,230284,230285,B00004RYGX,A344SMIA5JECGM,Vincent P. Ross,1,2,5,1999-12-06,A modern day fairy tale,"A twist of rumplestiskin captured on film, sta...",222,0-10,twist rumplestiskin captur film star michael k...
4,451877,451878,B00004CXX9,A344SMIA5JECGM,Vincent P. Ross,1,2,5,1999-12-06,A modern day fairy tale,"A twist of rumplestiskin captured on film, sta...",222,0-10,twist rumplestiskin captur film star michael k...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568406,504113,504114,B001M0AKE8,A07112861KSNE1D0ZA1NO,Canon Fan,0,0,4,2012-10-26,Quaker Instant Oatmeal Dinosaur Eggs,This has been a firm favorite at breakfast tim...,163,,firm favorit breakfast time hous mani year pic...
568407,506493,506494,B0006UFY46,AEEVDQNVIH4SJ,Eugene M. Watson,0,0,5,2012-10-26,Great Salmon,This Salmon Is The Best! I Try To Make A Smal...,150,,salmon best tri make small varieti salmon dish...
568408,401353,401354,B000CQG8B2,A26DOVGY14V7NX,Richard Rollins,0,0,5,2012-10-26,breakfast tea,We switch to this decaf tea at night for a gre...,112,,switch decaf tea night great cup tea sleep pro...
568409,83328,83329,B005ZBZLT4,A308RR8J9NJOOZ,Josh,0,0,5,2012-10-26,One of the best!,I am recently new to the Keurig world. I've t...,279,,recent new keurig world tri hand flavor sinc g...


In [14]:
# Download the 'vader_lexicon' resource before the SentimentIntensityAnalyzer
# is instantiated in the next cell.
import nltk  # nltk is also imported in the first cell
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Firas\AppData\Roaming\nltk_data...


True

In [15]:
# VADER sentiment scoring
analyzer = SentimentIntensityAnalyzer()

def vader_sentiment_score(text):
    """Return VADER's compound polarity score for ``text``.

    Parameters
    ----------
    text : str
        Review text to score. Non-string values (e.g. NaN / None from a
        missing cell) are scored as 0.0 instead of raising.

    Returns
    -------
    float
        The 'compound' entry of ``analyzer.polarity_scores(text)``.
    """
    if not isinstance(text, str):
        return 0.0
    return analyzer.polarity_scores(text)['compound']

# Score the raw review text rather than ProcessedText: VADER is a lexicon- and
# rule-based scorer, so stemmed tokens (e.g. "witti") no longer match lexicon
# entries, and the stop-word / isalpha filtering removes negations such as
# "not" and the punctuation its rules take into account.
df['SentimentScore'] = df['Text'].apply(vader_sentiment_score)


In [16]:
# Turn a VADER compound score into a three-way sentiment label
def get_sentiment_label(score):
    """Classify a compound score as 'positive', 'negative' or 'neutral'.

    Scores at or above 0.05 are 'positive', scores at or below -0.05 are
    'negative', and everything in between is 'neutral'.
    """
    threshold = 0.05
    if score >= threshold:
        return 'positive'
    return 'negative' if score <= -threshold else 'neutral'

# Derive the categorical label column from the numeric sentiment scores.
df['Sentiment'] = df['SentimentScore'].map(get_sentiment_label)

In [21]:
# Keep only the first 100,000 rows for the modelling cells that follow.
df = df.iloc[:100000]

In [22]:
# Split the data into train and test sets:
# features are the preprocessed text, targets are the VADER-derived labels.
X, y = df['ProcessedText'], df['Sentiment']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Feature extraction - TF-IDF vectorisation
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Keep the TF-IDF matrices sparse. Densifying with .toarray() allocates
# n_rows x 5000 floats (about 3.2 GB for an 80,000-row training split), while
# scikit-learn estimators such as SVC and LogisticRegression accept sparse input.
# The vocabulary is fitted on the training split only and reused for the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Model selection - SVM and Logistic Regression
# SVM: train on the TF-IDF training features, then evaluate on the held-out split.
# NOTE(review): scikit-learn documents SVC fit time as growing at least
# quadratically with the number of samples; with an 80,000-row training split
# this cell may take very long -- consider LinearSVC if that is a problem.
svm_classifier = SVC().fit(X_train_tfidf, y_train)  # fit() returns the estimator itself
svm_pred = svm_classifier.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_pred)
