# Loading the Dataset

In [None]:
import pandas as pd

# Load the dataset
data = pd.read_csv('Reviews.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
data.info()

# Display the first few rows of the "Text" column to understand what kind of data we're dealing with
print(data['Text'].head())


# Preprocessing the Dataset

Removing HTML tags, converting to lowercase, tokenizing the text, and removing special characters.

In [None]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Download the necessary NLTK data.
# 'punkt' is the tokenizer resource used by older NLTK releases; newer releases
# (3.8.2 and later) look up 'punkt_tab' instead, so fetch both to keep
# word_tokenize working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to clean and tokenize text, then convert back to a cleaned string
def clean_and_tokenize_to_string(text):
    """Normalise one raw review into a space-separated string of alphabetic tokens.

    Steps: strip HTML tags, lower-case, tokenize with NLTK's ``word_tokenize``,
    drop every non-letter character from each token, discard tokens that end
    up empty, and join the survivors with single spaces.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            NaN/None that pandas uses for a missing cell) is treated as an
            empty review.

    Returns:
        The cleaned text as a single string; ``''`` for missing input.
    """
    # Missing values are not strings; re.sub() would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Remove HTML tags. Replace each tag with a space rather than nothing so
    # that words separated only by a tag (e.g. "good.<br />I") are not glued
    # together into one token.
    text = re.sub(r'<.*?>', ' ', text)
    # Convert to lower case
    text = text.lower().strip()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove special characters and digits from each token
    tokens = [re.sub(r'[^a-zA-Z\s]', '', token) for token in tokens]
    # Remove empty tokens
    tokens = [token for token in tokens if token]
    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    return cleaned_text


# Derive the cleaned-text column by running every raw review through the cleaner
data['clean_text'] = data['Text'].apply(func=clean_and_tokenize_to_string)

# Peek at the first five cleaned reviews
print(data['clean_text'].head(n=5))


Removing Stop words 

In [None]:
import nltk
from nltk.corpus import stopwords

# Download necessary NLTK resources: the 'stopwords' corpus is what
# stopwords.words('english') reads from in remove_stopwords below.
nltk.download('stopwords')

# English stop-word set, loaded lazily on the first call and reused afterwards.
_ENGLISH_STOP_WORDS = None


# Function to remove stopwords
def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    Args:
        text: A whitespace-separated string (already cleaned/lower-cased).
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        The remaining words joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            # Building this set inside every call repeats the same work once
            # per review; build it once and reuse it for the whole column.
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return ' '.join(word for word in text.split() if word not in stop_words)

# Requires the 'clean_text' column produced by the cleaning step
data['no_stopwords'] = data['clean_text'].apply(func=remove_stopwords)

# Peek at the first five reviews after stop-word removal
print(data['no_stopwords'].head(n=5))


Stemming

In [None]:
from nltk.stem import PorterStemmer
import nltk

# Single Porter stemmer instance shared by every stem_text call
stemmer = PorterStemmer()


def stem_text(text):
    """Stem each whitespace-separated word of ``text`` and re-join with spaces."""
    return ' '.join(stemmer.stem(token) for token in text.split())

# Requires the 'no_stopwords' column produced by the stop-word step
data['stemmed_text'] = data['no_stopwords'].apply(func=stem_text)

# Peek at the first five stemmed reviews
print(data['stemmed_text'].head(n=5))


# Displaying first 5 Rows after preprocessing

In [None]:
# First five rows, now including the clean_text / no_stopwords / stemmed_text columns
data.head(n=5)

# Splitting the Dataset and Applying Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Binary target: a score above 3 counts as positive (1), everything else as negative (0)
data['Sentiment'] = data['Score'].apply(lambda score: int(score > 3))

# 80/20 train/test split of the stemmed text, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    data['stemmed_text'],
    data['Sentiment'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, capped at 5000 features.
# The vocabulary is learned from the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



# Training the SVM Model

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import SGDClassifier

# Linear SVM trained with stochastic gradient descent (hinge loss).
# with_mean=False scales the features without centering them.
# random_state is fixed (same seed as the train/test split) because SGD shuffles
# the training data; without it every run yields different weights and metrics.
svm_model = make_pipeline(
    StandardScaler(with_mean=False),
    SGDClassifier(loss='hinge', random_state=42),
)

svm_model.fit(X_train_tfidf, y_train)

# Hard class predictions on the held-out split
y_pred = svm_model.predict(X_test_tfidf)

accuracy_svm = accuracy_score(y_test, y_pred)

print(f'Accuracy of the SVM model: {accuracy_svm}')


# Classification Report

In [None]:
# Per-class precision / recall / F1 for the current test-set predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# Confusion Matrix

In [None]:
# Confusion matrix for the current test-set predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Classification Report as a DataFrame

In [None]:
from sklearn.metrics import classification_report

# Same report as a dict, turned into a table with one row per class/average
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
df_report = pd.DataFrame(report).T
print(df_report)

# 5-Fold Cross-Validation

In [None]:
from sklearn.model_selection import cross_validate

# Metrics collected on each of the 5 folds of the training split
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_results = cross_validate(
    estimator=svm_model,
    X=X_train_tfidf,
    y=y_train,
    scoring=scoring,
    cv=5,
)
# One row per fold, one column per timing/metric
df_cv_results = pd.DataFrame(cv_results)
print(df_cv_results)

# Predicting one review

In [None]:
def predict_sentiment_svm(review):
    """Classify a single raw review as 'Positive' or 'Negative'.

    The review goes through the same preprocessing chain that produced the
    training column ('stemmed_text'): cleaning/tokenizing, stop-word removal,
    then stemming. Skipping the last two steps would feed the vectorizer
    words in a different form from the one its vocabulary was built on.

    Args:
        review: Raw review text.

    Returns:
        'Positive' if the model predicts class 1, otherwise 'Negative'.
    """
    review_cleaned = clean_and_tokenize_to_string(review)
    review_stemmed = stem_text(remove_stopwords(review_cleaned))
    # transform() expects an iterable of documents, hence the one-element list
    review_vectorized = vectorizer.transform([review_stemmed])
    prediction = svm_model.predict(review_vectorized)
    return 'Positive' if prediction[0] == 1 else 'Negative'

# Quick smoke test on one hand-written review
new_review = "This product was great! Very helpful and works perfectly."
predicted_label = predict_sentiment_svm(new_review)
print(predicted_label)

# Confusion Matrix Plot

In [None]:
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Recompute the matrix from the current predictions and render it as a heat map
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

# ROC Curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Continuous decision scores (not hard labels) are what the ROC curve is built from
y_scores = svm_model.decision_function(X_test_tfidf)

# False/true positive rates across all thresholds, and the area under that curve
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    linewidth=2,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
# Dashed diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

# Precision-Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve, auc

# Precision/recall pairs across all thresholds of y_scores (computed in the ROC cell)
precision, recall, _ = precision_recall_curve(y_test, y_scores)
# Area under the precision-recall curve
auc_pr = auc(recall, precision)

plt.figure()
plt.plot(
    recall,
    precision,
    color='blue',
    linewidth=2,
    label='Precision-Recall curve (area = %0.2f)' % auc_pr,
)
# Shade the area under the curve
plt.fill_between(recall, precision, alpha=0.2, color='blue')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()


# Parameter tuning 

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for the SGDClassifier step of the pipeline
param_grid = {'sgdclassifier__alpha': [0.0001, 0.001, 0.01]}

# 5-fold grid search over alpha, scored by accuracy
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

# Full cross-validation log as a table
results = grid_search.cv_results_
df_results = pd.DataFrame(results)

# Show only the tuned parameter with its mean and spread of fold accuracy
important_columns = ['param_sgdclassifier__alpha', 'mean_test_score', 'std_test_score']
print(df_results.loc[:, important_columns])

In [None]:
# Evaluate the *tuned* model. GridSearchCV refits the best parameter setting on
# the full training data by default, so predict with that estimator instead of
# re-reporting the y_pred produced by the untuned model earlier.
print(f'Best parameters: {grid_search.best_params_}')
y_pred = grid_search.best_estimator_.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the SVM model: {accuracy}')
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix for whatever y_pred currently holds
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Cross-Validation 

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy on each of 5 folds of the training split
cross_val_scores = cross_val_score(
    estimator=svm_model,
    X=X_train_tfidf,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

# Summarise the folds by their mean and standard deviation
mean_cv_score = cross_val_scores.mean()
std_cv_score = cross_val_scores.std()

print(f"Mean cross-validation accuracy: {mean_cv_score:.4f}")
print(f"Standard deviation of cross-validation accuracy: {std_cv_score:.4f}")

# Displaying Sentiment Scores

In [None]:
# Binary sentiment label derived from Score (1 = positive, 0 = negative)
data.loc[:, "Sentiment"]

In [None]:
def probability_to_rating(prob_positive):
    """Map a positive-class probability linearly onto a 1-5 star rating.

    0.0 maps to 1 star and 1.0 to 5 stars; values in between are rounded to
    the nearest star, with exact halves rounded up.

    Args:
        prob_positive: Probability of the positive class. Values outside
            [0, 1] are clamped into that range.

    Returns:
        An int between 1 and 5 inclusive.

    Raises:
        ValueError: If ``prob_positive`` is NaN.
    """
    p = float(prob_positive)
    if p != p:  # NaN is the only value that is not equal to itself
        raise ValueError("prob_positive must not be NaN")
    # Clamp so slightly out-of-range inputs cannot yield ratings outside 1..5
    p = min(1.0, max(0.0, p))
    # Built-in round() rounds halves to the nearest even integer (2.5 -> 2,
    # 4.5 -> 4), which makes the star buckets uneven; round halves up instead.
    # The value is always positive, so int() truncation acts as floor().
    return int(1 + 4 * p + 0.5)

In [None]:
def map_sentiment_to_stars(prob_positive):
    """Bucket a positive-class probability into a 1-5 star rating.

    The probability must be strictly greater than a bucket's lower bound to
    earn that bucket's stars: > 0.9 -> 5, > 0.75 -> 4, > 0.55 -> 3,
    > 0.3 -> 2, anything else -> 1.
    """
    # (exclusive lower bound, stars), checked from the highest bucket down
    buckets = ((0.9, 5), (0.75, 4), (0.55, 3), (0.3, 2))
    for lower_bound, stars in buckets:
        if prob_positive > lower_bound:
            return stars
    return 1

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split


# Same scaler + hinge-loss SGD pipeline as the earlier SVM model.
# random_state is fixed (same seed as the train/test split) because SGD shuffles
# the training data; without it the probabilities and metrics change on every run.
svm_pipeline = make_pipeline(
    StandardScaler(with_mean=False),
    SGDClassifier(loss='hinge', random_state=42),
)

# The hinge-loss classifier is wrapped in CalibratedClassifierCV to obtain
# predict_proba, which the star-rating mapping needs. Sigmoid calibration is
# fitted with 5-fold cross-validation on the training split.
calibrated_clf = CalibratedClassifierCV(svm_pipeline, method='sigmoid', cv=5)

calibrated_clf.fit(X_train_tfidf, y_train)

# One row per test review; the positive-class probability is read from column 1 downstream
y_prob = calibrated_clf.predict_proba(X_test_tfidf)
print("Predicted probabilities:\n", y_prob)

# From here on y_pred holds the calibrated model's predictions
y_pred = calibrated_clf.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the SVM model: {accuracy}')

report = classification_report(y_test, y_pred)
print(report)

cm = confusion_matrix(y_test, y_pred)
print(cm)

# Predicting a 1-to-5 Star Rating for Each Review

In [None]:

# Printing every review in the test split would flood the notebook output,
# so only a small preview is shown. Raise N_PREVIEW to inspect more rows.
N_PREVIEW = 10

for i, probs in enumerate(y_prob[:N_PREVIEW]):
    # Column 1 of predict_proba is the probability of the positive class
    positive_prob = probs[1]
    star_rating = map_sentiment_to_stars(positive_prob)
    print(f"Review: {X_test.iloc[i]}")
    print(f"Star Rating: {star_rating} stars")
    print()

In [None]:

# Vectorize every review (train and test rows alike) with the fitted TF-IDF vocabulary
data_tfidf = vectorizer.transform(data['stemmed_text'])
y_prob_all = calibrated_clf.predict_proba(data_tfidf)

# Turn each positive-class probability (column 1) into a star rating
data['Star Rating'] = [map_sentiment_to_stars(p_pos) for p_pos in y_prob_all[:, 1]]

Displaying text and rating

In [None]:
# Stemmed review text next to its predicted star rating
print(data.loc[:, ['stemmed_text', 'Star Rating']])

Plotting the distribution of predicted star ratings across reviews.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart: number of reviews assigned to each predicted star rating
plt.figure(figsize=(10, 6))

sns.countplot(data=data, x='Star Rating', palette='viridis')

plt.xlabel('Star Ratings')
plt.ylabel('Frequency')
plt.title('Distribution of Star Ratings')

plt.show()

In [None]:
# First five rows of the final frame, including Sentiment and Star Rating
data.head(n=5)