In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Fetch the NLTK resources this notebook asks for. The 'stopwords' corpus is
# read below via stopwords.words('english').
# NOTE(review): 'punkt' is presumably for word_tokenize, which only appears in
# a commented-out cell — confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Load the review dataset into a DataFrame.
# NOTE(review): hard-coded absolute path — looks like a Colab upload location;
# confirm before running in another environment.
df = pd.read_csv('/content/CHATGPT.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Show the raw text of the review in the first row (positional lookup).
df.iloc[0]['Review']

In [None]:
# List the distinct raw label values before they are encoded as integers.
df['label'].unique()

In [None]:
def _encode_label(value):
  """Map a raw sentiment label to its integer class.

  Strings become 0 when they equal 'negative' (case-insensitive) and 1
  otherwise. Non-string values — a missing label, or an integer left behind
  by a previous run of this cell — are returned unchanged instead of raising
  AttributeError on `.lower()`.
  """
  if isinstance(value, str):
    return 0 if value.lower() == 'negative' else 1
  return value


# This cell runs before the null check and dropna further down, so missing
# labels must pass through untouched here and be removed by that later step.
df['label'] = df['label'].apply(_encode_label)

In [None]:
# Preview again now that 'label' has been overwritten with the integer encoding.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Column dtypes and non-null counts before incomplete rows are dropped.
df.info()

In [None]:
# Drop every row that has a missing value in any column. Rebinding instead of
# inplace=True keeps the cell idempotent, and resetting the index keeps the row
# labels contiguous so that label-based lookups such as df['cleaned_text'][0]
# further down still resolve to the first remaining row.
df = df.dropna().reset_index(drop=True)

In [None]:
# Same summary after dropping rows, to check the remaining non-null counts.
df.info()

In [None]:
# Class balance of the encoded labels (0 = 'negative', 1 = any other label).
df['label'].value_counts()

In [None]:
# English stop words held in a set; stemming() tests each token against it.
stop_words = set(stopwords.words('english'))

In [None]:
# Single shared Porter stemmer instance, used by stemming().
port_stem = PorterStemmer()

In [None]:
def stemming(content):
  """Normalize one review into a space-separated string of stemmed tokens.

  Steps: replace every character outside a-z/A-Z with a space, lowercase,
  split on whitespace, discard tokens found in the module-level `stop_words`
  set, and stem the survivors with the module-level `port_stem`.

  Args:
    content: the raw review text (a string).

  Returns:
    The processed tokens joined by single spaces; an empty string when no
    token survives.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()

  kept_stems = []
  for token in tokens:
    if token in stop_words:
      continue
    kept_stems.append(port_stem.stem(token))

  return ' '.join(kept_stems)

In [None]:
# Build the model input column: one cleaned, stemmed string per review.
df['cleaned_text'] = df['Review'].apply(stemming)

In [None]:
# df['Review'] = df['Review'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

In [None]:
# df['cleaned_text'] = df['Review'].apply(lambda x: ' '.join([word for word in word_tokenize(x.lower()) if word not in stop_words]))

In [None]:
# Inspect the first cleaned review. Positional .iloc does not depend on index
# labels, which dropping rows can leave non-contiguous (a label-based [0]
# raises KeyError when the row labelled 0 is gone).
df['cleaned_text'].iloc[0]

In [None]:
# Turn the cleaned reviews into TF-IDF features (X) and pull out the integer
# labels as the target vector (y).
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in the next cell, so its vocabulary and IDF weights also see the
# held-out rows.
# NOTE(review): .toarray() converts the result to a dense array; memory grows
# with rows x vocabulary size — confirm this fits for the full dataset.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text']).toarray()
y = df['label'].values

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE(review): no stratify= is passed, so class proportions may differ
# between the training and test portions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Create and train a linear-kernel SVM classifier (balanced class weights,
# probability estimates enabled).
# NOTE(review): gamma is presumably ignored when kernel='linear' — confirm
# against the SVC documentation and drop it if so.
model = SVC(probability = True, C=1, kernel='linear', gamma='auto', class_weight = 'balanced')
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model: fraction of test rows predicted correctly, as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")



In [None]:
# KFold was used here without ever being imported (NameError on a fresh
# kernel); clone() gives every fold its own estimator instance.
from sklearn.base import clone
from sklearn.model_selection import KFold

# 5-fold cross-validation of an SVC. No kernel/C is passed, so scikit-learn's
# defaults apply — this is a different configuration from the linear SVC above.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
base_model = SVC(probability=True, class_weight='balanced')

best_fold = -1
best_accuracy = -1
best_model = None

accuracies = []
for fold_number, (train_index, test_index) in enumerate(kf.split(X), start=1):
  X_train_fold, X_test_fold = X[train_index], X[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]

  # Fit a fresh, unfitted copy per fold. Refitting one shared estimator would
  # make `best_model` an alias of whichever fold happened to be trained last.
  model = clone(base_model)
  model.fit(X_train_fold, y_train_fold)

  y_pred = model.predict(X_test_fold)

  accuracy = accuracy_score(y_test_fold, y_pred)
  accuracies.append(accuracy)

  if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_fold = fold_number
    best_model = model

  print(f"Fold {fold_number} accuracy: {accuracy:.4f}")

# After the loop, `model`, `y_pred`, `X_test_fold` and `y_test_fold` belong to
# the final fold; `best_model` is the estimator from the highest-scoring fold.
print(f"\nMean accuracy across folds: {np.mean(accuracies):.4f}")
print(f"Best performing fold: {best_fold}")
print(f"Best fold accuracy: {best_accuracy:.4f}")

In [None]:
import pickle

# Persist the classifier selected during cross-validation (`best_model`) —
# the previous code saved `model`, i.e. whatever estimator was fitted last —
# together with the fitted TfidfVectorizer, so new text can be vectorized the
# same way before prediction.
with open('sentiment_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)


# **Confusion matrix**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix for the y_test_fold / y_pred pair left behind by the
# cross-validation loop (its final iteration).
conf_matrix = confusion_matrix(y_test_fold, y_pred)
class_names = ['Negative', 'Positive']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion matrix')
ax.set_xlabel('Prediction label')
ax.set_ylabel('True labels')
plt.show()


# **ROC Curve**

In [None]:
from sklearn.metrics import roc_curve, auc

# `y_pred_proba` was never defined in this notebook. `model` was last fitted
# inside the cross-validation loop, so the rows it has not seen are
# X_test_fold / y_test_fold (the same pair the confusion matrix uses); the
# earlier y_test split overlaps that model's training rows.
positive_col = list(model.classes_).index(1)  # predict_proba column for label 1
y_pred_proba = model.predict_proba(X_test_fold)[:, positive_col]

fpr, tpr, thresholds = roc_curve(y_test_fold, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (area = {roc_auc:.2f})')
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0,1], color='red', linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.legend()
plt.show()