# **Importing Libraries**

In [None]:
!pip install datasets



In [None]:
import re
import nltk
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on:
#   - 'stopwords': the English stop-word list read via stopwords.words('english')
#   - 'wordnet':   the WordNet corpus for the WordNetLemmatizer imported above
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**loading the IMDB dataset**

In [None]:
# Fetch the IMDB dataset and keep handles on its two official splits.
data = load_dataset('imdb')
train_data, test_data = data['train'], data['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Pool both official splits into one corpus (train rows first, then test rows);
# the notebook re-splits it later. `sentiment` is kept in the same order as `review`.
train = [row['text'] for row in train_data]
test = [row['text'] for row in test_data]
review = [*train, *test]
sentiment = [row['label'] for split in (train_data, test_data) for row in split]

# **Preprocessing the imdb data in a single function**

In [None]:
# English stop words from NLTK, stored as a set so the per-word
# `not in stopwords_list` membership test in preprocess() is a hash lookup.
stopwords_list = set(stopwords.words('english'))

# Matches a single HTML/XML-style tag such as "<br />" or "<i>".
tag_re = re.compile(r"<[^>]+>")

def remove_tags(text):
    """Return `text` with every HTML tag replaced by a single space.

    A space (rather than an empty string) is substituted so that words
    separated only by a tag, e.g. "good<br />movie", are not fused into one
    token ("goodmovie"). Callers that need tidy spacing should collapse
    whitespace afterwards.

    Args:
        text: The raw string, possibly containing HTML tags.

    Returns:
        The string with each tag replaced by ' '.
    """
    return tag_re.sub(' ', text)

def preprocess(text):
    """Normalize one review into a space-separated string of content words.

    Steps: lowercase, strip HTML tags, replace every non-letter character
    with a space, drop single-letter words, then remove English stop words.

    Punctuation and digits are replaced by a space instead of being deleted
    so that neighbouring words are not glued together ("great...but" becomes
    "great but", not "greatbut") and so that contractions split into pieces
    the single-letter and stop-word filters can remove ("don't" -> "don t").

    Args:
        text: The raw review text.

    Returns:
        The cleaned text with tokens joined by single spaces ('' if nothing
        survives the filters).
    """
    text = text.lower()

    # Remove HTML tags
    text = remove_tags(text)

    # Replace punctuation, digits and any other non-letter with a space.
    # The text is already lowercase, so a-z covers every letter that is kept.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Remove single characters (e.g. the "t" left over from "don t")
    text = re.sub(r"\b[a-z]\b", '', text)

    # Tokenize on whitespace (split() also discards repeated/leading/trailing
    # spaces) and remove stopwords
    words = [word for word in text.split() if word not in stopwords_list]

    return ' '.join(words)

**Applying the preprocess function to the IMDB text data**

In [None]:
# Clean every review; the result stays aligned index-for-index with `review`.
x = [preprocess(doc) for doc in review]

# **Lemmatizing the text**

In [None]:
# Tokenizer resources for nltk.tokenize.word_tokenize, which the lemmatization
# cell below calls.
# NOTE(review): 'punkt_tab' is presumably the resource newer NLTK releases look
# up instead of 'punkt' -- confirm which one the installed version needs.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize

# One shared lemmatizer instance reused for every review.
lematizer = WordNetLemmatizer()

def lematize(text):
    """Tokenize `text` and return its lemmatized tokens joined by spaces.

    Each token is passed to WordNetLemmatizer.lemmatize with that method's
    default arguments.
    """
    return ' '.join(map(lematizer.lemmatize, word_tokenize(text)))

In [None]:
# Lemmatize every cleaned review, preserving order.
x_lematized = list(map(lematize, x))

**Converting the text into numeric features using Term Frequency-Inverse Document Frequency (TF-IDF)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from every lemmatized review and
# transform them in the same call.
# NOTE(review): the fit runs on the whole corpus before the train_test_split
# cell further down, so the rows that later become the test set also shape the
# vocabulary/IDF statistics -- consider fitting on the training rows only.
# NOTE(review): `x` held the list of preprocessed strings until now and is
# rebound here to the feature matrix, so re-running the lemmatization cell
# after this one no longer iterates over the strings.
x = vectorizer.fit_transform(x_lematized)

In [None]:
# Show the TF-IDF entries of the first five reviews, each followed by a
# 1000-character underscore rule.
separator = '_' * 1000
for row_idx in range(5):
    print(x[row_idx])
    print(separator)

  (0, 147352)	0.06887165744892833
  (0, 40261)	0.43163979086357285
  (0, 191879)	0.05103366923832112
  (0, 170093)	0.06341512274416986
  (0, 36485)	0.0915987965354543
  (0, 173807)	0.08213501373585433
  (0, 64729)	0.06237970433627787
  (0, 146763)	0.05638557253809167
  (0, 5489)	0.03089083548910607
  (0, 79493)	0.05438797765084294
  (0, 157091)	0.1178250011423481
  (0, 40410)	0.09316446837947953
  (0, 57540)	0.034517189411346454
  (0, 184650)	0.058776008635458406
  (0, 55739)	0.07623589830465599
  (0, 37765)	0.05643781951133665
  (0, 178763)	0.06949279056901347
  (0, 60877)	0.042142107624914556
  (0, 63298)	0.08036879938586203
  (0, 35913)	0.13075712669574147
  (0, 36480)	0.07984120637884981
  (0, 145020)	0.08560638910590448
  (0, 156655)	0.02731785790963409
  (0, 119242)	0.12967365581816623
  (0, 136334)	0.067683669217308
  :	:
  (0, 93972)	0.04926885352501424
  (0, 66583)	0.07789364112799925
  (0, 63981)	0.10843783114239668
  (0, 34260)	0.10514179186104686
  (0, 63702)	0.057751311774

In [None]:
# Target vector: the sentiment labels, in the same order as the rows of `x`.
y = sentiment

**Splitting the processed data into train test split for model training**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

**Training the model using LogisticRegression model**

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyperparameters, fitted on the
# training split and scored on the held-out split.
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

for label, metric in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(label, metric(y_test, y_pred))

Accuracy:  0.8912
Precision:  0.880449792858552
Recall:  0.9025278058645096
F1 Score:  0.8913521070501298


**Tuning hyperparameters of the LogisticRegression model**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values explored by the search (4 x 3 = 12 combinations).
param_grid = {
    "C": [0.01, 0.1, 1, 10],
    "max_iter": [100, 200, 300]
}

# Only 10 of the 12 combinations are sampled (the default n_iter), so
# random_state is pinned: without it a different subset can be drawn on every
# run and the reported "best" parameters are not reproducible.
# The variable keeps the name `random` used throughout this notebook; note that
# it would shadow the standard-library `random` module if that were imported.
random = RandomizedSearchCV(
    lr,
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
random.fit(x_train, y_train)

# Score the best estimator found by the search on the held-out split.
best_model = random.best_estimator_
y_pred = best_model.predict(x_test)

# Precision and recall are support-weighted averages here, whereas the baseline
# cell above prints the default (positive-class) scores, so the two sets of
# numbers are not directly comparable.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')


print(f"Best Parameters: {random.best_params_}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'max_iter': 200, 'C': 10}
Accuracy: 0.8945
Precision: 0.8947
Recall: 0.8945


**Training the model using naive bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline: multinomial naive Bayes with default hyperparameters, fitted on
# the training split and scored on the held-out split.
nb = MultinomialNB()
nb.fit(x_train, y_train)
y_pred = nb.predict(x_test)

for label, metric in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(label, metric(y_test, y_pred))

Accuracy:  0.8672
Precision:  0.8741982205669356
Recall:  0.8543983822042467
F1 Score:  0.8641849048885253


**Tuning Hyperparameters of Naive bayes model**

In [None]:
# Smoothing strengths to try for MultinomialNB.
params = {
    "alpha": [0.1, 1.0, 10.0]
}

# The search space has only three candidates. n_iter is set to that size so
# every candidate is evaluated exactly once; leaving the default n_iter=10
# makes scikit-learn warn that the parameter space is smaller than n_iter.
# random_state fixes the order in which the candidates are drawn.
random = RandomizedSearchCV(
    nb,
    params,
    n_iter=len(params["alpha"]),
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
random.fit(x_train, y_train)

# Score the best estimator found by the search on the held-out split.
best_model = random.best_estimator_
y_pred = best_model.predict(x_test)

# Precision and recall are support-weighted averages here, whereas the baseline
# naive Bayes cell prints the default (positive-class) scores.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')


print(f"Best Parameters: {random.best_params_}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")



Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best Parameters: {'alpha': 1.0}
Accuracy: 0.8672
Precision: 0.8674
Recall: 0.8672


**Training the model using a support vector machine**

Training a support vector machine on all of `x_train` and `y_train` takes a long time, so the training set is reduced to a 10% subsample (`test_size=0.9` in the split below) for faster processing. Even so, the accuracy of the SVC is approximately equal to that of the other two models.

**Verifying the samples are evenly distributed**

In [None]:
# The SVC subsample must exist before it can be inspected. This check used to
# read y_train_svc ahead of the cell that creates it, which raises NameError on
# a fresh kernel (Restart & Run All). The split is therefore made here, with
# the same arguments and seed as the SVC split cell, so both yield the
# identical subsample.
x_train_svc, x_test_svc, y_train_svc, y_test_svc = train_test_split(
    x_train, y_train, test_size=0.9, random_state=42
)

# Count the positive (label 1) samples; the remainder are negative.
count = sum(1 for label in y_train_svc if label == 1)
print(count)
print(len(y_train_svc) - count)

2026
1974


In [None]:
# Keep 10% of the training split for the SVC (test_size=0.9 sends the other
# 90% to the unused *_test_svc halves); the seed makes the subsample repeatable.
x_train_svc, x_test_svc, y_train_svc, y_test_svc = train_test_split(
    x_train,
    y_train,
    test_size=0.9,
    random_state=42,
)

In [None]:
from sklearn.svm import SVC

# Baseline: SVC with default hyperparameters, fitted on the reduced training
# subsample but scored on the same held-out split as the other models.
svc = SVC()
svc.fit(x_train_svc, y_train_svc)
y_pred = svc.predict(x_test)

for label, metric in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(label, metric(y_test, y_pred))

Accuracy:  0.8551
Precision:  0.8246656760772659
Recall:  0.897876643073812
F1 Score:  0.8597153645076968


**Tuning hyperparameters for Support vector machine model**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values explored by the search (3 x 2 x 2 = 12 combinations).
param_grids = {
    "C": [0.1, 1, 10],
    "kernel": ['linear', 'rbf'],
    "gamma": ['scale', 'auto']
}

# Only 10 of the 12 combinations are sampled (the default n_iter), so
# random_state is pinned: without it a different subset can be drawn on every
# run and the reported "best" parameters are not reproducible.
random = RandomizedSearchCV(
    svc,
    param_grids,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
# The search runs on the reduced SVC subsample to keep training time manageable.
random.fit(x_train_svc, y_train_svc)

# Score the best estimator found by the search on the full held-out split.
best_model = random.best_estimator_
y_pred = best_model.predict(x_test)

# Precision and recall are support-weighted averages here, whereas the baseline
# SVC cell prints the default (positive-class) scores.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')


print(f"Best Parameters: {random.best_params_}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'kernel': 'linear', 'gamma': 'scale', 'C': 1}
Accuracy: 0.8574
Precision: 0.8584
Recall: 0.8574
