In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!pip install numpy
!pip install scikit-learn



In [6]:
# Import necessary libraries
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Define data paths
train_pos_dir = "/content/drive/MyDrive/acllmdb/aclImdb/train/pos"
train_neg_dir = "/content/drive/MyDrive/acllmdb/aclImdb/train/neg"
test_pos_dir = "/content/drive/MyDrive/acllmdb/aclImdb/test/pos"
test_neg_dir = "/content/drive/MyDrive/acllmdb/aclImdb/test/neg"

# Load data
def load_data(directory):
    """Read every file in ``directory`` as UTF-8 text.

    Args:
        directory: Path of a folder that contains one text file per review.

    Returns:
        list[str]: The contents of each file, ordered by sorted file name.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and any seeded shuffle applied later) reproducible.
    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            data.append(file.read())
    return data

# Read the four review folders into lists of raw review strings.
train_pos_data, train_neg_data, test_pos_data, test_neg_data = [
    load_data(folder)
    for folder in (train_pos_dir, train_neg_dir, test_pos_dir, test_neg_dir)
]

# Labels: 1.0 for every positive review followed by 0.0 for every negative one.
train_labels = np.concatenate([np.ones(len(train_pos_data)), np.zeros(len(train_neg_data))])
test_labels = np.concatenate([np.ones(len(test_pos_data)), np.zeros(len(test_neg_data))])

# Reviews in the same positive-then-negative order as the labels.
train_data = [*train_pos_data, *train_neg_data]
test_data = [*test_pos_data, *test_neg_data]

KeyboardInterrupt: ignored

In [None]:
import pandas as pd
column_names=["reviews"]

In [None]:
# One single-column frame per (split, polarity) list of raw reviews.
train_pos_df, train_neg_df, test_pos_df, test_neg_df = (
    pd.DataFrame(reviews, columns=column_names)
    for reviews in (train_pos_data, train_neg_data, test_pos_data, test_neg_data)
)

In [None]:
# Add the label column: 1 = positive, 0 = negative.
# Each label vector is sized from the frame it is inserted into; sizing all of
# them from train_pos_df only works while every folder holds the same number
# of reviews and raises a length-mismatch error otherwise.
train_pos_df.insert(1,"sentiment",np.ones(len(train_pos_df), dtype=np.int8))
train_neg_df.insert(1,"sentiment",np.zeros(len(train_neg_df), dtype=np.int8))
test_pos_df.insert(1,"sentiment",np.ones(len(test_pos_df), dtype=np.int8))
test_neg_df.insert(1,"sentiment",np.zeros(len(test_neg_df), dtype=np.int8))

In [None]:
# Stack positive then negative training reviews and number the rows from 1.
train_data_collected = [train_pos_df, train_neg_df]
train_data_df = pd.concat(objs=train_data_collected, ignore_index=True)
train_data_df.index = train_data_df.index + 1
display(train_data_df)

In [None]:
# Stack positive then negative test reviews and number the rows from 1.
test_data_collected = [test_pos_df, test_neg_df]
test_data_df = pd.concat(objs=test_data_collected, ignore_index=True)
test_data_df.index = test_data_df.index + 1
display(test_data_df)

In [None]:
# Training rows followed by test rows, renumbered from 1.
concatenated_df = pd.concat(objs=[train_data_df, test_data_df], ignore_index=True)
concatenated_df.index = concatenated_df.index + 1
concatenated_df

In [None]:
# Shuffle all rows with a fixed seed. reset_index() renumbers from 0 and keeps
# the previous 1-based row labels in a new "index" column.
train_data_shuffled = train_data_df.sample(frac=1, random_state=1)
train_data_shuffled = train_data_shuffled.reset_index()
test_data_shuffled = test_data_df.sample(frac=1, random_state=1)
test_data_shuffled = test_data_shuffled.reset_index()

In [None]:
train_data_shuffled

In [None]:
test_data_shuffled

In [None]:
# Work on the first 1000 shuffled rows of each split.
# .copy() makes these independent frames: later cells assign to their
# "reviews" column, which on a plain slice triggers SettingWithCopyWarning and
# may or may not write through to the shuffled parent frame.
train_data_small_df = train_data_shuffled.iloc[:1000].copy()
test_data_small_df = test_data_shuffled.iloc[:1000].copy()

In [None]:
train_data_small_df

In [None]:
test_data_small_df

In [None]:
# Small training subset followed by small test subset, renumbered from 1.
concatenated_small_df = pd.concat(objs=[train_data_small_df, test_data_small_df], ignore_index=True)
concatenated_small_df.index = concatenated_small_df.index + 1
concatenated_small_df

In [None]:
# Pull the text and label columns out of each small subset.
train_reviews, train_sentiments = (
    train_data_small_df[column] for column in ("reviews", "sentiment")
)
test_reviews, test_sentiments = (
    test_data_small_df[column] for column in ("reviews", "sentiment")
)
# Shapes of (text, label) for each split.
print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)

In [None]:
!pip install nltk
!pip install seaborn
!pip install matplotlib
!pip install beautifulsoup4
!pip install spacy
!pip install WordCloud
!pip install textblob

In [None]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from tqdm import tqdm

In [None]:
nltk.download('stopwords')

In [None]:
#Tokenization of text
# Shared tokenizer instance; remove_stopwords() below reads this global.
tokenizer=ToktokTokenizer()
#Setting English stopwords
# NLTK's English stop-word list as a Python list; requires the 'stopwords'
# corpus downloaded in the previous cell.
stopword_list=nltk.corpus.stopwords.words('english')

In [None]:
#Removing the html markup
def strip_html(text):
    """Return ``text`` with HTML tags removed.

    A single space is inserted between text fragments so that the words on
    either side of a tag (for example ``good.<br /><br />The``) are not fused
    into one token.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

#Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``."""
    # Raw string: in a normal literal "\[" is an invalid escape sequence and
    # newer Python versions warn about it.
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    """Strip HTML markup from ``text``, then drop any [bracketed] spans."""
    return remove_between_square_brackets(strip_html(text))

#Apply function on review column
train_data_small_df['reviews'] = train_data_small_df['reviews'].apply(func=denoise_text)
test_data_small_df['reviews'] = test_data_small_df['reviews'].apply(func=denoise_text)

In [None]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    """Delete every character that is not an ASCII letter or whitespace.

    Args:
        text: Input string.
        remove_digits: When True (the default) digits are removed as well;
            when False the digits 0-9 are kept.

    Returns:
        str: ``text`` without the unwanted characters.
    """
    # The upper-case range must be A-Z: "A-z" also spans [ \ ] ^ _ ` and
    # would let those punctuation characters through.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
#Apply function on review column
train_data_small_df['reviews'] = train_data_small_df['reviews'].apply(func=remove_special_characters)
test_data_small_df['reviews'] = test_data_small_df['reviews'].apply(func=remove_special_characters)

In [None]:
#Stemming the text
def simple_stemmer(text):
    """Porter-stem each whitespace-separated word and join them with single spaces."""
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)
#Apply function on review column
train_data_small_df['reviews'] = train_data_small_df['reviews'].apply(func=simple_stemmer)
test_data_small_df['reviews'] = test_data_small_df['reviews'].apply(func=simple_stemmer)

In [None]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Drop English stop words from ``text``.

    Args:
        text: Input string; it is split with the module-level ``tokenizer``.
        is_lower_case: When True tokens are compared as-is; when False each
            token is lower-cased before the comparison.

    Returns:
        str: The remaining tokens joined with single spaces.
    """
    # NOTE(review): the stemming cell runs before this one, so stemmed forms of
    # stop words (Porter turns "this" into "thi") will not match — confirm the
    # intended cell order.
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    # Membership is tested against the set built above (same words as
    # stopword_list) so each lookup is O(1) instead of a list scan.
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stop]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stop]
    return ' '.join(kept_tokens)
#Apply function on review column
train_data_small_df['reviews']=train_data_small_df['reviews'].apply(remove_stopwords)
test_data_small_df['reviews']=test_data_small_df['reviews'].apply(remove_stopwords)

In [None]:
norm_train_reviews= train_data_small_df["reviews"]
norm_test_reviews=train_data_small_df["reviews"]

In [None]:
norm_train_reviews[1]

In [None]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

In [None]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

In [None]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(concatenated_small_df['sentiment'])
print(sentiment_data.shape)

In [None]:
train_sentiments=sentiment_data[:1000]
test_sentiments=sentiment_data[:1000]

In [None]:
print(test_sentiments)

In [None]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentiments.ravel())
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_reviews,train_sentiments.ravel())
print(lr_tfidf)

In [None]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

In [None]:
#Accuracy score for bag of words
lr_bow_score = accuracy_score(y_true=test_sentiments, y_pred=lr_bow_predict)
print("lr_bow_score :", lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score = accuracy_score(y_true=test_sentiments, y_pred=lr_tfidf_predict)
print("lr_tfidf_score :", lr_tfidf_score)

In [None]:
#Classification report for bag of words
# target_names follow the sorted label order (0, then 1). Label 0 is the
# negative class in this notebook, so 'Negative' must come first.
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Negative','Positive'])
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Negative','Positive'])
print(lr_tfidf_report)

In [None]:
#confusion matrix for bag of words
# labels=[1, 0] puts the positive class in the first row and column.
cm_bow = confusion_matrix(y_true=test_sentiments, y_pred=lr_bow_predict, labels=[1, 0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf = confusion_matrix(y_true=test_sentiments, y_pred=lr_tfidf_predict, labels=[1, 0])
print(cm_tfidf)

In [None]:
import nltk
nltk.download('punkt')

In [9]:
import os
import nltk
import re
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')


def strip_html(text):
    """Return ``text`` with HTML tags removed.

    A single space is inserted between text fragments so that the words on
    either side of a tag (for example ``good.<br /><br />The``) are not fused
    into one token.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

# Step 1: Load and organize the dataset
# NOTE: despite their names, these four variables hold the split directory
# (train/ or test/); the 'pos' / 'neg' sub-folder is appended when listing.
train_pos_dir = "/content/drive/MyDrive/acllmdb/aclImdb/train/"
train_neg_dir = "/content/drive/MyDrive/acllmdb/aclImdb/train/"
test_pos_dir = "/content/drive/MyDrive/acllmdb/aclImdb/test/"
test_neg_dir = "/content/drive/MyDrive/acllmdb/aclImdb/test/"


def _list_review_files(split_dir, label_dir):
    """Return the full paths of the entries in ``split_dir/label_dir``.

    The names are sorted because os.listdir returns them in arbitrary order;
    sorting keeps the row order of the dataset reproducible between runs.
    """
    return [
        os.path.join(split_dir, label_dir, name)
        for name in sorted(os.listdir(os.path.join(split_dir, label_dir)))
    ]


positive_train_files = _list_review_files(train_pos_dir, 'pos')
negative_train_files = _list_review_files(train_neg_dir, 'neg')
positive_test_files = _list_review_files(test_pos_dir, 'pos')
negative_test_files = _list_review_files(test_neg_dir, 'neg')

# Step 2: Data Preprocessing
# Built once for the whole run: re-reading the stop-word corpus and creating a
# new stemmer for every review is wasted work.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML, lower-case, blank any leftover tags, drop
    ``[...]`` spans, delete characters other than ASCII letters, digits and
    whitespace, tokenize, keep alphanumeric non-stop-word tokens, Porter-stem.

    Args:
        text: Raw review text.

    Returns:
        str: The processed tokens joined with single spaces.
    """
    text = strip_html(text)
    text = text.lower()
    # Safety net for tag-like fragments that survive strip_html.
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\[[^]]*\]', '', text)
    # The upper-case range must be A-Z: "A-z" also spans [ \ ] ^ _ ` and
    # would let those punctuation characters through.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = [
        word
        for word in word_tokenize(text)
        if word.isalnum() and word not in _STOP_WORDS
    ]
    return ' '.join(_STEMMER.stem(word) for word in tokens)

def _load_labelled_reviews(positive_files, negative_files):
    """Read and preprocess the given review files.

    Args:
        positive_files: Paths of positive reviews (labelled 1).
        negative_files: Paths of negative reviews (labelled 0).

    Returns:
        tuple[list[str], list[int]]: Preprocessed texts and their labels, all
        positive reviews first, then all negative ones.
    """
    texts, labels = [], []
    for file_paths, label in ((positive_files, 1), (negative_files, 0)):
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as f:
                texts.append(preprocess_text(f.read()))
            labels.append(label)
    return texts, labels


# Load and preprocess training data
X_train, y_train = _load_labelled_reviews(positive_train_files, negative_train_files)

# Load and preprocess test data
X_test, y_test = _load_labelled_reviews(positive_test_files, negative_test_files)

# Step 3: Feature Extraction using TF-IDF
# The vocabulary and IDF weights are learned from the training texts only.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 4: Training
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_tfidf, y_train)

# Step 5: Evaluation
y_pred = classifier.predict(X_test_tfidf)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One "Name: value" line per metric, rounded to two decimals.
print("\n".join(
    f"{title}: {score:.2f}"
    for title, score in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-score", f1),
    )
))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


Accuracy: 0.88
Precision: 0.88
Recall: 0.88
F1-score: 0.88


In [None]:
# example of grid searching key hyperparametres for logistic regression
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Dataset: the TF-IDF training matrix and labels built in the cell above.
# Model and the hyperparameter values to try.
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# Grid search: every solver/penalty/C combination, scored by accuracy over
# 10-fold stratified cross-validation repeated 3 times.
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(X_train_tfidf, y_train)
# Summary: best combination first, then mean (std) accuracy for each one.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

In [None]:
from sklearn.svm import SVC
# Fit a support-vector classifier with default settings on the TF-IDF features.
classifier = SVC()
classifier.fit(X_train_tfidf, y_train)

# Step 5: Evaluation
y_pred = classifier.predict(X_test_tfidf)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One "Name: value" line per metric, rounded to two decimals.
print("\n".join(
    f"{title}: {score:.2f}"
    for title, score in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-score", f1),
    )
))

In [None]:
# example of grid searching key hyperparametres for SVC
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Model and the hyperparameter values to try.
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
# Grid search: every kernel/C/gamma combination, scored by accuracy over
# 10-fold stratified cross-validation repeated 3 times.
grid = {'kernel': kernel, 'C': C, 'gamma': gamma}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(X_train_tfidf, y_train)
# Summary: best combination first, then mean (std) accuracy for each one.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")