**FINETUNED VERSION OF ROBERTA GIVING THE BEST ACCURACY**


In [1]:
import numpy as np 
import pandas as pd


#importing the training data
imdb_data=pd.read_csv('/kaggle/input/preprocessing/train.csv')
imdb_data_test=pd.read_csv('/kaggle/input/preprocessing/test.csv')
print(imdb_data.shape)

from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score
import time
from torch.nn.functional import softmax

# Assuming you have defined the tokenize_data function appropriately for RoBERTa
# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_data(reviews, labels, max_length):
    input_ids = []
    attention_masks = []

    for review in reviews:
        encoded_data = tokenizer.encode_plus(
            review,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=max_length,   # Max length to truncate/pad
            padding='max_length',    # Pad sentence to max length
            truncation=True,         # Truncate to max_length if longer
            return_attention_mask=True,  # Generate attention mask
            return_tensors='pt',     # Return PyTorch tensors
        )
        input_ids.append(encoded_data['input_ids'])
        attention_masks.append(encoded_data['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

# Assuming 'imdb_data' and 'imdb_data_test' are your dataframes and 
# 'review' is the column with text data, 'sentiment' is the label column
# Convert labels to 0 and 1
train_labels = [1 if label == 'positive' else 0 for label in imdb_data['sentiment']]
test_labels = [1 if label == 'positive' else 0 for label in imdb_data_test['sentiment']]

# Tokenize the dataset
# Note: Ensure your tokenize_data function is updated for RoBERTa if necessary
train_input_ids, train_attention_masks, train_labels = tokenize_data(imdb_data['review'], train_labels, 256)
test_input_ids, test_attention_masks, test_labels = tokenize_data(imdb_data_test['review'], test_labels, 256)

# Create DataLoader
batch_size = 16 # Adjust based on your GPU memory
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.cuda()  # Ensure you are using a GPU if available

# Prepare optimizer and schedule (linear warm-up and decay)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



# Training and evaluation loop with timing
total_start_time = time.time()

for epoch_i in range(0, epochs):
    print(f"Starting epoch {epoch_i+1}/{epochs}")
    
    # Training
    start_time = time.time()
    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to('cuda') for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = time.time() - start_time
    print(f"  Average training loss: {avg_train_loss}")
    print(f"  Training epoch took: {training_time}s")

    # Evaluation
    start_time = time.time()
    model.eval()
all_roberta_probs = []

for batch in test_dataloader:
    batch = tuple(t.to('cuda') for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Apply softmax to logits to get probabilities
    probs = softmax(outputs.logits, dim=1)
    all_roberta_probs.append(probs.detach().cpu().numpy())

# Concatenate all batch probabilities into a single array
roberta_test_probs = np.concatenate(all_roberta_probs, axis=0)


(30000, 2)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/2
  Average training loss: 0.24734761804379524
  Training epoch took: 1362.1350030899048s
Starting epoch 2/2
  Average training loss: 0.14990481061711908
  Training epoch took: 1371.8480942249298s


**BEST ML MODEL: XGBOOST**


In [2]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os
print(os.listdir("../input"))
import warnings
warnings.filterwarnings('ignore')

['preprocessing']


In [3]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [4]:
#importing the training data
imdb_data=pd.read_csv('/kaggle/input/preprocessing/train.csv')
imdb_data_test=pd.read_csv('/kaggle/input/preprocessing/test.csv')
print(imdb_data.shape)


def preprocess_text(text, method='stemming'):
    # Define the tokenizer
    tokenizer = ToktokTokenizer()
    
    # Define stopword list
    stopword_list = nltk.corpus.stopwords.words('english')
    
    # Removing HTML tags
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    
    # Removing text inside square brackets
    text = re.sub('\[[^]]*\]', '', text)
    
    # Define function for removing special characters
    def remove_special_characters(text, remove_digits=True):
        pattern = r'[^a-zA-z0-9\s]'
        text = re.sub(pattern, '', text)
        return text
    
    # Remove special characters
    text = remove_special_characters(text)
    
    # Tokenization
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    
    # Stemming or Lemmatization
    if method == 'stemming':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    elif method == 'lemmatization':
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Removing the stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    
    # Reconstruct the text
    text = ' '.join(filtered_tokens)
    
    return text

# Example usage
method = 'lemmatization' # or 'stemming'
imdb_data['review'] = imdb_data['review'].apply(lambda x: preprocess_text(x, method=method))
imdb_data_test['review'] = imdb_data_test['review'].apply(lambda x: preprocess_text(x, method=method))




#Count vectorizer for bag of words
cv=CountVectorizer(max_features = 60000, ngram_range = (1,3), max_df = 0.8, min_df = 3)
#transformed train reviews
cv_train_reviews=cv.fit_transform(imdb_data['review'])
cv_test_reviews=cv.transform(imdb_data_test['review'])


print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)





lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(imdb_data['sentiment'])
sentiment_data_test=lb.transform(imdb_data_test['sentiment'])
print(sentiment_data.shape)
print(sentiment_data_test.shape)
#Spliting the sentiment data
sentiment_data=sentiment_data[:30000]
sentiment_data_test=sentiment_data_test[:20000]

print(sentiment_data)
print(sentiment_data_test)


from xgboost import XGBClassifier
import time

# Initialize the XGBoost classifier model with the given hyperparameters
xgb = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=7,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss'
)

# Fit the XGBoost model for bag of words
print("Training XGBoost model...")
start_train_time = time.time()
xgb_bow = xgb.fit(cv_train_reviews, sentiment_data)
end_train_time = time.time()
print("Training completed. Time taken to train the model: {:.4f} seconds".format(end_train_time - start_train_time))

# Predict probabilities on the test set
start_predict_time = time.time()
xgb_bow_probs = xgb_bow.predict_proba(cv_test_reviews)
end_predict_time = time.time()
print("Prediction completed. Time taken to predict: {:.4f} seconds".format(end_predict_time - start_predict_time))




(30000, 2)
BOW_cv_train: (30000, 60000)
BOW_cv_test: (20000, 60000)
(30000, 1)
(20000, 1)
[[0]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
[[1]
 [1]
 [0]
 ...
 [0]
 [1]
 [0]]
Training XGBoost model...
Training completed. Time taken to train the model: 93.8842 seconds
Prediction completed. Time taken to predict: 0.9630 seconds


In [5]:
combined_probs = np.concatenate((roberta_test_probs, xgb_bow_probs), axis=1)
from sklearn.linear_model import LogisticRegression

# Split your combined_probs into a training and test/validation set if necessary
# For demonstration, assuming combined_probs is ready for training and you have corresponding labels
meta_classifier = LogisticRegression()
meta_classifier.fit(combined_probs, sentiment_data_test)
meta_predictions = meta_classifier.predict(combined_probs)  # Predict classes
meta_probabilities = meta_classifier.predict_proba(combined_probs)  # Predict probabilities

# Assuming you have the true labels for your test set in test_labels
from sklearn.metrics import accuracy_score
meta_accuracy = accuracy_score(sentiment_data_test, meta_predictions)
print(f"Meta-classifier Accuracy: {meta_accuracy}")

Meta-classifier Accuracy: 0.94115
