In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing the training data
imdb_data=pd.read_csv('/kaggle/input/preprocessing/train.csv')
imdb_data_test=pd.read_csv('/kaggle/input/preprocessing/test.csv')
print(imdb_data.shape)
imdb_data.head(10)

In [2]:
pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download necessary NLTK data
nltk.download('stopwords')

# Data Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    stemmer = SnowballStemmer('english')
    tokens = text.split()
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stopwords.words('english')]
    return ' '.join(filtered_tokens)

# Assuming 'data.csv' is your dataset file with columns ['review', 'label']
train_df = pd.read_csv('/kaggle/input/preprocessing/train.csv')
test_df = pd.read_csv('/kaggle/input/preprocessing/test.csv')



# Preprocess reviews
train_df['processed_review'] = train_df['review'].apply(preprocess_text)
test_df['processed_review'] = test_df['review'].apply(preprocess_text)

# Initialize VADER
analyzer = SentimentIntensityAnalyzer()

# Apply VADER to get sentiment scores for both datasets
train_df['vader_score'] = train_df['processed_review'].apply(lambda review: analyzer.polarity_scores(review)['compound'])
test_df['vader_score'] = test_df['processed_review'].apply(lambda review: analyzer.polarity_scores(review)['compound'])

# Prepare features and labels for the training set
X_train = train_df['processed_review']
y_train = train_df['sentiment']
X_test = test_df['processed_review']
y_test = test_df['sentiment']

# Vectorize the text
tfidf_vectorizer = TfidfVectorizer(max_features=50000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Add VADER scores as a feature
X_train_vader = train_df['vader_score'].values.reshape(-1, 1)
X_test_vader = test_df['vader_score'].values.reshape(-1, 1)

# Combine TF-IDF features with VADER scores
X_train_combined = np.hstack((X_train_tfidf.toarray(), X_train_vader))
X_test_combined = np.hstack((X_test_tfidf.toarray(), X_test_vader))

# Train the model on the training set
model = RandomForestClassifier()
model.fit(X_train_combined, y_train)

# Predict on the test set
predictions = model.predict(X_test_combined)

# Evaluate and print the accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set:", accuracy)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Accuracy on test set: 0.8533


In [None]:
from transformers import DistilBertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the dataset
def tokenize_data(reviews, labels, max_length):
    input_ids = []
    attention_masks = []

    for review in reviews:
        encoded_data = tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_data['input_ids'])
        attention_masks.append(encoded_data['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

# Assuming imdb_data['review'] and imdb_data_test['review'] are your datasets
# and imdb_data['sentiment'], imdb_data_test['sentiment'] are the labels
# Convert labels to 0 and 1
train_labels = [1 if label == 'positive' else 0 for label in imdb_data['sentiment']]
test_labels = [1 if label == 'positive' else 0 for label in imdb_data_test['sentiment']]

# Tokenize training and validation datasets
max_length = 256 # You can adjust this
train_input_ids, train_attention_masks, train_labels = tokenize_data(imdb_data['review'], train_labels, max_length)
test_input_ids, test_attention_masks, test_labels = tokenize_data(imdb_data_test['review'], test_labels, max_length)

# Create DataLoader
batch_size = 16 # Adjust based on your GPU memory

train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [None]:
from transformers import DistilBertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import random

# Load DistilBERT pre-trained model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels = 2, # Binary classification (positive, negative)
    output_attentions = False,
    output_hidden_states = False,
)

# Ensure model is running on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer and scheduler for training
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 10 # Number of training epochs, adjust as needed
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=total_steps
)


In [None]:
from sklearn.metrics import accuracy_score

# Function to calculate accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
for epoch_i in range(0, epochs):
    # Training
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()
        
        outputs = model(b_input_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

    # Evaluation
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0

    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    
    print(f"Accuracy: {eval_accuracy/nb_eval_steps}")
