<a href="https://colab.research.google.com/github/Shrutimadhuri/Sentiment-analysis-of-hindi-movie-reviews/blob/main/Sentiment_analysis_hindi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Text Preprocessing**

In [1]:
pip install pandas nltk scikit-learn Flask



In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/train.csv')
valid_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/valid.csv')

**Tokenization and Script Validation**

In [4]:
def validate_script(text):
    # Remove any non-Devanagari characters
    return ''.join([char for char in text if re.match(r'[ऀ-ॿ]', char)])

train_df['clean_text'] = train_df['text'].apply(validate_script)
valid_df['clean_text'] = valid_df['text'].apply(validate_script)

print("First 10 rows after script validation:\n", train_df[['clean_text']].head(10))

First 10 rows after script validation:
                                           clean_text
0  चंद्रमोहनशर्माकोप्रड्यूसरऔरलीडऐक्टरअक्षयकुमारक...
1  अगरआपइसफिल्मकोदेखनेजारहेहैंतोसबसेपहलेतोआपयहजान...
2  बॉलीवुडवालेचोरीछिपेहॉलीवुडफिल्मोंसेकहानियांऔरद...
3  बैनरसंजयदत्तप्रोडक्शन्सप्रालिरुपालीओमएंटरटेनमे...
4  मेंघटितचर्चितनानावटीकांडमेंएकक्राइमथ्रिलरबननेक...
5  बैनरमिलेनियमफिल्म्सनिर्देशकपेट्रिकलुसिएरकलाकार...
6  चश्मेबद्दूरएकक्लासिकमूवीहैजिसेसईपरांजपेनेनिर्द...
7  दिल्लीकीएकपांचसिताराहोटलमेंडैनवरुणधवनहोटलमैनेज...
8  हसीनाएकघरेलूलड़कीथीलेकिनभाईकेभागजानेऔरपतिकीहत्...
9  जैसाकिफिल्मकेनामसेहीजाहिरहैकिहरबारकीतरहइसबारभी...


**Tokenization**

In [5]:
def tokenize(text):
    return word_tokenize(text)

train_df['tokens'] = train_df['clean_text'].apply(tokenize)
valid_df['tokens'] = valid_df['clean_text'].apply(tokenize)

print("First 10 rows after tokenization:\n", train_df[['tokens']].head(10))

First 10 rows after tokenization:
                                               tokens
0  [चंद्रमोहनशर्माकोप्रड्यूसरऔरलीडऐक्टरअक्षयकुमार...
1  [अगरआपइसफिल्मकोदेखनेजारहेहैंतोसबसेपहलेतोआपयहजा...
2  [बॉलीवुडवालेचोरीछिपेहॉलीवुडफिल्मोंसेकहानियांऔर...
3  [बैनरसंजयदत्तप्रोडक्शन्सप्रालिरुपालीओमएंटरटेनम...
4  [मेंघटितचर्चितनानावटीकांडमेंएकक्राइमथ्रिलरबनने...
5  [बैनरमिलेनियमफिल्म्सनिर्देशकपेट्रिकलुसिएरकलाका...
6  [चश्मेबद्दूरएकक्लासिकमूवीहैजिसेसईपरांजपेनेनिर्...
7  [दिल्लीकीएकपांचसिताराहोटलमेंडैनवरुणधवनहोटलमैने...
8  [हसीनाएकघरेलूलड़कीथीलेकिनभाईकेभागजानेऔरपतिकीहत...
9  [जैसाकिफिल्मकेनामसेहीजाहिरहैकिहरबारकीतरहइसबारभ...


**Stopword Removal**

In [6]:
# Custom list of Hindi stopwords
hindi_stop_words = ['और', 'के', 'का', 'है', 'में', 'से', 'को', 'पर', 'कि', 'हैं', 'इस', 'जो', 'तक', 'ही', 'ने', 'वह', 'था']

def remove_stopwords(tokens):
    return [word for word in tokens if word not in hindi_stop_words]

# Apply stopword removal on tokenized data
train_df['filtered_tokens'] = train_df['tokens'].apply(remove_stopwords)
valid_df['filtered_tokens'] = valid_df['tokens'].apply(remove_stopwords)

# Print first 10 rows after stopword removal
print("First 10 rows after stopword removal:\n", train_df[['filtered_tokens']].head(10))

First 10 rows after stopword removal:
                                      filtered_tokens
0  [चंद्रमोहनशर्माकोप्रड्यूसरऔरलीडऐक्टरअक्षयकुमार...
1  [अगरआपइसफिल्मकोदेखनेजारहेहैंतोसबसेपहलेतोआपयहजा...
2  [बॉलीवुडवालेचोरीछिपेहॉलीवुडफिल्मोंसेकहानियांऔर...
3  [बैनरसंजयदत्तप्रोडक्शन्सप्रालिरुपालीओमएंटरटेनम...
4  [मेंघटितचर्चितनानावटीकांडमेंएकक्राइमथ्रिलरबनने...
5  [बैनरमिलेनियमफिल्म्सनिर्देशकपेट्रिकलुसिएरकलाका...
6  [चश्मेबद्दूरएकक्लासिकमूवीहैजिसेसईपरांजपेनेनिर्...
7  [दिल्लीकीएकपांचसिताराहोटलमेंडैनवरुणधवनहोटलमैने...
8  [हसीनाएकघरेलूलड़कीथीलेकिनभाईकेभागजानेऔरपतिकीहत...
9  [जैसाकिफिल्मकेनामसेहीजाहिरहैकिहरबारकीतरहइसबारभ...


**Stemming and Lemmatization**

In [7]:
from nltk.stem import PorterStemmer

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Function to stem words using PorterStemmer
def stem_words(tokens):
    return [stemmer.stem(word) for word in tokens]

# Apply stemming to the filtered tokens
train_df['stemmed_tokens'] = train_df['filtered_tokens'].apply(stem_words)
valid_df['stemmed_tokens'] = valid_df['filtered_tokens'].apply(stem_words)

# Print first 10 rows after stemming
print("First 10 rows after stemming:\n", train_df[['stemmed_tokens']].head(10))

First 10 rows after stemming:
                                       stemmed_tokens
0  [चंद्रमोहनशर्माकोप्रड्यूसरऔरलीडऐक्टरअक्षयकुमार...
1  [अगरआपइसफिल्मकोदेखनेजारहेहैंतोसबसेपहलेतोआपयहजा...
2  [बॉलीवुडवालेचोरीछिपेहॉलीवुडफिल्मोंसेकहानियांऔर...
3  [बैनरसंजयदत्तप्रोडक्शन्सप्रालिरुपालीओमएंटरटेनम...
4  [मेंघटितचर्चितनानावटीकांडमेंएकक्राइमथ्रिलरबनने...
5  [बैनरमिलेनियमफिल्म्सनिर्देशकपेट्रिकलुसिएरकलाका...
6  [चश्मेबद्दूरएकक्लासिकमूवीहैजिसेसईपरांजपेनेनिर्...
7  [दिल्लीकीएकपांचसिताराहोटलमेंडैनवरुणधवनहोटलमैने...
8  [हसीनाएकघरेलूलड़कीथीलेकिनभाईकेभागजानेऔरपतिकीहत...
9  [जैसाकिफिल्मकेनामसेहीजाहिरहैकिहरबारकीतरहइसबारभ...


# **Model Training**

In [12]:
print(data['experience'].value_counts())

experience
2    335
0    293
1    270
Name: count, dtype: int64


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import joblib

# Step 1: Load your dataset
# Assuming your dataset is in CSV format with 'text' and 'experience' columns
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/train.csv')

# Step 2: Preprocess the dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize the input text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Step 3: Define training parameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Encode labels
label_encoder = LabelEncoder()
data['experience'] = label_encoder.fit_transform(data['experience'])

# Step 4: Create the DataLoader
train_dataset = SentimentDataset(
    texts=data['text'].to_numpy(),
    labels=data['experience'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Step 5: Load the BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_encoder.classes_))
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Step 6: Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{EPOCHS} - Loss: {loss.item()}')

# Step 7: Save the model and label encoder
torch.save(model.state_dict(), '/content/final_bert_model.pth')
joblib.dump(label_encoder, '/content/bert_label_encoder.joblib')

print("Model and label encoder saved successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Loss: 1.1025015115737915
Epoch 2/3 - Loss: 0.9939575791358948


In [33]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import joblib

# Step 1: Load the saved model and label encoder
model_path = '/content/final_bert_model.pth'
label_encoder_path = '/content/bert_label_encoder.joblib'

# Load the label encoder
label_encoder = joblib.load(label_encoder_path)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Load the BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_encoder.classes_))
model.load_state_dict(torch.load(model_path))
model.eval()  # Set the model to evaluation mode

# Step 2: Define a function to predict sentiment
def predict_sentiment(text):
    # Tokenize the input text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Perform inference
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Get the predicted class
    predicted_class = torch.argmax(logits, dim=1).item()
    sentiment = label_encoder.inverse_transform([predicted_class])[0]

    return sentiment

# Step 3: Get user input and predict sentiment
input_text = input("Enter a Hindi review: ")
sentiment = predict_sentiment(input_text)

# Step 4: Display the result
print(f"The sentiment of the review is: {sentiment}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))


Enter a Hindi review: यह उत्पाद बहुत अच्छा है। मुझे बहुत पसंद आया।
The sentiment of the review is: 2
