In [1]:
# Mount Google Drive at /content/drive so the CSV read further down is reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Location of the raw review export on the mounted Drive; edit here to point at another copy.
AMAZON_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/Data/amazon_data.csv'

# Raw reviews, exactly as stored in the CSV.
amazon_df = pd.read_csv(AMAZON_CSV_PATH)

In [4]:
# Debug aid: uncomment to work on only the first 20 rows while iterating on the pipeline.
#amazon_df=amazon_df[:20]

In [5]:
# Peek at the first rows of the frame as loaded, before any preprocessing is applied.
amazon_df.head()

Unnamed: 0,Review
0,Pros:\n- Pairs and switches well with multiple...
1,Sound quality is okay for this price. They are...
2,"Easy to operate, good product quality, very sa..."
3,"Honestamente, el sonido que emite el audífono ..."
4,I was hoping for it to blow my head off but it...


In [6]:
# (rows, columns) of the frame as loaded; compare with the shape after filtering below.
amazon_df.shape

(20011, 1)

**Pre-processing**

In [7]:
pip install langdetect



In [8]:
pip install nltk



In [9]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fetch the NLTK data packages used below, in a fixed order.
# 'omw-1.4' is downloaded because WordNetLemmatizer needs it.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared helpers for the preprocessing functions: one lemmatizer instance and the
# English stop-word list turned into a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
# Fix langdetect's seed so language detection gives consistent results from run to run.
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code langdetect reports for ``text``.

    If langdetect raises ``LangDetectException`` the string ``'unknown'`` is
    returned instead of propagating the error.
    """
    try:
        language = detect(text)
    except LangDetectException:
        language = 'unknown'
    return language

def clean_text(review):
    """Strip markup, URLs and non-letter characters from a review.

    Steps, in order:
      1. HTML tags are replaced by a single space, so the words on either side
         of a tag such as ``<br />`` are not fused together.
      2. URLs (runs starting with ``http`` or ``www``) are removed.
      3. Every character that is not an ASCII letter or whitespace is removed;
         this also drops digits and punctuation.

    Args:
        review: The raw review text (``str``).

    Returns:
        The cleaned text. Case is preserved and whitespace is not collapsed;
        callers that need normalised spacing should split and re-join.
    """
    # Tags go first: stripping URLs first would swallow the tail of
    # `<a href="http://...">text</a>` and lose the link text.
    # Only `<` followed by a letter (optionally after `/`) counts as a tag, so
    # plain text such as "I <3 it ... > expected" is not deleted wholesale.
    review = re.sub(r'</?[A-Za-z][^<>]*>', ' ', review)
    # Remove URLs (`http\S+` already covers https).
    review = re.sub(r'http\S+|www\S+', '', review)
    # Keep only alphabets and whitespace (digits are covered by this class too).
    review = re.sub(r'[^A-Za-z\s]+', '', review)
    return review

def remove_stopwords(review):
    """Drop every whitespace-separated token found in ``stop_words``.

    The surviving tokens are re-joined with single spaces, in their original
    order. Matching is exact, so callers should lower-case the text first if
    the stop-word set is lower-case.
    """
    kept_tokens = filter(lambda token: token not in stop_words, review.split())
    return ' '.join(kept_tokens)

def lemmatize_text(review):
    """Lemmatize each whitespace-separated token and re-join with single spaces.

    Each token is passed to ``lemmatizer.lemmatize`` on its own, with no
    part-of-speech argument, so the lemmatizer's default applies.
    """
    return ' '.join(map(lemmatizer.lemmatize, review.split()))

def preprocess_review(review):
    """Run the full cleaning pipeline on one review.

    Returns the cleaned, lower-cased, stop-word-free, lemmatized text, or
    ``None`` when the input is not a string, is not detected as English, or
    has nothing left after cleaning.
    """
    # Anything that is not a str cannot be processed.
    if not isinstance(review, str):
        return None

    # Language is detected on the raw text; only English reviews are kept.
    if detect_language(review) != 'en':
        return None

    # Clean -> lower-case -> drop stop words -> lemmatize.
    cleaned = clean_text(review).lower()
    cleaned = lemmatize_text(remove_stopwords(cleaned))

    # An all-whitespace or empty result is reported as missing.
    cleaned = cleaned.strip()
    return cleaned if cleaned else None

# To try the pipeline on toy data, substitute a small frame with a 'Review' column, e.g.
# amazon_df = pd.DataFrame({'Review': ['This product is great!', 'Este producto es excelente.']})

# Add the cleaned text as 'Cleaned_Reviews', then drop the rows where preprocessing
# returned None (non-string, non-English, or empty after cleaning).
amazon_df = (
    amazon_df
    .assign(Cleaned_Reviews=amazon_df['Review'].apply(preprocess_review))
    .dropna(subset=['Cleaned_Reviews'])
)

In [11]:
# (rows, columns) after rows without a cleaned review were dropped and the new column was added.
amazon_df.shape

(16695, 2)

In [12]:
# Compare raw 'Review' text with its 'Cleaned_Reviews' counterpart for the first rows.
amazon_df.head()

Unnamed: 0,Review,Cleaned_Reviews
0,Pros:\n- Pairs and switches well with multiple...,pro pair switch well multiple device android p...
1,Sound quality is okay for this price. They are...,sound quality okay price light comfortable sma...
2,"Easy to operate, good product quality, very sa...",easy operate good product quality satisfied
4,I was hoping for it to blow my head off but it...,hoping blow head still attached pretty quiet e...
5,"Despite not being Apple airpods, these earbuds...",despite apple airpods earbuds trick responsive...


**Zero-shot classification for pseudo-labelling**

In [13]:
pip install transformers datasets



In [14]:
import pandas as pd
from transformers import pipeline
from datasets import Dataset

# Step 1: Pick the device for the zero-shot model: index 0 when CUDA is
# available, otherwise -1.
import torch
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Build the zero-shot classification pipeline on that device.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# Step 2: Sentiment classes offered to the zero-shot classifier.
candidate_labels = ['positive', 'negative', 'neutral']

# Step 3: Wrap the cleaned reviews in a Hugging Face Dataset so they can be mapped
# over in batches. amazon_df must already exist; a toy stand-in would be:
# amazon_df = pd.DataFrame({'Cleaned_Reviews': ['great product', 'worst experience', 'just okay', 'love it', 'hate it']})
dataset = Dataset.from_pandas(
    amazon_df[['Cleaned_Reviews']]
)

# Step 4: Function to classify each review using Zero-Shot Classification
def classify_batch(batch):
    """Assign a pseudo-label to every review in a batch.

    ``batch['Cleaned_Reviews']`` is passed to ``classifier`` together with
    ``candidate_labels``. For each result the first entry of its ``'labels'``
    list is taken as the label with the highest score.

    Returns:
        ``{'Pseudo_Labels': [...]}`` with one label per input review, in order.
    """
    predictions = classifier(batch['Cleaned_Reviews'], candidate_labels)
    top_labels = []
    for prediction in predictions:
        top_labels.append(prediction['labels'][0])
    return {'Pseudo_Labels': top_labels}

# Step 5: Label the whole dataset, 16 reviews per call to classify_batch
# (tune batch_size to the available GPU memory).
dataset = dataset.map(
    classify_batch,
    batched=True,
    batch_size=16,
)

# Step 6: Back to pandas for inspection and export.
classified_df = dataset.to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/16695 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [15]:
# Spot-check the first few pseudo-labels next to the text they were assigned to.
classified_df.head()

Unnamed: 0,Cleaned_Reviews,__index_level_0__,Pseudo_Labels
0,pro pair switch well multiple device android p...,0,positive
1,sound quality okay price light comfortable sma...,1,negative
2,easy operate good product quality satisfied,2,positive
3,hoping blow head still attached pretty quiet e...,4,positive
4,despite apple airpods earbuds trick responsive...,5,positive


In [16]:
# Persist only the cleaned text and its pseudo-label; the row index is not written.
classified_df.to_csv(
    'data_cleaned.csv',
    columns=['Cleaned_Reviews', 'Pseudo_Labels'],
    index=False,
)