In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.6 MB/s[0m eta [36m0:00:0

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from transformers import pipeline, AutoTokenizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from wordcloud import WordCloud

# Load the dataset
df = pd.read_csv('barbie_Cleaned.csv')

# Remove non-numeric ratings and convert to integers.
# pd.to_numeric(errors='coerce') works whether pandas parsed the column as strings or
# as numbers (the .str accessor only exists on string columns) and turns anything
# unparsable, including missing values, into NaN instead of raising.
numeric_rating = pd.to_numeric(df['rating'], errors='coerce')
is_whole_rating = numeric_rating.notna() & (numeric_rating >= 0) & (numeric_rating % 1 == 0)
# Rows without review text are dropped too: every later step tokenizes df['text'].
# .copy() makes the filtered frame independent, so the assignment below cannot
# trigger SettingWithCopyWarning.
df = df[is_whole_rating].dropna(subset=['text']).copy()
df['rating'] = numeric_rating.loc[df.index].astype(int)

# Tokenize the reviews and remove stop words
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Lower-case each token BEFORE the stop-word test: the NLTK list is lower-case, so
# checking the original-case token lets capitalised stop words ("The", "And") through.
words = [
    token
    for review in df['text']
    for token in (word.lower() for word in word_tokenize(review))
    if token.isalpha() and token not in stop_words and len(token) > 2
]
word_counts = Counter(words)
common_words = word_counts.most_common(10)

# Sentiment-analysis pipeline built on the nlptown multilingual review model
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# The same checkpoint's tokenizer, used by the chunking / sentiment helpers below
tokenizer = AutoTokenizer.from_pretrained(
    "nlptown/bert-base-multilingual-uncased-sentiment"
)
def chunk_text(text, max_length=510, tok=None):
    """Split ``text`` into pieces of at most ``max_length`` tokens each.

    The text is encoded once WITHOUT truncation and WITHOUT special tokens, the
    token ids are sliced into consecutive windows, and each window is decoded
    back to a string. Encoding with ``truncation=True`` would cut the text to a
    single window, so everything past the first ``max_length`` tokens would be
    silently discarded.

    Args:
        text: The string to split.
        max_length: Maximum number of tokens per chunk. The default of 510
            leaves room for the two special tokens a 512-token model adds
            when the chunk is tokenized again.
        tok: Optional tokenizer-like object exposing ``encode`` and ``decode``.
            Defaults to the module-level ``tokenizer``.

    Returns:
        A non-empty list of strings. Text that produces no tokens (e.g. an
        empty string) is returned unchanged as the only chunk.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if tok is None:
        tok = tokenizer

    # No special tokens here: they would be decoded into the chunk text as
    # literal "[CLS]" / "[SEP]" and then added a second time by the pipeline.
    token_ids = tok.encode(text, add_special_tokens=False)
    if not token_ids:
        return [text]

    return [
        tok.decode(token_ids[start:start + max_length])
        for start in range(0, len(token_ids), max_length)
    ]

def apply_sentiment_analysis(text, positive_threshold=3.0, classifier=None, chunker=None):
    """Classify a review as ``'POSITIVE'`` or ``'NEGATIVE'``.

    The sentiment model loaded in this notebook predicts a star rating: each
    result's ``label`` looks like ``"1 star"`` ... ``"5 stars"`` and ``score``
    is the model's confidence in THAT label. The confidence therefore says
    nothing about polarity (a confident "1 star" has a high score), so the
    decision is based on the star value parsed from the label, averaged over
    all chunks of the review.

    Args:
        text: The review text.
        positive_threshold: Average star rating at or above which the review
            counts as positive. Defaults to 3.0, the midpoint of the 1-5 scale.
        classifier: Optional callable returning a list of
            ``{"label": ..., "score": ...}`` dicts for a string. Defaults to
            the module-level ``sentiment_analysis`` pipeline.
        chunker: Optional callable splitting a string into a list of strings.
            Defaults to the module-level ``chunk_text``.

    Returns:
        ``'POSITIVE'`` if the mean predicted star rating is at least
        ``positive_threshold``, otherwise ``'NEGATIVE'``.

    Raises:
        ValueError: If a predicted label does not start with an integer star
            count (i.e. the classifier is not a star-rating model).
    """
    if classifier is None:
        classifier = sentiment_analysis
    if chunker is None:
        chunker = chunk_text

    # Fall back to the raw text so the average below never divides by zero.
    chunks = chunker(text) or [text]

    star_ratings = []
    for chunk in chunks:
        # truncation=True is a safety net in case a decoded chunk re-tokenizes
        # to slightly more tokens than the model accepts.
        prediction = classifier(chunk, truncation=True)[0]
        label = str(prediction['label'])
        try:
            star_ratings.append(int(label.split()[0]))
        except (IndexError, ValueError):
            raise ValueError(
                f"Expected a star-rating label such as '4 stars', got {label!r}"
            ) from None

    avg_stars = sum(star_ratings) / len(star_ratings)
    return 'POSITIVE' if avg_stars >= positive_threshold else 'NEGATIVE'

# Label every review with its overall sentiment
df['sentiment'] = df['text'].map(apply_sentiment_analysis)


# Preprocess text for topic modeling
def preprocess(text):
    """Tokenize ``text`` with gensim's ``simple_preprocess`` and return the tokens
    that are longer than three characters and not in gensim's ``STOPWORDS``."""
    kept_tokens = []
    for token in gensim.utils.simple_preprocess(text):
        if len(token) <= 3 or token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Build the bag-of-words corpus from the preprocessed reviews
df['processed_text'] = df['text'].map(preprocess)
dictionary = gensim.corpora.Dictionary(df['processed_text'])
max_freq = 0.5      # drop tokens that appear in more than 50% of the reviews
min_wordcount = 5   # drop tokens that appear in fewer than 5 reviews
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)
bow_corpus = [dictionary.doc2bow(doc) for doc in df['processed_text']]

# Train the LDA model
num_topics = 10
# LDA starts from a random initialisation; without a fixed seed the topics (and the
# word clouds built from them) change on every run.
# NOTE(review): with workers > 1 small run-to-run differences may remain - confirm.
lda_random_state = 42
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=lda_random_state,
)

# Draw one word cloud per LDA topic, sized by each word's weight in the topic
for i in range(num_topics):
    topic = lda_model.show_topic(i)
    word_freq = dict(topic)
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
    ).generate_from_frequencies(word_freq)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Topic {i} Word Cloud')
    plt.show()

# Named Entity Recognition (NER) using transformers pipeline.
# aggregation_strategy="simple" is the supported replacement for the deprecated
# grouped_entities=True flag and groups word pieces into whole entities the same way.
# NOTE(review): no model is named, so the library's default NER checkpoint is used;
# consider pinning one for reproducibility.
ner = pipeline("ner", aggregation_strategy="simple", device=-1)
df['entities'] = df['text'].apply(ner)

# Plot most common entity groups
all_entities = [entity['entity_group'] for entities in df['entities'] for entity in entities]
entity_counts = Counter(all_entities)
common_entities = entity_counts.most_common(10)
if common_entities:
    plt.bar(*zip(*common_entities))
    plt.xlabel('Entity Group')
    plt.ylabel('Count')
    plt.title('Most Common Named Entity Groups')
    plt.show()
else:
    # zip(*[]) yields nothing, so plt.bar() would be called without arguments and raise.
    print('No named entities were found; skipping the entity-group plot.')

# Summarization pipeline (facebook/bart-large-cnn); device=-1 keeps it on the CPU
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=-1,
)

def summarize_review(review, min_words=10):
    """Return a generated summary of ``review``.

    The summary length bounds are derived from the review's word count:
    ``max_length`` is 80% of the word count clamped to [20, 512], and
    ``min_length`` is 40% of ``max_length`` clamped to [10, 128].

    Args:
        review: The review text. Non-string values (e.g. a missing value read
            from the CSV) are treated as an empty review.
        min_words: Reviews with fewer words than this are returned as-is
            (stripped) instead of being summarized. The default of 10 matches
            the smallest ``min_length`` the bounds above can produce:
            forcing the model to generate at least that many tokens from a
            shorter input can only pad or invent content.

    Returns:
        The summary text; the stripped review itself when it is shorter than
        ``min_words`` words; ``''`` for empty or non-string input.
    """
    if not isinstance(review, str):
        return ''

    word_count = len(review.split())
    if word_count < min_words:
        return review.strip()

    max_length = min(max(int(word_count * 0.8), 20), 512)
    min_length = min(max(int(max_length * 0.4), 10), 128)
    summary = summarizer(
        review,
        min_length=min_length,
        max_length=max_length,
        do_sample=False,
        truncation=True
    )
    return summary[0]['summary_text']

# Summarize every review
df['summary'] = df['text'].map(summarize_review)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Downloading (…)lve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
