# Vectorizer Tuning

In [2]:
import pandas as pd

data = pd.read_pickle("reviews_3")

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [3]:
import string
def remove_punct(text):
    text = "".join([word for word in text if word not in string.punctuation])
    return text.lower()

data['clean_text'] = data['reviews'].apply(lambda x: remove_punct(x))
data['clean_text']

0       plot  two teen couples go to a church party  d...
1       the happy bastards quick movie review \ndamn t...
2       it is movies like these that make a jaded movi...
3         quest for camelot  is warner bros   first fe...
4       synopsis  a mentally unstable man undergoing p...
                              ...                        
1995    wow  what a movie  \nits everything a movie ca...
1996    richard gere can be a commanding actor  but he...
1997    glorystarring matthew broderick  denzel washin...
1998    steven spielbergs second epic film on world wa...
1999    truman   trueman   burbank is the perfect name...
Name: clean_text, Length: 2000, dtype: object

## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create Pipeline

pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Set parameters to search (model and vectorizer)

param_grid = {
    'vectorizer__ngram_range': [(1, 1), (2, 2), (3, 3)],
    'classifier__alpha': [0.1, 0.5, 1.0]
}

# Perform grid search on pipeline
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=1, scoring= "accuracy", refit=True)
grid.fit(data['clean_text'], data['target'])

# Print best parameters and score
print("Best parameters: ", grid.best_params_)
print("Best score: ", grid.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters:  {'classifier__alpha': 0.1, 'vectorizer__ngram_range': (2, 2)}
Best score:  0.8400000000000001


In [7]:
import requests
import json

# Azure Text Analytics API endpoint and key
endpoint = "https://azure-ml-ai900-william-31012023.cognitiveservices.azure.com/"
key = "a82304f95ac14b63b28ed755ca46acb9"

# Texts to extract topics from
texts = data['reviews']
new_text = ['new text']

# Call the Text Analytics API
def extract_topics_azure(texts, endpoint, key):
    topics = []
    documents = {"documents": [{"id": i, "text": text} for i, text in enumerate(texts)]}
    headers = {
        'Ocp-Apim-Subscription-Key': key,
        'Content-Type': 'application/json'
    }
    response = requests.post(endpoint, headers=headers, json=documents)
    if response.status_code == 200:
        response_json = response.json()
        for document in response_json['documents']:
            topics.append([topic['topic'][:10] for topic in document['topics']])
    return topics

# Extract topics using Azure Cognitive Services
azure_topics = extract_topics_azure(texts + new_text, endpoint, key)

# Print extracted topics using Azure Cognitive Services
print("Azure topics: ", azure_topics)

# Extract topics using Latent Dirichlet Allocation (LDA)
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Transform texts to a Bag-of-Words representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

# Train an LDA model
lda = LatentDirichletAllocation(n_components=3, learning_method='online', random_state=0)
lda.fit(X)

# Vectorize the new text
new_text_vectorized = vectorizer.transform(new_text)

# Predict the topic of the new text
topic_distribution = lda.transform(new_text_vectorized)
print("LDA topic distribution: ", topic_distribution)
lda_topic = topic_distribution.argmax()
print("LDA topic: ", lda_topic)

# Compare the results of the two models
print("Azure vs LDA: ", azure_topics[0], lda_topic)

Azure topics:  []
LDA topic distribution:  [[0.29022329 0.59743036 0.11234635]]
LDA topic:  1


IndexError: list index out of range

⚠️ Please push the exercise once you are done 🙃

## 🏁 