In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Import data: Movie reviews

In [51]:
from datasets import load_dataset

# Download the IMDb dataset
imdb_dataset = load_dataset('imdb')

# Select 1,000 examples from each split (train and test)
data = pd.DataFrame(imdb_dataset['train'].shuffle(seed=42).select(range(20000)))

# replace target value
data['label'] = data['label'].apply(lambda x: 'pos' if x else 'neg')

data.head()

Unnamed: 0,text,label
0,There is no relation at all between Fortier an...,pos
1,This movie is a great. The plot is very true t...,pos
2,"George P. Cosmatos' ""Rambo: First Blood Part I...",neg
3,In the process of trying to establish the audi...,pos
4,"Yeh, I know -- you're quivering with excitemen...",neg


# Preprocessing: Clean the text data 

Preprocessing is a crucial step in NLP that involves cleaning and transforming raw text data into a format that is suitable for machine learning algorithms and other NLP tasks. It includes several essential steps:

1. **Tokenization**: Breaking down text into smaller units, such as words or subwords (tokens). Tokenization helps in understanding the structure of the text and is a fundamental step for many NLP tasks.

2. **Lowercasing**: Converting all text to lowercase to ensure uniformity and to avoid treating words with different cases as different entities.

3. **Removing Stopwords**: Eliminating common words (such as "the", "is", "and") that do not carry significant meaning and are unlikely to contribute to the analysis.

4. **Removing Punctuation**: Stripping text of punctuation marks, as they often don't provide valuable information for many NLP tasks.

5. **Normalization**: Standardizing text by applying techniques like lemmatization or stemming to reduce words to their base or root forms. This helps in treating different forms of words as the same token.

Preprocessing in NLP significantly impacts the quality and performance of downstream tasks. Effective preprocessing ensures that the data is clean, standardized, and ready for analysis or model training, ultimately enhancing the accuracy and reliability of NLP applications.


In [52]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Download necessary resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Lemmatizer and Stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /Users/azagar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/azagar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/azagar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [53]:
# Function to preprocess text
def preprocess_text(text):
    # Tokenize the text into words
    words = word_tokenize(text.lower())  # Convert text to lowercase

    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    words = [word.translate(table) for word in words if word.isalpha()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Stemming (uncomment if you want to use stemming)
    stemmed_words = [stemmer.stem(word) for word in words]

    # Join the words back into a string
    preprocessed_text = ' '.join(lemmatized_words)
    return preprocessed_text

In [54]:
# Apply preprocessing
data['clean_text'] = data['text'].apply(preprocess_text)

In [55]:
# Check preprocessed first instance
data['clean_text'][0]

'relation fortier profiler fact police series violent crime profiler look crispy fortier look classic profiler plot quite simple fortier plot far complicated fortier look like prime suspect spot similarity main character weak weirdo clairvoyance people like compare judge evaluate enjoying funny thing people writing fortier look american hand arguing prefer american series maybe language spirit think series english american way actor really good funny acting superficial'

In [56]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['clean_text', 'text']], data['label'], test_size=0.2, random_state=42)

# Feature extraction 1: Convert text data to numerical features (TF-IDF or word embeddings)

TF-IDF stands for Term Frequency-Inverse Document Frequency, a numerical statistic used in natural language processing (NLP) to evaluate the importance of a word in a document within a corpus.

Here's a breakdown of how TF-IDF works:

1. **Term Frequency (TF):** It measures how often a term (word) appears in a document. It's calculated by dividing the number of times a term appears in a document by the total number of terms in that document. The idea is that the more frequent a term is in a document, the more important it might be.

   $ \[ \text{TF}(t, d) = \frac{\text{Number of times term } t \text{ appears in document } d}{\text{Total number of terms in document } d} \] $

2. **Inverse Document Frequency (IDF):** This part of the formula measures the significance of a term across a collection of documents (corpus). It penalizes the words that appear too frequently across documents and gives more weight to terms that are rare in the corpus. It's calculated as the logarithm of the ratio between the total number of documents and the number of documents containing the term, then adding 1 to avoid division by zero.

   $ \[ \text{IDF}(t, D) = \log{\left(\frac{\text{Total number of documents in corpus } D}{\text{Number of documents containing term } t}\right)} + 1 \] $

3. **TF-IDF:** This is the product of TF and IDF. It gives a high weight to terms that are frequent in a specific document but relatively rare in the entire corpus. Terms that occur frequently across all documents get lower weights.

   $ \[ \text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D) \] $

Using TF-IDF, you can represent each document as a numerical vector where each dimension represents a term and its importance in that document. This technique is widely used in information retrieval, text mining, and search engine optimization, helping to determine the relevance of a document to a query or to analyze the significance of terms within documents.

In [85]:
# Model building: Choose and train a classifier
vectorizer = TfidfVectorizer()  # Use TF-IDF vectorizer for text to numerical feature conversion
X_train_vec = vectorizer.fit_transform(X_train['clean_text'])
X_test_vec = vectorizer.transform(X_test['clean_text'])

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train_vec, y_train)
logistic_predictions = logistic_model.predict(X_test_vec)
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
print("Logistic Regression Accuracy:", logistic_accuracy)

# Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train_vec, y_train)
rf_predictions = rf_model.predict(X_test_vec)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

Logistic Regression Accuracy: 0.88625
Random Forest Accuracy: 0.84825


# Feature extraction 2: Convert text data to numerical features (Word embeddings)

Word2Vec is a popular technique in natural language processing (NLP) used to represent words as numerical vectors in a continuous vector space. It's based on the idea that words with similar meanings often appear together in similar contexts and therefore should have vector representations that are close to each other in this space.

There are two main architectures for Word2Vec:

1. **Continuous Bag-of-Words (CBOW):** This model predicts the probability of a word given its context. It takes a context of surrounding words and tries to predict the target word. For instance, given the context words "the cat sat on the," it predicts the target word "mat."

2. **Skip-gram:** This model works the other way around; it predicts the context words given a target word. So, given the target word "cat," it tries to predict the context words "the," "sat," "on," etc.

Both models use a shallow neural network with a single hidden layer to learn the weights that represent the words as vectors. The hidden layer's weights become the word embeddings, which are the vector representations of the words.

Training Word2Vec involves presenting pairs of words (input and output) to the network, adjusting the weights using techniques like backpropagation, and optimizing the network to minimize the prediction error. The resulting word vectors capture semantic relationships between words.

The key benefit of Word2Vec is that it produces dense, low-dimensional representations for words, capturing semantic meaning and relationships between words. These embeddings can be used in various NLP tasks like sentiment analysis, machine translation, and information retrieval.

Word2Vec's ability to represent words as continuous vectors has significantly contributed to the development of more effective NLP models and applications.

![image info](w2v.png)


In [59]:
from gensim.models import Word2Vec

# Tokenized text 
tokenized_train_text = [text.split() for text in X_train['clean_text']]
tokenized_test_text = [text.split() for text in X_test['clean_text']]

# Train Word2Vec model
w2v_model = Word2Vec(tokenized_train_text, vector_size=100, window=5, min_count=1, workers=4, epochs=10)

In [60]:
# Get all words and their vectors in the Word2Vec model's vocabulary
all_words = w2v_model.wv.index_to_key
word_vectors = {word: w2v_model.wv[word] for word in all_words[:5]}

# Print the word vectors
for word, vector in word_vectors.items():
    print(f"Word: {word}")
    print(f"Vector: {vector}")
    print("\n")  # Add a newline for better readability

Word: br
Vector: [ 1.21909343e-01  5.92590533e-02  1.05182815e+00  1.29842138e+00
  9.34469849e-02 -1.47313035e+00  1.22275090e+00  1.03286338e+00
 -1.27847719e+00  1.06472168e-02 -3.60007375e-01 -2.31586918e-01
 -9.25996006e-01  5.66162705e-01 -5.55866957e-01 -2.34780979e+00
  1.94273129e-01  3.67202789e-01 -1.62185669e+00 -5.09611130e-01
 -1.36729562e+00 -8.81744847e-02  1.29608643e+00  1.76797962e+00
  1.77838728e-01  3.60358059e-02  5.04395068e-01  2.03236029e-01
 -8.18773031e-01 -5.46439052e-01 -7.96662629e-01  1.32134646e-01
 -5.75748324e-01 -1.00471580e+00 -9.18747663e-01 -2.15081394e-01
  8.93931568e-01 -1.41501272e+00 -9.02254105e-01 -2.08485484e+00
 -5.78641653e-01  1.46873367e+00 -9.34582591e-01 -1.15586734e+00
 -7.60571361e-01 -2.05677152e-01 -1.19755685e+00 -1.85966734e-02
  3.01922321e-01  1.05158448e+00  1.06699383e+00  6.40259385e-01
  6.67065382e-01 -1.49239637e-02  8.37262213e-01 -1.18750989e+00
 -4.47285026e-01  1.05528045e+00 -4.04927969e-01 -1.22597562e-02
 -4.0637

In [35]:
# Assuming 'w2v_model' is your trained Word2Vec model

# Find similar words to a specific word
similar_words = w2v_model.wv.most_similar('car', topn=10)

# 'word' is the word for which you want to find similar words, and 'topn' specifies the number of similar words to retrieve

# Print the similar words and their similarity scores
for word, similarity in similar_words:
    print(f"Similar word: {word}, Similarity: {similarity}")


Similar word: explosion, Similarity: 0.6980080604553223
Similar word: crash, Similarity: 0.6879973411560059
Similar word: chase, Similarity: 0.6702556014060974
Similar word: driving, Similarity: 0.6673227548599243
Similar word: fire, Similarity: 0.6669660806655884
Similar word: bike, Similarity: 0.6648144721984863
Similar word: plane, Similarity: 0.6571458578109741
Similar word: helicopter, Similarity: 0.641711950302124
Similar word: truck, Similarity: 0.6379563808441162
Similar word: wheel, Similarity: 0.6322101950645447


In [84]:
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

# Prepare data (Replace this part with your w2v_model and all_words)
# Here, 'word_vectors' and 'all_words' are assumed to be pre-defined
all_words = w2v_model.wv.index_to_key
word_vectors = {word: w2v_model.wv[word] for word in all_words}

# Print the word vectors
words = []
vecs = []
for word, vector in word_vectors.items():
    words.append(word)
    vecs.append(vector)
vecs = np.array(vecs)

# Number of clusters
num_clusters = 5  # Change this to the number of clusters you want

# Clustering using KMeans
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(vecs)

# Reduce dimensions for visualization (PCA)
pca = PCA(n_components=2)
word_vecs_2d = pca.fit_transform(vecs)

# Choose a subset of words to plot
num_words_to_plot = 200
word_subset = words[:num_words_to_plot]
word_vecs_subset = np.array([word_vectors[word] for word in word_subset])
word_vecs_2d_subset = pca.transform(word_vecs_subset)
cluster_labels_subset = kmeans.predict(word_vecs_subset)

# Create DataFrame for Plotly
data = {
    'x': word_vecs_2d_subset[:, 0],
    'y': word_vecs_2d_subset[:, 1],
    'word': word_subset,
    'cluster': cluster_labels_subset
}
df = pd.DataFrame(data)

# Plotly scatter plot with hover text
fig = px.scatter(df, x='x', y='y', color='cluster', hover_data={'word': True, 'x': False, 'y': False},
                 title='Word2Vec Clusters (Subset of Words)', labels={'x': '', 'y': ''},
                 width=800, height=600)
fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.update_layout(showlegend=True)

fig.show()





In [36]:
# Function to get average Word2Vec representation for a sentence
def get_average_w2v(tokens):
    vector_sum = 0
    count = 0
    for word in tokens:
        if word in w2v_model.wv:
            vector_sum += w2v_model.wv[word]
            count += 1
    if count != 0:
        return vector_sum / count
    else:
        return [0] * 100  # Return zero vector if no word found

# Add Word2Vec representations to DataFrame
X_train_w2v = [get_average_w2v(text) for text in tokenized_train_text]
X_test_w2v = [get_average_w2v(text) for text in tokenized_test_text]

In [37]:
# Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train_w2v, y_train)
logistic_predictions = logistic_model.predict(X_test_w2v)
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
print("Logistic Regression Accuracy:", logistic_accuracy)

# Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train_w2v, y_train)
rf_predictions = rf_model.predict(X_test_w2v)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.8535
Random Forest Accuracy: 0.82775


# Using pre-trained models in NLP

Pre-trained models are fundamental in the field of natural language processing (NLP) due to their ability to capture linguistic patterns and relationships from vast amounts of text data. They serve as a starting point for various NLP tasks and are incredibly useful for several reasons:

1. **Generalization**: Pre-trained models are trained on large and diverse text corpora, enabling them to learn generalized representations of language. This allows them to perform reasonably well on a wide range of downstream tasks without task-specific fine-tuning.

2. **Resource Efficiency**: Leveraging pre-trained models saves computational resources and time. Instead of training models from scratch, which requires substantial data and computing power, users can benefit from these pre-existing, well-trained models.

3. **Transfer Learning**: Pre-trained models facilitate transfer learning, where knowledge learned from one task can be transferred to another related task. By fine-tuning or adapting pre-trained models on specific datasets or tasks, their performance can be significantly improved with minimal additional training.

Fine-tuning refers to the process of taking a pre-trained model and further training it on a specific dataset or task to adapt its parameters to perform better in that particular context. Here's why fine-tuning is valuable:

- **Task-specific Adaptation**: Fine-tuning allows the model to adapt to nuances and specific patterns within a target dataset or task, enhancing its performance on that particular task.

- **Improved Performance**: By fine-tuning on domain-specific or task-specific data, the model can learn more task-specific features, leading to improved accuracy and effectiveness for the intended application.

- **Reduced Data Requirement**: Fine-tuning often requires less data than training a model from scratch. By starting with a pre-trained model, it can efficiently learn from a smaller, domain-specific dataset, making it beneficial in scenarios where limited annotated data is available.

Both pre-trained models and fine-tuning play critical roles in NLP, enabling practitioners to leverage existing knowledge and adapt it to new tasks, domains, or languages, ultimately improving the performance and efficiency of NLP applications.


In [38]:
from transformers import pipeline

# Load the sentiment analysis pipeline
sentiment_analysis = pipeline("sentiment-analysis")

# Example text for sentiment analysis
text = "I absolutely love this product! It's fantastic!"

# Perform sentiment analysis using the pipeline
result = sentiment_analysis(text)

# Output the sentiment and confidence score
print(f"Sentiment: {result[0]['label']}, Confidence: {result[0]['score']:.4f}")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d3dcaa8d-781d-49af-88cd-2484291353e6)')' thrown while requesting HEAD https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json


Sentiment: POSITIVE, Confidence: 0.9999


In [43]:
from tqdm import tqdm

trans_y_pred = []
y_test_reset = y_test.reset_index(drop=True)
n = 500

for test_text in tqdm(X_test['text'][:n]):
    result = sentiment_analysis(test_text[:1500])
    sentiment = result[0]['label']
    trans_y_pred.append('pos' if sentiment == 'POSITIVE' else 'neg')

trans_accuracy = accuracy_score(y_test_reset[:n], trans_y_pred)
print("Transformer Accuracy:", trans_accuracy)

100%|██████████| 500/500 [02:17<00:00,  3.64it/s]

Transformer Accuracy: 0.886





# Stanza & POS tagging
Stanza is an NLP library developed by the Stanford NLP Group. It's designed for a wide range of natural language processing tasks, including tokenization, part-of-speech tagging, named entity recognition, dependency parsing, and more. Stanza aims to provide efficient and accurate pre-trained models for various languages.

Key features of Stanza include:
- **Pre-Trained Models**: Stanza comes with pre-trained models for multiple languages, allowing users to perform various NLP tasks without training models from scratch.
- **Ease of Use**: It offers a simple and intuitive API for performing different NLP tasks, making it accessible for both beginners and experienced researchers.
- **Accuracy**: Stanza models are known for their high accuracy in different NLP tasks due to their robust training on extensive datasets.
- **Multiple Languages**: Stanza supports multiple languages, making it suitable for multilingual NLP applications.

Stanza provides state-of-the-art performance in various NLP tasks and continues to evolve with advancements in the field of natural language processing.

### Use Case: Text Analysis with Universal POS Tagging using Stanza

Stanza's Universal POS tagging can be highly beneficial in various text analysis tasks. Let's consider a scenario where you have a dataset of customer reviews for a product. By utilizing Stanza's Universal POS tagging, you can perform the following analysis:

1. **Extracting Key Features**: Identify the key features or attributes of the product mentioned in the reviews by analyzing nouns (NOUN) and adjectives (ADJ) tagged using Stanza. This helps in understanding what aspects of the product are being praised or criticized.

2. **Sentiment Analysis**: Analyze sentiments associated with specific parts of speech. For instance, adjectives (ADJ) often reflect sentiments or opinions. By associating adjectives with their corresponding nouns, you can determine the sentiment expressed towards various product features.

3. **Customer Feedback Categorization**: Categorize customer feedback into different categories based on the identified parts of speech. For instance, categorize reviews mentioning "customer service" (PROPN) separately to analyze the sentiment specifically related to that aspect.

4. **Comparative Analysis**: Compare the frequency and sentiment of different parts of speech across different products or time frames to identify trends and patterns in customer opinions.

By utilizing Stanza's Universal POS tagging, you can effectively extract meaningful insights from textual data, enabling better decision-making and improving products or services based on customer feedback.

### Universal POS Tags
- **ADJ**: Adjective
- **ADP**: Adposition
- **ADV**: Adverb
- **AUX**: Auxiliary
- **CCONJ**: Coordinating conjunction
- **DET**: Determiner
- **INTJ**: Interjection
- **NOUN**: Noun
- **NUM**: Numeral
- **PART**: Particle
- **PRON**: Pronoun
- **PROPN**: Proper noun
- **PUNCT**: Punctuation
- **SCONJ**: Subordinating conjunction
- **SYM**: Symbol
- **VERB**: Verb
- **X**: Other

In [62]:
import stanza

# Download English model (change 'en' to the appropriate language code if needed)
stanza.download('en')

# Initialize the English pipeline
nlp = stanza.Pipeline('en', processors='tokenize,pos')

# Sample customer review
sample_review = "The camera quality is amazing, but the battery life could be better."

# Process the review
doc = nlp(sample_review)

# Extract nouns and adjectives
nouns = []
adjectives = []

for sentence in doc.sentences:
    for word in sentence.words:
        if word.upos == 'NOUN':
            nouns.append(word.text)
        elif word.upos == 'ADJ':
            adjectives.append(word.text)

# Print extracted nouns and adjectives
print("Extracted Nouns:", nouns)
print("Extracted Adjectives:", adjectives)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 2.39MB/s]                    
2023-11-16 11:04:08 INFO: Downloading default packages for language: en (English) ...
2023-11-16 11:04:10 INFO: File exists: /Users/azagar/stanza_resources/en/default.zip
2023-11-16 11:04:14 INFO: Finished downloading models and saved to /Users/azagar/stanza_resources.
2023-11-16 11:04:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 2.39MB/s]                    
2023-11-16 11:04:15 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| pos       | combined_charlm |

2023-11-16 11:04:15 

Extracted Nouns: ['camera', 'quality', 'battery', 'life']
Extracted Adjectives: ['amazing', 'better']
