# Spacy

In this part, we will see how we can use spacy to perform sentiment analysis.

I tried several ways of using spaCy and compared the resulting accuracy values, since there are not many hyperparameters to tune here.

First, we will see how spaCy can be used for visualizations.

In [None]:
#Import Modules
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# CSV files referenced by the read_csv calls in this notebook can be opened.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the reviews CSV on the mounted Drive; reused by the pd.read_csv(path) cells.
path="/content/drive/MyDrive/519 Project/Code for Submission/Copy of Reviews(1).csv"

If an EOF error occurs, you can pass the arguments that are commented out in the `read_csv` call below. I noticed EOF errors when the drive was not mounted; mounting the drive and reading from that path helps.

In [None]:
# Load the reviews CSV. The trailing comment keeps the optional, more lenient
# parser arguments that can be re-enabled if an EOF error is raised.
food_reviews_df=pd.read_csv(path)#,quoting=3, error_bad_lines=False)
# Display (rows, columns) as the cell output.
food_reviews_df.shape

In [None]:
# Keep only the review text and its score, dropping rows where either is missing.
food_reviews_df = food_reviews_df[['Text','Score']].dropna()

In [None]:
# Small English pipeline, used for the visualisation examples in this part.
spacy_tok = spacy.load('en_core_web_sm')
# Running example: the review stored under index label 54.
sample_review = food_reviews_df['Text'].loc[54]
sample_review

In [None]:
# Run the spaCy pipeline on the sample review; the resulting object is what
# the token table, sentence list and displaCy cells iterate over and render.
parsed_review = spacy_tok(sample_review)
parsed_review

In [None]:
# Download the explacy helper module into the working directory so it can be imported.
# NOTE(review): this fetches unpinned code from the master branch — consider pinning a commit.
!wget https://raw.githubusercontent.com/tylerneylon/explacy/master/explacy.py

In [None]:
import explacy
# Print explacy's parse information for a short example sentence.
explacy.print_parse_info(spacy_tok, 'The salad was surprisingly tasty.')

In [None]:
# Same parse printout, this time for the review stored under index label 0.
explacy.print_parse_info(spacy_tok,food_reviews_df.Text[0])

In [None]:
# One record per token of the parsed sample review, built in a single
# DataFrame construction instead of growing the frame cell by cell.
# The earlier version had a stray trailing comma after token.lemma_, which
# assigned the 1-tuple (lemma,) rather than the lemma string itself.
token_columns = [
    'text', 'lemma', 'pos', 'tag', 'dep', 'shape',
    'is_alpha', 'is_stop', 'is_punctuation',
]
token_records = [
    {
        'text': token.text,
        'lemma': token.lemma_,
        'pos': token.pos_,
        'tag': token.tag_,
        'dep': token.dep_,
        'shape': token.shape_,
        'is_alpha': token.is_alpha,
        'is_stop': token.is_stop,
        'is_punctuation': token.is_punct,
    }
    for token in parsed_review
]
# Passing columns explicitly keeps the layout stable even for an empty parse.
tokenized_text = pd.DataFrame(token_records, columns=token_columns)

tokenized_text[:20]

In [None]:
# Render the sample review with displaCy's entity ('ent') style inline in the notebook.
displacy.render(parsed_review, style='ent', jupyter=True)

In [None]:
# Materialise the sentences of the parsed review into a list and display it.
sentence_spans = list(parsed_review.sents)
sentence_spans

In [None]:
# Render the review with displaCy's dependency ('dep') style; 'distance' is passed through as a display option.
displacy.render(parsed_review, style='dep', jupyter=True,options={'distance': 140})

In [None]:
# Display options for a second dependency rendering of the same review.
options = {
    'compact': True,
    'bg': 'violet',
    'distance': 140,
    'color': 'white',
    'font': 'Trebuchet MS',
}
displacy.render(parsed_review, style='dep', jupyter=True, options=options)

In [None]:
# One record per noun chunk of the parsed sample review, built in a single
# DataFrame construction.  The earlier version had stray trailing commas after
# chunk.root and chunk.root.text, which assigned 1-tuples instead of the values.
chunk_columns = ['text', 'root', 'root.text', 'root.dep_', 'root.head.text']
chunk_records = [
    {
        'text': chunk.text,
        'root': chunk.root,
        'root.text': chunk.root.text,
        'root.dep_': chunk.root.dep_,
        'root.head.text': chunk.root.head.text,
    }
    for chunk in parsed_review.noun_chunks
]
# Passing columns explicitly keeps the layout stable even when no chunks are found.
noun_chunks_df = pd.DataFrame(chunk_records, columns=chunk_columns)

noun_chunks_df[:20]

Now, we will look at creating various types of models for spacy.

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

# Read from the shared `path` defined near the top of the notebook, as every
# other data-loading cell does, instead of a second hard-coded Drive location.
df = pd.read_csv(path)  # ,quoting=3, error_bad_lines=False)

# Balanced subset: the first 200 reviews of each score, stacked from 5 down to 1.
df_5, df_4, df_3, df_2, df_1 = (
    df[df["Score"] == score][:200] for score in (5, 4, 3, 2, 1)
)
df = pd.concat([df_5, df_4, df_3, df_2, df_1], axis=0)

# Shuffle the rows; the seed is fixed so that re-running the notebook
# reproduces the same order (matching the seeded train_test_split calls).
df = df.sample(frac=1, random_state=42)

In [None]:
# Download the large English model that the spacy.load("en_core_web_lg") calls rely on.
!python -m spacy download en_core_web_lg


I have written the pre-processing function for the texts

In [None]:
nlp = spacy.load("en_core_web_lg")


def preprocess_text(text):
    """Lowercase *text*, parse it with ``nlp`` and return the lemmas of all
    non-stop-word tokens joined by single spaces."""
    parsed = nlp(text.lower())
    kept_lemmas = (tok.lemma_ for tok in parsed if not tok.is_stop)
    return ' '.join(kept_lemmas)


# Store the cleaned version of every review next to the raw text.
df['clean_text'] = df['Text'].apply(preprocess_text)

# Show raw and cleaned text side by side for the first five rows.
df[["Text", "clean_text"]].head()

In [None]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 
from sklearn.base import TransformerMixin 
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline 
import numpy as np

In [None]:
def clean_text(text):
    """Delegate to preprocess_text; kept as a separate name for the transformer below."""
    return preprocess_text(text)


# define custom transformers class
class Transformers(TransformerMixin):
    """Pipeline step that applies clean_text to every document it is given."""

    def fit(self, X, y=None, **fit_params):
        """No fitting is performed; return the instance unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with clean_text applied to each element of X, in order."""
        cleaned = []
        for text in X:
            cleaned.append(clean_text(text))
        return cleaned

    def get_params(self, deep=True):
        """This step exposes no parameters."""
        return {}

In [None]:
# split dataset
# df['clean_text'] already holds preprocess_text applied to df['Text'], and
# clean_text is only a wrapper around preprocess_text, so reuse that column
# instead of running spaCy over the whole corpus a second time.
# NOTE(review): the pipeline's 'cleaner' step applies clean_text again to
# these already-cleaned strings — confirm that double cleaning is intended.
X = df['clean_text']
# Binary target: scores of 4 and 5 become 1, everything else 0.
y = df['Score'].apply(lambda score: 1 if score >= 4 else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


Here, I have created a spaCy tokenizer function to produce the features: it runs the nlp object on the text and extracts the tokens from the resulting doc object.

In [None]:
# define vectorizer and classifier
def spacy_tokenizer(text):
    """Return the token strings spaCy produces for *text*.

    An earlier variant passed ``lambda text: nlp(text)`` as the tokenizer;
    this function returns the tokens' text instead of the parsed object.
    """
    token_texts = []
    for token in nlp(text):
        token_texts.append(token.text)
    return token_texts


vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))
classifier = LinearSVC()

# cleaner -> bag-of-words counts -> linear SVM
pipe = Pipeline([
    ('cleaner', Transformers()),
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])

# train model
pipe.fit(X_train, y_train)

# keep a handle on the pipeline's vectorizer step
vectorizer = pipe.named_steps['vectorizer']

# evaluate model on the held-out split
print("Accuracy:", pipe.score(X_test, y_test))


#Accuracy=0.676 (Before)
#Accuracy=0.736 (After)

Next Part

In [None]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score

df = pd.read_csv(path)  # ,quoting=3, error_bad_lines=False)

# First 200 reviews of each score, stacked from 5 down to 1.
df_5, df_4, df_3, df_2, df_1 = (
    df[df["Score"] == score][:200] for score in (5, 4, 3, 2, 1)
)
df = pd.concat([df_5, df_4, df_3, df_2, df_1], axis=0)

# print(df.head(5))

nlp = spacy.load("en_core_web_lg")


def preprocess_text(text):
    """Return the space-joined lemmas of the non-stop-word tokens of lowercased *text*."""
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if not tok.is_stop:
            kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)


df['clean_text'] = df['Text'].apply(preprocess_text)







Here, I tried creating a spaCy vectorizer class, which I use to build a feature space from the text vectors that the nlp object produces for each text.

In [None]:
from sklearn.base import BaseEstimator
from spacy.pipeline import Sentencizer

# Custom transformer that maps each input text to the vector of its spaCy doc.
class SpacyVectorizer(TransformerMixin):
    """Pipeline step returning ``nlp(text).vector`` for every text in X."""

    def fit(self, X, y=None, **fit_params):
        """No fitting is performed; return the instance unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with one doc vector per element of X, in order."""
        doc_vectors = []
        for text in X:
            doc_vectors.append(nlp(text).vector)
        return doc_vectors

    def get_params(self, deep=True):
        """This step exposes no parameters."""
        return {}




# Features are the cleaned review strings; the target is 1 for scores >= 4, else 0.
X = df['clean_text']
y = df['Score'].apply(lambda score: 1 if score >= 4 else 0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Doc vectors -> linear SVM.
vectorizer = SpacyVectorizer()
classifier = LinearSVC()
pipe = Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])

pipe.fit(X_train, y_train)

# Report accuracy on the held-out split.
held_out_accuracy = pipe.score(X_test, y_test)
print("Accuracy:", held_out_accuracy)


#Accuracy=0.328(Before)
#0.704 (After)

Here, I created a Spacy embeddings class to try to create a feature space of embeddings

In [None]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score
import numpy as np

df = pd.read_csv(path)  # ,quoting=3, error_bad_lines=False)

# First 200 reviews of each score, stacked from 5 down to 1.
df_5, df_4, df_3, df_2, df_1 = (
    df[df["Score"] == score][:200] for score in (5, 4, 3, 2, 1)
)
df = pd.concat([df_5, df_4, df_3, df_2, df_1], axis=0)

# print(df.head(5))

nlp = spacy.load("en_core_web_lg")


def preprocess_text(text):
    """Return the lowercased lemmas of *text*, skipping stop words and
    punctuation tokens, joined by single spaces."""
    return " ".join(
        tok.lemma_.lower()
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    )


df["Clean_Text"] = df["Text"].apply(preprocess_text)
# Binary target: scores of 4 and 5 become 1, everything else 0.
df["Score"] = df["Score"].apply(lambda score: 1 if score >= 4 else 0)


# Custom transformer that turns each text into the vector of its spaCy doc.
class SpacyEmbeddings(TransformerMixin):
    """Pipeline step producing a NumPy array of doc vectors."""

    def __init__(self, nlp):
        # Keep the pipeline on the instance and record the vector length,
        # measured on the doc vector of the probe word "apple".
        self.nlp = nlp
        probe_vector = nlp("apple").vector
        self.dim = len(probe_vector)

    def fit(self, X, y=None):
        """No fitting is performed; return the instance unchanged."""
        return self

    def transform(self, X, y=None):
        """Return an array with one ``self.nlp(text).vector`` entry per text in X."""
        doc_vectors = [self.nlp(text).vector for text in X]
        return np.array(doc_vectors)

# Hold out 20% of the cleaned reviews for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    df["Clean_Text"],
    df["Score"],
    test_size=0.2,
    random_state=42,
)

# Doc-vector embeddings -> linear SVM.
pipeline = Pipeline([
    ("embeddings", SpacyEmbeddings(nlp)),
    ("classifier", LinearSVC()),
])

# Train, then predict on the held-out reviews.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report accuracy to three decimals.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.3f}".format(accuracy))


#Accuracy=0.330(Before)
#0.685(After)

Here, I am trying to use n-grams, more specifically unigrams and bigrams in my feature space

In [None]:
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Load the language model
nlp = spacy.load("en_core_web_lg")

X_train, X_test, y_train, y_test = train_test_split(
    df["Clean_Text"], df["Score"], test_size=0.2, random_state=42
)


def spacy_tokenizer(text):
    """Return the lemmas of *text*'s tokens, skipping stop words and punctuation."""
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return lemmas


# Unigram + bigram counts -> TF-IDF weighting -> linear SVM.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

# Train, then predict on the held-out reviews.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))



#Accuracy=0.46(Before)
#0.78 (After)



n-grams attempt 2:

Another attempt at n-grams



In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the data and keep the first 4000 reviews of each score.
df = pd.read_csv(path)  # ,quoting=3, error_bad_lines=False)
df_5, df_4, df_3, df_2, df_1 = (
    df[df["Score"] == score][:4000] for score in (5, 4, 3, 2, 1)
)

# Stack the per-score subsets from 5 down to 1.
df = pd.concat([df_5, df_4, df_3, df_2, df_1], axis=0)

# Define a function to preprocess the text using Spacy and generate n-grams
nlp = spacy.load('en_core_web_lg')


def preprocess(text, n=3):
    """Return *text* as a space-joined sequence of overlapping lemma n-grams.

    The text is parsed with ``nlp``; stop words and non-alphabetic tokens are
    dropped and the remaining lemmas are lowercased.  Every window of ``n``
    consecutive lemmas is joined with spaces, and the windows are in turn
    joined with spaces.

    Args:
        text: Raw review text.
        n: Window size; defaults to 3, the value previously hard-coded.

    Returns:
        The joined n-grams.  When fewer than ``n`` lemmas survive filtering,
        the lemmas themselves are returned, so short reviews are no longer
        reduced to an empty string.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    doc = nlp(text)
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and token.is_alpha
    ]
    if len(tokens) < n:
        # Not enough tokens for a single window: keep what is there.
        return ' '.join(tokens)
    ngrams = [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return ' '.join(ngrams)

# Replace the raw text with its preprocessed form and binarise the score
# (4 and 5 become 1, everything else 0).
df['Text'] = df['Text'].apply(preprocess)
df["Score"] = df["Score"].apply(lambda score: 1 if score >= 4 else 0)

# Hold out 20% of the reviews for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'], df['Score'], test_size=0.2, random_state=42
)

# TF-IDF features -> multinomial naive Bayes
# (alternative classifier noted by the author: LinearSVC()).
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

# Hyperparameter grid explored by the search.
parameters = {
    'vectorizer__max_features': [1000, 5000, 10000],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'classifier__alpha': [1, 0.1, 0.01, 0.001],
}

# 5-fold cross-validated grid search using all available cores.
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Predict on the held-out reviews.
y_pred = grid_search.predict(X_test)

# Per-class metrics followed by the confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
#Accuracy=0.44(Before)
#Accuracy=0.80 (After)

Trying to use TfidfVectorizer in a different way

In [None]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

def _tfidf_tokenizer(text):
    """Return the token strings spaCy produces for *text*.

    The vectorizer's ``tokenizer`` hook is expected to return string tokens.
    The earlier ``lambda text: nlp(text)`` returned the parsed spaCy object,
    whose elements are token objects rather than strings; the CountVectorizer
    cell in this notebook already uses a string-returning tokenizer for the
    same reason.
    """
    return [token.text for token in nlp(text)]


vectorizer = TfidfVectorizer(tokenizer=_tfidf_tokenizer, ngram_range=(1, 1))
classifier = LinearSVC()

# cleaner -> TF-IDF over spaCy tokens -> linear SVM
pipe = Pipeline([
    ('cleaner', Transformers()),
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

pipe.fit(X_train, y_train)
print("Accuracy:", pipe.score(X_test, y_test))



#Accuracy=0.604 (measured with the earlier tokenizer that returned the parsed object; re-run to refresh)

# Simple Spacy Implementation

I have shown a very simple and straightforward spacy implementation that is easy to understand, which yielded the best accuracy.

The above was true at first before I tweaked my other attempt at n-grams a bit. I made similar changes to all my previous models and I have marked the "before" and "after" accuracies accordingly. 

Now, my best performing model is the one where I used n-grams with an accuracy of 0.79

In [None]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
df = pd.read_csv(path)  # ,quoting=3, error_bad_lines=False)

# First 200 reviews of each score, stacked from 5 down to 1.
df_5, df_4, df_3, df_2, df_1 = (
    df[df["Score"] == score][:200] for score in (5, 4, 3, 2, 1)
)
df = pd.concat([df_5, df_4, df_3, df_2, df_1], axis=0)

In [None]:
# Load the large English model; its nlp object is used below both for
# preprocessing and for producing the document vectors.
nlp = spacy.load("en_core_web_lg")

Preprocessing using the nlp object we created

In [None]:
def preprocess_text(text):
    """Lowercase *text*, parse it with ``nlp`` and return the lemmas of all
    non-stop-word tokens joined by single spaces."""
    parsed = nlp(text.lower())
    kept_lemmas = []
    for tok in parsed:
        if tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)


# Store the cleaned version of every review next to the raw text.
df['clean_text'] = df['Text'].apply(preprocess_text)

# Show raw and cleaned text side by side for the first five rows.
df[["Text", "clean_text"]].head()

We obtain the vectors using the nlp object as we split the dataset

In [None]:
# Build the feature matrix and labels, then split.
# Binary target: scores of 4 and 5 become 1, everything else 0.
# (A three-way variant that leaves a score of 3 unchanged was also tried.)
y = df['Score'].apply(lambda score: 1 if score >= 4 else 0)

# Stream the cleaned texts through the pipeline and stack one vector per text.
document = nlp.pipe(df['clean_text'])
text_vector = np.array([parsed.vector for parsed in document])
X = text_vector

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


We import and run the logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Re-split with a 25% hold-out; this replaces the split made in the previous cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
model = LogisticRegression(C=10, max_iter=1000)
# Shape check left over from exploration; it is not the cell's last
# expression, so nothing is displayed.
X_train.shape, y_train.shape
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out vectors, then print the model's score on the same split.
y_pred=model.predict(X_test)
print(model.score(X_test,y_test))

#Accuracy=0.76 