In [1]:
import spacy
#Downloading and importing the baseline model required
import subprocess
#%%
print(subprocess.getoutput("python -m spacy download en_core_web_lg"))
nlp = spacy.load('en_core_web_lg')

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.2.0
You should consider upgrading via the '/opt/python/envs/default/bin/python -m pip install --upgrade pip' command.
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
# import important modules
import numpy as np
import pandas as pd

# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB # classifier 

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    plot_confusion_matrix,
)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# text preprocessing modules
from string import punctuation 

# text preprocessing modules
from nltk.tokenize import word_tokenize

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression

# Download dependency
for dependency in (
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
    "stopwords",
    "omw-1.4"
):
    nltk.download(dependency)
    
import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

[nltk_data] Downloading package brown to /home/datalore/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package names to /home/datalore/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package wordnet to /home/datalore/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/datalore/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/datalore/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/datalore/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /home/datalore/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [3]:
# load data
data = pd.read_csv("/data/notebook_files/labeledTrainData.tsv", sep='\t')

In [4]:
# show top five rows of data
data.head() 

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# check the shape of the data
data.shape

(25000, 3)

In [6]:
# evalute news sentiment distribution
data['sentiment'].value_counts()

In [17]:
stop_words =  stopwords.words('english')

def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
    
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
    doc = nlp(text) 
    # Remove punctuation from text
    text = [token.text for token in doc if token.is_alpha is False]
    text = " ".join(text)
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = [token.text for token in doc if token.is_stop is False]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if lemmatize_words: 
        lemmatized_words = [token.lemma_ for token in doc]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [8]:
#clean the review
data["cleaned_review"] = data["review"].apply(text_cleaning)

In [9]:
#split features and target from  data 
X = data["cleaned_review"]
y = data.sentiment.values

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [11]:
# Create a classifier in pipeline
sentiment_classifier = Pipeline(steps=[
                                 ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('naive_bayes',MultinomialNB())
                                 ])

In [12]:
# train the sentiment classifier 

sentiment_classifier.fit(X_train,y_train)

Pipeline(steps=[('pre_processing', TfidfVectorizer(lowercase=False)),
                ('naive_bayes', MultinomialNB())])

In [13]:
# test model performance on valid data 
y_preds = sentiment_classifier.predict(X_valid)

In [14]:
accuracy_score(y_valid,y_preds)

0.8626666666666667

In [15]:
#save model 
import joblib 

joblib.dump(sentiment_classifier, '/data/notebook_files/sentiment_model_pipeline.pkl')

['/data/notebook_files/sentiment_model_pipeline.pkl']