In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Valid.csv
/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv
/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv
/kaggle/input/sentiment-analysis-dataset-imdb-and-amazon/amazon_cells_labelled.txt
/kaggle/input/sentiment-analysis-dataset-imdb-and-amazon/imdb_labelled.txt
/kaggle/input/sentiment-analysis-dataset-imdb-and-amazon/yelp_labelled.txt


In [3]:
import spacy
from spacy import displacy

In [4]:
nlp = spacy.load('en_core_web_sm')
text = "Apple, This is first sentence. and Google this is another one. here 3rd one is"
doc = nlp(text)
doc

Apple, This is first sentence. and Google this is another one. here 3rd one is

In [5]:
for token in doc:
    print(token)

Apple
,
This
is
first
sentence
.
and
Google
this
is
another
one
.
here
3rd
one
is


In [6]:
# sent = nlp.create_pipe('sentencizer')
nlp.add_pipe("sentencizer", before="parser")
doc = nlp(text)
for sent in doc.sents:
    print(sent)

Apple, This is first sentence.
and Google this is another one.
here 3rd one is


In [7]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(stopwords)

['because', 'two', 'an', 'any', 'even', 'it', 'not', "'s", 'thereafter', 'ourselves', 'after', 'alone', 'my', 'anyone', 'full', 'while', 'hers', 'you', 'seem', 'get', 'have', 'all', 'namely', 'behind', 'can', 'yet', 'serious', 'by', '’ve', 'up', 'which', 'hence', 'himself', 'fifty', 'for', 'onto', 'became', 'myself', 'say', '‘s', 'through', 'to', 'again', 'with', 'various', 'wherever', 'back', 'regarding', "'d", 'less', 'hereupon', 'last', 'our', 'once', 'below', 'into', 'now', '‘m', 'call', 'where', 'per', 'side', 'do', 'they', 'used', 'across', 'else', 'could', 'such', 'from', 'of', 'whereupon', 'thus', 'against', 'nothing', 'along', 'eight', 'fifteen', 'without', 'enough', 'this', 'although', 'become', 'almost', 'afterwards', 'same', 'take', 'anything', 'thereby', 'since', '‘d', 'whence', 'are', 'much', 'us', 'whither', 'empty', 'seemed', 'third', 'next', 'them', 'before', 'these', 'those', 'still', 'eleven', 'his', "'ll", 'nowhere', 'move', 'only', 're', 'is', 'out', 'further', 'th

In [8]:
len(stopwords)

326

In [9]:
for token in doc:
    if token.is_stop == False:
        print(token)

Apple
,
sentence
.
Google
.
3rd


# lemmatization

In [10]:
doc = nlp('run runs running runner')
for lem in doc:
    print(lem.text, lem.lemma_)

run run
runs run
running run
runner runner


# Part of Speech (POS)
* DET means determiner
* AUX means auxiliary
* ADJ means adverb
* ADP means adposition
* NOUN as the name suggests means common noun
* PUNCT means punctuation

In [11]:
doc = nlp('All is well at your end!')
for token in doc:
    print(token.text, token.pos_)

All PRON
is AUX
well ADV
at ADP
your PRON
end NOUN
! PUNCT


In [12]:
## displacy visualizes dependencies and entities
displacy.render(doc, style = 'dep')

# Entity Detection
* GPE means Countries, cities, states.
* DATE means Absolute or relative dates or periods.
* CARDINAL means Numerals that do not fall under another type.
* PERSON means People, including fictional.
* NORP means Nationalities or religious or political groups.
* MONEY means Monetary values, including unit.
and many more ...

In [13]:
doc=nlp("Apple Inc. is planning to open a new store in New York City next month. The tech giant announced its ambitious plan during the latest press conference held in Cupertino. Tim Cook, the CEO of Apple, mentioned that the new store will feature the latest products and services offered by the company. The store's location will be in the heart of Manhattan, near Central Park. This move indicates Apple's continued expansion efforts in the retail sector. Analysts speculate that the new store will attract a significant number of customers, further solidifying Apple's presence in the city. Additionally, the company aims to provide a unique shopping experience for consumers, incorporating innovative design elements into the store layout. Overall, Apple's decision to open a flagship store in New York City underscores its commitment to delivering exceptional products and services to its customers.")


In [14]:
doc

Apple Inc. is planning to open a new store in New York City next month. The tech giant announced its ambitious plan during the latest press conference held in Cupertino. Tim Cook, the CEO of Apple, mentioned that the new store will feature the latest products and services offered by the company. The store's location will be in the heart of Manhattan, near Central Park. This move indicates Apple's continued expansion efforts in the retail sector. Analysts speculate that the new store will attract a significant number of customers, further solidifying Apple's presence in the city. Additionally, the company aims to provide a unique shopping experience for consumers, incorporating innovative design elements into the store layout. Overall, Apple's decision to open a flagship store in New York City underscores its commitment to delivering exceptional products and services to its customers.

In [15]:
displacy.render(doc, style = 'ent')

# Text Classification

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [17]:
data_yelp = pd.read_csv('/kaggle/input/sentiment-analysis-dataset-imdb-and-amazon/yelp_labelled.txt', sep='\t', header = None)
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [18]:
columns_name = ['Review', 'Sentiment']
data_yelp.columns = columns_name
data_yelp.head()


Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [19]:
data_yelp.shape

(1000, 2)

In [20]:
data_amazon = pd.read_csv('/kaggle/input/sentiment-analysis-dataset-imdb-and-amazon/amazon_cells_labelled.txt', sep = '\t', header = None)
data_amazon.columns = columns_name
data_amazon.head()


Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [21]:
data_amazon.shape

(1000, 2)

In [22]:
data_imdb = pd.read_csv('/kaggle/input/sentiment-analysis-dataset-imdb-and-amazon/imdb_labelled.txt', sep = '\t', header = None)
data_imdb.columns = columns_name
data_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [23]:
data_imdb.shape

(748, 2)

In [24]:
data_imdb_2 = pd.read_csv('/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv')
data_imdb_2=data_imdb_2.iloc[:10000]
data_imdb_2.columns = columns_name
data_imdb_2.head()
data_imdb_2.shape

(10000, 2)

In [25]:
data_imdb_2['Sentiment'].value_counts()

Sentiment
0    5043
1    4957
Name: count, dtype: int64

In [26]:
data = pd.concat([data_yelp, data_amazon, data_imdb,data_imdb_2], ignore_index=True)
data.shape

(12748, 2)

In [27]:
data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [28]:
data['Sentiment'].value_counts()

Sentiment
0    6405
1    6343
Name: count, dtype: int64

In [29]:
data.isnull().sum() ### checking null value

Review       0
Sentiment    0
dtype: int64

# Cleaning - Tokenization and Lemmatization

In [30]:
import string
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

text_data_cleaning("     ! ! ! my name is Ujjwal BAranwal and i like machine Learning")

['ujjwal', 'baranwal', 'like', 'machine', 'learning']

# Vectorization Feature Engineering (TF-IDF)

In [32]:
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)
classifier = LinearSVC()

In [33]:
X = data['Review']
y = data['Sentiment']

In [34]:
# splliting dataset into 80% train data 20% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((10198,), (2550,))

In [35]:
X_train

8794     I've seen this movie at least fifty times and ...
8449     I knew I was going to see a low budget movie, ...
765                                      Tasted like dirt.
8490     I first saw this movie as a teenager when it c...
10365    The word honor should be erased from the vocab...
                               ...                        
11964    The greatest sin in life is being dull, and th...
5191     Chris Gerolmo took care not to simply give us ...
5390     One year after 'Love Thy Neighbour' made its I...
860      This place is pretty good, nice little vibe in...
7270     In my years of attending film festivals, I hav...
Name: Review, Length: 10198, dtype: object

In [36]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])
clf.fit(X_train, y_train)



In [37]:
# predict the label for x_test
y_pred = clf.predict(X_test)
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8466666666666667


In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.84      0.84      1254
           1       0.85      0.85      0.85      1296

    accuracy                           0.85      2550
   macro avg       0.85      0.85      0.85      2550
weighted avg       0.85      0.85      0.85      2550



**confusion_matrix()** computes confusion matrix to evaluate the accuracy of a classification. By definition a confusion matrix C is such that C(i,j) is equal to the number of observations known to be in group i and predicted to be in group j. Thus in binary classification, the count of true negatives is C(0,0), false negatives is C(1,0), true positives is C(1,1) and false positives is C(0,1).

In [39]:
confusion_matrix(y_test, y_pred)

array([[1053,  201],
       [ 190, 1106]])

# Testing our Model

In [40]:
clf.predict(['Wow, this is amazing lesson'])

array([1])

In [41]:
clf.predict(['Wow,this suck'])

array([0])

In [42]:
clf.predict(['Ujjwal is a good boy'])

array([1])

In [43]:
clf.predict(["i love dancing"])

array([1])

In [44]:
clf.predict(['i hate to do gibbersih task'])

array([0])

In [45]:
clf.predict(['i hate the person who hate the wolf'])

array([0])

In [46]:
clf.predict(['i am a good person'])

array([1])

In [47]:
clf.predict(['i like this movie'])

array([1])

In [48]:
clf.predict(['not good as much as previous'])

array([1])

# making pickle file of clf and tfidf 

In [49]:
import pickle
pickle.dump(clf,open('clf.pkl','wb'))
pickle.dump(tfidf,open('tfidf.pkl','wb'))