In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

In [2]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

In [3]:
# Load the labelled sentences from data.csv (path is relative to the notebook's
# working directory) and preview the first five rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, which suggests the CSV
# was saved with its index; confirm whether it should be read as the index.
data=pd.read_csv("data.csv")
data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


## Stemmer

In [4]:
# Fetch NLTK's stop-word lists; they back the stopwords.words('english') calls
# used when cleaning the sentences.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Fetch the WordNet corpus, which WordNetLemmatizer uses to look up lemmas.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:

# Fetch the 'omw-1.4' package (Open Multilingual Wordnet).
# NOTE(review): presumably required alongside 'wordnet' by this NLTK version's
# lemmatizer - confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
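# Each sentence is normalised twice: first with a stemmer, then with a lemmatizer.
# The cleaned text is turned into bag-of-words count vectors with CountVectorizer.
# For each variant we train two classifiers (Multinomial Naive Bayes and Random Forest) and compare their scores.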

In [8]:
# Porter stemmer used to reduce each word to its stem.
ps = PorterStemmer()
# Holds one cleaned, space-joined string per row of `data`; populated in the next cell.
corpus = []

In [9]:
# Build the stemmed corpus: keep letters only, lower-case, drop English stop
# words and reduce every remaining word to its Porter stem.

# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension rebuilds the list for
# every single word and makes each membership test a linear scan.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list here so re-running this cell never appends a second
# copy of every sentence to an already-filled corpus.
corpus = []

# Iterate over the values directly instead of data['Sentence'][i]; the latter is
# a label lookup and only works while the frame has a default 0..n-1 index.
for sentence in data['Sentence']:
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [10]:
# Creating the Bag of Words model
# CountVectorizer turns each cleaned sentence into a row of token counts;
# max_features=2500 limits the vocabulary to the 2500 most frequent terms.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split, so test sentences influence which terms are kept.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
# .toarray() materialises the sparse count matrix as a dense NumPy array.
X = cv.fit_transform(corpus).toarray()

In [11]:
# Encode the sentiment labels as integers for the classifiers.
datamapping = {"negative": 0, "neutral": 1, "positive": 2}
y = data['Sentiment'].map(datamapping)

# Series.map yields NaN for any label that is not a key of the mapping
# (different casing, stray whitespace, missing value). Fail here with the
# offending labels instead of letting NaN targets reach the model.
if y.isna().any():
    unknown_labels = sorted(set(data.loc[y.isna(), 'Sentiment'].astype(str)))
    raise ValueError(f"Sentiment labels missing from datamapping: {unknown_labels}")

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [13]:
# Multinomial Naive Bayes on the bag-of-words counts.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can chain.
MNB_clf = MultinomialNB().fit(X_train, y_train)

# Predict the held-out rows, then report the share of correctly labelled ones.
y_pred = MNB_clf.predict(X_test)
print(
    "ACCURACY OF THE MODEL: ",
    metrics.accuracy_score(y_test, y_pred),
)

ACCURACY OF THE MODEL:  0.6668568168853394


In [14]:
# Confusion matrix for the Naive Bayes predictions from the cell above.
# Rows are true classes and columns are predicted classes, in label order
# 0=negative, 1=neutral, 2=positive (see `datamapping`).
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred)
cm

array([[134,  83,  66],
       [138, 697, 105],
       [ 55, 137, 338]], dtype=int64)

In [15]:
# Training model using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples and random feature subsets),
# so without a seed every run reports a different accuracy and confusion
# matrix. Pin random_state, using the same seed as train_test_split, to make
# the reported numbers reproducible.
RFC_clf = RandomForestClassifier(n_estimators=100, random_state=0)
RFC_clf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred = RFC_clf.predict(X_test)

# using metrics module for accuracy calculation
# (`metrics` is imported in the Naive Bayes cell earlier in the notebook)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.6457501426126641


In [16]:
# Confusion matrix for the Random Forest predictions from the cell above
# (y_pred was overwritten there). Rows are true classes and columns are
# predicted classes, in label order 0=negative, 1=neutral, 2=positive.
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred)
cm

array([[ 46, 182,  55],
       [129, 746,  65],
       [ 27, 163, 340]], dtype=int64)

## Lemmatizer

In [17]:
# WordNet lemmatizer used in place of the Porter stemmer for this second run.
ltz = WordNetLemmatizer()
# Reset the corpus so the lemmatized sentences do not mix with the stemmed ones.
corpus = []

In [18]:
# Build the lemmatized corpus: keep letters only, lower-case, drop English stop
# words and map every remaining word to its WordNet lemma.
# NOTE: lemmatize() is called without a part-of-speech tag, so each word is
# looked up as a noun (the library default).

# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension rebuilds the list for
# every single word and makes each membership test a linear scan.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list here so re-running this cell never appends a second
# copy of every sentence to an already-filled corpus.
corpus = []

# Iterate over the values directly instead of data['Sentence'][i]; the latter is
# a label lookup and only works while the frame has a default 0..n-1 index.
for sentence in data['Sentence']:
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    lemmas = [ltz.lemmatize(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(lemmas))

In [19]:
# Creating the Bag of Words model
# Re-fit a fresh CountVectorizer on the lemmatized corpus; max_features=2500
# limits the vocabulary to the 2500 most frequent terms.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split, so test sentences influence which terms are kept.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
# .toarray() materialises the sparse count matrix as a dense NumPy array.
X = cv.fit_transform(corpus).toarray()

In [20]:
# Encode the sentiment labels as integers for the classifiers.
datamapping = {"negative": 0, "neutral": 1, "positive": 2}
y = data['Sentiment'].map(datamapping)

# Series.map yields NaN for any label that is not a key of the mapping
# (different casing, stray whitespace, missing value). Fail here with the
# offending labels instead of letting NaN targets reach the model.
if y.isna().any():
    unknown_labels = sorted(set(data.loc[y.isna(), 'Sentiment'].astype(str)))
    raise ValueError(f"Sentiment labels missing from datamapping: {unknown_labels}")

In [21]:
# Hold out 30% of the rows for evaluation, with the same test size and seed as
# the stemmer run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [22]:
# Multinomial Naive Bayes on the lemmatized bag-of-words counts.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can chain.
MNB_clf = MultinomialNB().fit(X_train, y_train)

# Predict the held-out rows, then report the share of correctly labelled ones.
y_pred = MNB_clf.predict(X_test)
print(
    "ACCURACY OF THE MODEL: ",
    metrics.accuracy_score(y_test, y_pred),
)

ACCURACY OF THE MODEL:  0.6651454649172847


In [23]:
# Confusion matrix for the Naive Bayes predictions on the lemmatized text.
# Rows are true classes and columns are predicted classes, in label order
# 0=negative, 1=neutral, 2=positive (see `datamapping`).
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred)
cm

array([[125,  91,  67],
       [140, 700, 100],
       [ 54, 135, 341]], dtype=int64)

In [24]:
# Training model using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples and random feature subsets),
# so without a seed every run reports a different accuracy and confusion
# matrix. Pin random_state, using the same seed as train_test_split, to make
# the reported numbers reproducible.
RFC_clf = RandomForestClassifier(n_estimators=100, random_state=0)
RFC_clf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred = RFC_clf.predict(X_test)

# using metrics module for accuracy calculation
# (`metrics` is imported in the Naive Bayes cell earlier in the notebook)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.6383342840844267


In [25]:
# Confusion matrix for the Random Forest predictions on the lemmatized text
# (y_pred was overwritten in the cell above). Rows are true classes and columns
# are predicted classes, in label order 0=negative, 1=neutral, 2=positive.
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred)
cm

array([[ 44, 177,  62],
       [128, 740,  72],
       [ 24, 171, 335]], dtype=int64)